diff --git a/container/Dockerfile b/container/Dockerfile
new file mode 100644
index 0000000..e93c772
--- /dev/null
+++ b/container/Dockerfile
@@ -0,0 +1,84 @@
+# syntax=docker/dockerfile:1
+# Stage 0 spike draft — the OUTER "builder" container image.
+#
+# Validates: unprivileged docker container running rootless podman inside.
+# agent-server is deliberately NOT installed here (Stage 2 adds it); this image
+# exists purely to prove the nesting works on the target host.
+#
+# See docs/plans/stage0-spike-brief.md for the validation tasks and
+# container/SPIKE-FINDINGS.md for the recorded results.
+FROM ubuntu:24.04
+# Build-time only: ARG keeps DEBIAN_FRONTEND out of the runtime environment.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# podman + rootless plumbing:
+#   uidmap         → newuidmap/newgidmap helpers (subordinate id mapping; shipped setuid, switched to file caps below)
+#   slirp4netns    → rootless network namespace + port forwarding
+#   fuse-overlayfs → fallback storage driver only; native overlay is pinned in storage.conf below (T4)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        podman uidmap slirp4netns fuse-overlayfs \
+        ca-certificates curl git iproute2 libcap2-bin \
+    && rm -rf /var/lib/apt/lists/*
+
+# CRITICAL FIX (see SPIKE-FINDINGS.md "newuidmap"): Ubuntu ships newuidmap /
+# newgidmap as setuid-root. Inside an unprivileged docker container that makes
+# euid=0 when they run, which fails the kernel's uid_map ownership shortcut and
+# forces a CAP_SYS_ADMIN-in-init-userns check that docker's bounding set denies
+# -> 'newuidmap: write to uid_map failed: Operation not permitted'.
+# Fedora/quay.io/podman-stable instead ship them with file capabilities, so
+# euid stays 1000 (== owner of the nested userns) and the ownership shortcut
+# applies. Replicate that here:
+# WARNING: this file-capabilities approach is INCOMPATIBLE with running the
+# outer container under `--security-opt no-new-privileges`. Under no_new_privs,
+# execve() cannot add file capabilities to the permitted set, so newuidmap would
+# silently lose CAP_SETUID and rootless podman fails at namespace setup with the
+# same 'Operation not permitted' shown above. Do NOT add no-new-privileges as a
+# "hardening" flag without first switching these helpers back to another mapping
+# mechanism. (https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html)
+RUN chmod u-s /usr/bin/newuidmap /usr/bin/newgidmap \
+    && setcap cap_setuid+ep /usr/bin/newuidmap \
+    && setcap cap_setgid+ep /usr/bin/newgidmap
+
+# ubuntu:24.04 ships a default 'ubuntu' user at uid 1000; replace it with
+# 'builder' so the unprivileged uid is ours and predictable.
+RUN userdel -r ubuntu 2>/dev/null || true \
+    && useradd -m -s /bin/bash -u 1000 builder
+
+# Subordinate uid/gid ranges — rootless podman maps inner-container users into
+# these. Without them every nested `podman run` fails at user-namespace setup.
+RUN echo 'builder:100000:65536' > /etc/subuid \
+    && echo 'builder:100000:65536' > /etc/subgid
+
+# Podman config for a systemd-less nested environment:
+#   cgroup_manager=cgroupfs → no systemd inside this container
+#   events_logger=file      → no journald inside this container
+#   storage: NATIVE rootless overlay (T4 winner: ~2.2x faster than
+#   fuse-overlayfs on this kernel 7.0 host, and needs no /dev/fuse device).
+#   fuse-overlayfs is left installed as a documented fallback only.
+RUN mkdir -p /home/builder/.config/containers \
+    && printf '[containers]\ndefault_sysctls = []\n\n[engine]\ncgroup_manager = "cgroupfs"\nevents_logger = "file"\n' \
+        > /home/builder/.config/containers/containers.conf \
+    && printf '[storage]\ndriver = "overlay"\n' \
+        > /home/builder/.config/containers/storage.conf \
+    && chown -R builder:builder /home/builder/.config
+
+# Pre-create the volume mountpoints OWNED BY builder. Docker initialises named
+# volumes by copying ownership from the image path; without this both volumes
+# mount root-owned and rootless podman dies with:
+#   mkdir /home/builder/.local/share/containers/storage: permission denied
+RUN mkdir -p /workspace /home/builder/.local/share/containers \
+    && chown -R builder:builder /workspace /home/builder/.local
+
+# --chmod sets the mode in the COPY layer itself (no separate chmod layer).
+COPY --chmod=0755 entrypoint.sh /usr/local/bin/entrypoint.sh
+
+USER builder
+WORKDIR /workspace
+
+# No systemd-logind to provision /run/user/1000; entrypoint creates this.
+ENV XDG_RUNTIME_DIR=/tmp/runtime-builder
+
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
+# Spike keeps the container alive for `docker exec` iteration; Stage 2 replaces
+# this with the agent-server process.
+CMD ["sleep", "infinity"]
diff --git a/container/INNER-APP-SPIKE.md b/container/INNER-APP-SPIKE.md
new file mode 100644
index 0000000..96f5f8b
--- /dev/null
+++ b/container/INNER-APP-SPIKE.md
@@ -0,0 +1,217 @@
+# Inner-App Spike — Realistic Builds Under Nested Rootless Podman
+
+**Date:** 2026-06-12
+**Runs on:** the same Stage 0 spikebox (the `builder-outer` container is already
+built and `./container/smoke.sh` passes). This is a follow-on, not a redo.
+**Parent plan:** `docs/plans/builder-containers-plan.md` (D5, D6, Stage 2)
+**Findings:** record everything in the "Findings" section at the bottom of this
+file as you go.
+
+## Why
+
+Stage 0 only proved trivial images (nginx, a one-line Alpine build) run nested.
+Before we commit to the template + two-container (DEV/PROD) deploy model, prove
+that a **realistic multi-stage app build** works inside the outer container under
+rootless podman + native overlay — including the things the real product depends
+on: two published ports, a redeploy cycle, optional hot-reload, and
+stack-agnosticism.
+ +## Hard rules + +- **Do not change the outer container's flags or image hardening.** Use the + `builder-outer` container exactly as Stage 0 left it (no `--privileged`, no new + caps). If something only works by weakening the outer container, that's a + finding, not a fix — stop and record it. +- Everything happens **inside** the outer container via `docker exec` / `podman`, + in `/workspace`. Apps publish in the already-forwarded **10000–10009** range. +- You write the sample apps yourself (small, known) — this is deterministic infra + validation, **not** an LLM app-building exercise. + +## Tasks + +### T1 — Multi-stage JS app, DEV + PROD, two ports +- [x] In `/workspace/vite-sample`, scaffold a **minimal Vite app** (plain `npm create vite` SPA, no extras) with a multi-stage `Dockerfile`: + - `dev` target: installs deps, runs the Vite dev server bound to `0.0.0.0` + - `build` target: `vite build` + - `prod` target: a lean final stage serving the built `dist/` (e.g. `nginx:alpine` or a tiny static server) — **only the build output**, no `node_modules`/source +- [x] `podman build --target dev -t vite-sample:dev .` and `--target prod -t vite-sample:prod .` both succeed inside the outer container +- [x] Run both at once: dev on `:10000`, prod on `:10001` +- [x] From the **host**: `curl 127.0.0.1:10000` and `curl 127.0.0.1:10001` both return the app +- [x] Record: build times (cold), prod image size vs dev image size, any overlay/permission errors + +### T2 — Redeploy cycle + layer cache +- [x] Edit a source file, then redeploy DEV: `podman build --target dev ... 
&& podman rm -f <name> && podman run ...` +- [x] Confirm the dependency-install layer is **cached** (rebuild only re-runs from the source-copy layer); record the warm rebuild time vs cold +- [x] Confirm the change is visible via host curl + +### T3 — DEV hot-reload via bind mount (the D6 open question) +- [x] Run the DEV container with the project dir bind-mounted: `podman run -d -v /workspace/vite-sample:/app -p 10000:<container-port> vite-sample:dev` (adjust workdir so the mount lands where the dev server watches) +- [x] Edit a source file on the host side of the mount; confirm whether the Vite dev server inside the container **hot-reloads without a rebuild** (check via curl / the HMR endpoint) +- [x] Record: does a workspace bind mount work at all under nested rootless podman? Any uid/permission issues on the mounted files? Does file-watching/HMR fire across the mount? **This decides whether D6's hot-reload option is viable or we fall back to rebuild-redeploy.** + +### T4 — Stack-agnosticism smoke (non-JS) +- [x] In `/workspace/py-sample`, a trivial **Python** app (e.g. Flask/`http.server`) with its own single-stage `Dockerfile`, published on `:10002` +- [x] Build + run + host curl succeed +- [x] Purpose: confirm the *mechanics* (build any Dockerfile, publish a port) are framework-neutral — so the deploy skill/metadata don't need JS assumptions + +### T5 — Resource sanity on a small box +- [x] After T1–T4, record `podman images` total size, `df -h` on the podman storage volume, and peak memory during the heaviest build (`free -m` while building) +- [x] Note whether anything thrashed/OOM'd; this informs the outer container's `--memory`/`--cpus` limits (Stage 4) and the box sizing recommendation + +## Acceptance + +- T1, T2, T4 green (multi-stage dev/prod build + run + two/three ports + redeploy, JS and Python). +- T3 answered definitively (hot-reload works, or doesn't, with the reason). +- Findings section filled in. No outer-container weakening introduced.
+ +Timebox: a couple of hours. If a multi-stage build fundamentally fails under +nesting (e.g. native overlay chokes on many layers / large `node_modules`), stop +and capture the exact error + `podman info` storage section — that's a Stage 2 +blocker we need to know about now, not later. + +--- + +## Findings + +**Status:** COMPLETE — T1, T2, T4 green; T3 answered definitively (hot-reload +**viable**); T5 recorded. **No outer-container weakening introduced** — +`docker inspect` confirms `Privileged=false`, `CapAdd=[]`; the `builder-outer` +image and `run-outer.sh` flags are byte-for-byte the Stage 0 set. Re-ran +`./container/smoke.sh` first: 11/11 PASS. + +Method note: the outer container has **no node/npm** (correct — app builds happen +*inside* podman using base images). Sample apps were hand-written (deterministic, +per the brief) and `docker cp`'d into the `/workspace` volume as uid 1000 +(`builder`). All `curl`s below are from the **host** through both forwarding +layers (host → docker publish `127.0.0.1:1000x` → outer → rootless podman → +inner app). + +### Host / container +- Outer image base `ubuntu:24.04`; inner podman **4.9.3**; native rootless + **overlay** (`Store.GraphDriverName=overlay`, no `mount_program`). Unchanged + from Stage 0. Outer main process uid 1000, `Privileged=false`, no added caps. +- Box: 4 vCPU / 7.6 GiB RAM / 75 GiB disk (same Hetzner VM as Stage 0). +- `docker.io/library/node:20-alpine` pulled in ~3 s; native overlay healthy. + +### T1 — multi-stage dev/prod + two ports +- **dev build OK, prod build OK.** Cold times inside the outer container: + **dev ~6.8 s** (npm install dominates; base image pre-pulled), **prod ~2.2 s** + (its `build` stage reuses the `deps`/`npm install` layer already built for dev, + then `vite build` + nginx copy). A fully cold `--no-cache` dev build is ~6.5 s. +- **prod 63.7 MB vs dev 239 MB** (~3.7× smaller). Prod = `nginx:alpine` + the + built `dist/` only; dev = `node:20-alpine` + `node_modules`. 
+- **Both reachable from host:** `:10000` (dev) served the Vite HTML *with* + `/@vite/client` injected (dev server live) and served transformed source at + `/src/main.js`; `:10001` (prod) served the hashed/bundled assets, and the + bundle contained `BUILD_MARKER_V1` (proves it's the real build output). +- **prod is non-root:** container runs as `uid=101(nginx)`; image has **no + `/app`, no `node_modules`, no source** — only `index.html` + `assets/`. + Achieved with a custom `nginx.conf` (pid + temp paths under `/tmp`, `listen + 8080`) + `USER nginx`. +- **No overlay/permission errors.** Multi-stage build over native overlay was + clean; no layer-count or `node_modules`-size problems at this (minimal-SPA) scale. + +### T2 — redeploy + cache +- **Dep layer cached.** After editing `src/main.js`, the warm `--target dev` + rebuild printed `Using cache` for `COPY package.json` **and** `RUN npm install`; + only `COPY . .` (source) re-ran. **Warm rebuild ~0.67 s vs ~6.8 s cold** (~10×). +- **Change visible:** `rm -f && run` redeploy on `:10000`, host curl of + `/src/main.js` returned the edited `BUILD_MARKER_V2`. + +### T3 — bind-mount hot-reload (D6 decision) +- **Bind mount works** under nested rootless podman: + `podman run -v /workspace/vite-sample:/app -v /app/node_modules ...`. The Vite + dev server started normally (`ready in 183 ms`). +- **uid/permissions:** files owned by uid 1000 (`builder`) on the outer appear as + `uid=0(root)` *inside* the inner container — the expected rootless-podman + container-root ↔ outer-user mapping. **No permission issues**; files read/served + fine. **Gotcha (not a blocker):** a bare `-v <project-dir>:/app` makes the host dir + **shadow the image's `/app/node_modules`** and the dev server can't find its + deps. Fix used: add an **anonymous volume `-v /app/node_modules`** to keep the + image's installed deps under the mount. The deploy skill MUST do this for the + hot-reload path.
+- **HMR fires on host-side edit without rebuild:** edited `src/main.js` + V2→V3 on the workspace side; with **no rebuild and no restart** the dev server + served `BUILD_MARKER_V3` and logged `[vite] page reload src/main.js`. inotify + propagates across the bind mount on native overlay — **no `usePolling` needed**. +- **Verdict: hot-reload is VIABLE.** D6's optional bind-mount hot-reload works + under nested rootless podman. Recommend the DEV container use it (bind mount + + anon `node_modules` volume); rebuild-redeploy remains the fallback and is what + PROD/promote uses anyway. + +### T4 — non-JS stack +- **Python/Flask: build + run + host curl all OK** on `:10002`. Single-stage + `python:3.12-alpine` Dockerfile; **`pip install` works under nesting** (cold + build ~6.2 s). Host curl returned `PY_MARKER_V1 flask-sample ok`. +- **Nothing stack-specific leaked.** Identical mechanics as JS: write a + Dockerfile, `podman build -t`, `podman run -p hostPort:containerPort`. The + only contract is "a Dockerfile that publishes on the given port" — no JS + assumptions. Confirms D5's stack-agnosticism premise. + +### T5 — resources +- **Images:** nominal sizes — node:20-alpine 138 MB, nginx:alpine 63.7 MB, + python:3.12-alpine 57.6 MB, vite dev 239 MB, vite prod 63.7 MB, py 71.2 MB. + **Actual on-disk podman storage volume: ~404 MB** (overlay layer dedup; the + alpine/node/nginx bases are shared across images). +- **Peak build memory: ~1469 MB used** during a `--no-cache` dev build + (baseline ~1128 MB → build adds **~340 MB**). **No thrash, no OOM** (`dmesg` + clean); the 7.6 GiB box was never near pressure. Builds are largely + single-core and short. +- **Implication for outer `--memory`/`--cpus` + box sizing:** build peaks are + modest (~350 MB delta) and brief; for the outer container, `--cpus 2` is + enough for snappy builds and a generous `--memory` (e.g. 2–4 GiB) leaves wide + headroom. 
At scale the **steady-state footprint of many idle inner containers** + (50 projects × 2) will dominate, not build spikes — size the box for resident + containers + image storage, not for build bursts. A 2 vCPU / 4 GiB box handles + the build/deploy loop comfortably; storage grows ~tens of MB per project after + base-image dedup. + +### Recommendations for the template (D5) and Stage 2 +- **Lean prod stage that worked (use as the template's shape):** + `build` stage runs `vite build`; `prod` = `nginx:alpine` + custom `nginx.conf` + (pid & temp paths under `/tmp`, `listen 8080`) + `COPY --from=build dist/` + + `USER nginx`. Result: 63.7 MB, non-root, no source/deps shipped. Avoids + orchestrator's "ship the whole build tree as root" anti-pattern (D5). +- **`deps` layer as a cache anchor:** `COPY package.json* && RUN npm install` + *before* `COPY . .`, with `dev`/`build` both `FROM deps`. Gives sub-second warm + redeploys (T2) and lets prod's build reuse dev's install. +- **Deploy skill MUST do / avoid:** + - **Hot-reload DEV:** `-v <project-dir>:/app` **plus** `-v /app/node_modules` + (anon volume) — without the second, the mount shadows deps and the dev server + breaks. (JS-specific; the skill should apply it only when a deps dir would be + shadowed.) + - **Don't assume container port 80.** The non-root prod stage listens on **8080** + and dev on **5173**; the skill maps `-p <hostPort>:<containerPort>`, so the + container port is a template detail, not the reserved appx port. Keep them distinct. + - **Use fully-qualified image refs** (`docker.io/library/...`) in template + Dockerfiles — podman 4.9.3 has no implicit Docker Hub default; short names can + prompt/fail non-interactively. All builds here used FQ refs and were clean. + - Bind 0.0.0.0 in the dev server (`vite.config.js server.host`); prod nginx is fine. +- **Open questions / blockers:** none blocking Stage 2.
+ - **Vite `allowedHosts` (verify + likely fix at Stage 1/3):** we only curled + via `127.0.0.1`, which Vite always allows, so the spike never exercised this. + Vite 5.4.x ships a `Host`-header allow-list (anti DNS-rebinding). Served + through appx's public **`*-dev.<domain>`** subdomain, the dev server sees a + domain that isn't on its list and answers *"Blocked request. This host is not + allowed."* — the user gets an error instead of their app. **PROD is + unaffected** (plain nginx, no host check). Fix: set + `server.allowedHosts` in the template's `vite.config.js`. Since the template + is baked once but the domain is per-project, make it env-driven and let the + deploy skill pass the value from `.pi/deployment.json`: + ```js + const devHost = process.env.VITE_DEV_ALLOWED_HOST; + server: { host: "0.0.0.0", port: 5173, + allowedHosts: devHost ? [devHost] : [] } + ``` + then `podman run -e VITE_DEV_ALLOWED_HOST=eventx-dev.example.com ...`. + Alternatives: a leading-dot wildcard `[".example.com"]` (all dev subdomains + under one owned domain) or `true` (disable the check) — looser. Decide when + the template + skill are written; one-line change, not an infra blocker. + - **HMR websocket through the proxy (sibling of the above):** HMR uses a + `ws://`/`wss://` upgrade on the same published dev port and the same domain; + it worked on loopback here. appx's subdomain proxy must forward the WS + upgrade or the app loads but hot-reload silently dies — + track in the appx plan. + - The `container-smoke.sh` (Stage 2) should build **this Vite template's** + `dev`+`prod` targets and the bind-mount hot-reload run, not just nginx, to + keep these guarantees from regressing. diff --git a/container/SPIKE-FINDINGS.md b/container/SPIKE-FINDINGS.md new file mode 100644 index 0000000..146cf24 --- /dev/null +++ b/container/SPIKE-FINDINGS.md @@ -0,0 +1,259 @@ +# Stage 0 Spike Findings + +**Status:** COMPLETE — `./container/smoke.sh` exits 0 (11/11) under all hard constraints.
+**Brief:** `docs/plans/stage0-spike-brief.md` + +## Host + +- Provider / instance type: Hetzner KVM VM ("appx"), 4 vCPU, 7.6 GiB RAM, 75 GiB disk, 4 GiB swap +- Distro + kernel (`lsb_release -ds`, `uname -rm`): **Ubuntu 26.04 LTS** (brief assumed 24.04 — see note), kernel `7.0.0-15-generic x86_64` +- Arch: x86_64 +- Docker version (`docker --version`): Docker version 29.5.3, build d1c06ef (security options: apparmor, seccomp profile=builtin, cgroupns) +- Outer image base: `ubuntu:24.04` (matches the brief's production target even though the host is 26.04) +- Podman version inside outer (`podman --version`): **4.9.3** (Ubuntu 24.04 repo) + +**Note on distro:** the box is Ubuntu 26.04, not the 24.04 the brief targets. The +relevant hardening is the same or stricter: `kernel.apparmor_restrict_unprivileged_userns = 1` +(the 24.04 default that blocks nested userns) is active here too, and AppArmor is enabled +(`/sys/module/apparmor/parameters/enabled = Y`). The OUTER IMAGE is `ubuntu:24.04`, so the +in-image findings (podman 4.9.3, packaging, configs) are exactly the production target. The +operator should still re-verify the host-side flags on a real 24.04 host before production. + +## Result summary + +**Yes — the unprivileged nested chain works on this host**, and `./container/smoke.sh` exits 0 +(11/11) with no `--privileged`, no added capabilities (no `SYS_ADMIN`), and the outer main +process running as non-root uid 1000 (`builder`). The full path is proven: host → docker +publish (`127.0.0.1:10000`) → outer container → rootless podman + slirp4netns → inner nginx, +plus a working `podman build`, persistence across `docker restart`, and clean recovery via +`podman start --all`. The single decisive fix was repackaging `newuidmap`/`newgidmap` with +file capabilities (see headline finding); after that only four `docker run` knobs are needed, +and `seccomp=unconfined` was further replaced by a strictly-tighter tailored profile. 
+Remarkably, **no host-level sysctl/apparmor change was required** — the hardened Ubuntu +defaults (`apparmor_restrict_unprivileged_userns=1`) are left untouched. + +## Headline finding: setuid-root `newuidmap` breaks rootless podman in an unprivileged container + +The single biggest blocker. Symptom on first run: + +``` +running `/usr/bin/newuidmap 0 1000 1 1 100000 65536`: newuidmap: write to uid_map failed: Operation not permitted +Error: cannot set up namespace using "/usr/bin/newuidmap": exit status 1 +``` + +**Root cause (traced with bpftrace, not guessed):** Ubuntu ships `newuidmap`/`newgidmap` +as **setuid-root** (`-rwsr-xr-x`). Inside an unprivileged docker container they therefore +run with `euid=0`. The kernel's `/proc/<pid>/uid_map` write path (`new_idmap_permitted`) +has a shortcut: if the writer's euid equals the uid that *created* the target user +namespace (here uid 1000 = `builder`), a single-extent self-map is allowed without any +capability. With `euid=0` that shortcut does **not** apply, so the kernel instead requires +`CAP_SYS_ADMIN` **in the initial user namespace**. docker's default capability bounding set +(`0x00000000a80425fb`) excludes `CAP_SYS_ADMIN`, so the check fails. bpftrace on +`cap_capable` confirmed the final failing check is `cap=21` (CAP_SYS_ADMIN), returning -1. + +This is **not** AppArmor and **not** seccomp — it fails identically with every AppArmor / +seccomp sysctl set to 0. It is purely the setuid-vs-filecap packaging difference.
+ +**Fix (matches Fedora / `quay.io/podman/stable` / Dan Walsh's "Podman inside a container" +blog):** ship the helpers with **file capabilities** instead of setuid-root, so euid stays +1000 and the ownership shortcut applies: + +```dockerfile +RUN chmod u-s /usr/bin/newuidmap /usr/bin/newgidmap \ + && setcap cap_setuid+ep /usr/bin/newuidmap \ + && setcap cap_setgid+ep /usr/bin/newgidmap +``` + +Verified: after this change `newuidmap 0 1000 1 1 100000 65536` returns OK with **no** +added capabilities and `apparmor_restrict_unprivileged_userns=1` left at its hardened default. +This is why `quay.io/podman/stable` "just works" as a nested image — it already does this. + +## Final `docker run` flag set + +From `container/run-outer.sh` (deletion-tested in T2 — each flag removed individually and the +exact resulting error recorded): + +``` +docker run -d --name builder-outer \ + --device /dev/net/tun \ + --security-opt seccomp=$(pwd)/seccomp-builder.json \ + --security-opt apparmor=unconfined \ + --security-opt systempaths=unconfined \ + -v builder-workspace:/workspace \ + -v builder-podman-storage:/home/builder/.local/share/containers \ + -p 127.0.0.1:10000-10009:10000-10009 \ + builder-outer +``` + +| Flag | Needed? | Exact error when removed | +| --- | --- | --- | +| `--device /dev/net/tun` | **Yes** | `FAIL@run: /usr/bin/slirp4netns failed: "open(\"/dev/net/tun\"): No such file or directory"` — rootless slirp4netns networking is dead without it | +| `--security-opt seccomp=seccomp-builder.json` | **Yes** | With docker's DEFAULT profile: `FAIL@info: Error: cannot re-exec process` (default profile blocks `mount(2)` and friends). Tailored profile is strictly tighter than `unconfined` — see T2 below | +| `--security-opt apparmor=unconfined` | **Yes** | `FAIL@info: mount /home/builder/.local/share/containers/storage/overlay...: permission denied` — docker-default AppArmor profile (`docker-default`) blocks the rootless overlay `mount(2)`. 
**NB this is NOT the host `apparmor_restrict_unprivileged_userns` problem** — that one is solved entirely by the file-cap `newuidmap` fix. TODO: replace with a tailored AppArmor profile (deferred; containment loss is bounded — seccomp + userns + caps still apply) | +| `--security-opt systempaths=unconfined` | **Yes** | `FAIL@run: crun: mount \`proc\` to \`proc\`: Operation not permitted` — docker masks `/proc` submounts (`/proc/sys`, `/proc/kcore`, ...); the kernel `mount_too_revealing()` check then refuses the inner container's fresh `proc` mount. `systempaths=unconfined` clears docker's `MaskedPaths`/`ReadonlyPaths`. **Adds no capabilities and no privilege**; the inner containers still get their own `/proc` masks from crun | +| `--device /dev/fuse` | **No (removed)** | Was in the draft for fuse-overlayfs. Native overlay (T4) needs no FUSE device, so this flag was deleted | +| `-v builder-workspace` | **Yes** | persistence: project files must survive container recreate (T3 verified) | +| `-v builder-podman-storage` | **Yes** | persistence: inner images/containers metadata must survive recreate (T3 verified) | +| `-p 127.0.0.1:10000-10009` | **Yes** | the host→inner port chain; loopback-only so appx proxies in. Without it the host curl check cannot reach the inner nginx | + +No `--cap-add` of any kind is used. `docker inspect` confirms `Privileged=false`. + +## T2 — tailored seccomp profile (replaces `seccomp=unconfined`) + +The brief asks to prefer Podman's `seccomp.json` over `unconfined` if it works. Result: + +- Podman's **stock** `seccomp.json` (from `containers-common`, present in the image at + `/usr/share/containers/seccomp.json`) gets further than docker's default (it allows + `mount`, so `podman info` succeeds) but the inner `podman run` dies at + `crun: sethostname: Operation not permitted`. 
Reason: the stock profile *allow-lists* + `sethostname` (and `setdomainname`, `setns`, plus `bpf`, `perf_event_open`, `quotactl`, + `fanotify_init`, `lookup_dcookie`) only `includes.caps = [CAP_SYS_ADMIN]`. Our unprivileged + outer has no `CAP_SYS_ADMIN`, so the runtime drops those allow-rules and the syscalls fall + through to `ERRNO`. +- **Fix adopted:** `container/seccomp-builder.json` = stock profile with the `CAP_SYS_ADMIN` + gate removed from **only** `sethostname`, `setdomainname`, `setns` (the namespace-setup + syscalls the nested runtime needs). The genuinely dangerous gated syscalls + (`bpf`, `perf_event_open`, `quotactl`, `fanotify_init`, `lookup_dcookie`) stay **denied**. + This is **strictly tighter than `unconfined`**. `container/gen-seccomp.sh` regenerates it + from the image's stock profile and documents the provenance. Smoke stays 11/11 with it. + +## T2 sub-question — outer runtime: docker vs podman (informs `system-setup.sh`) + +Host change: installed `podman` 5.7.0 on the host to test it as the *outer* runtime. + +- **Rootless podman as outer: DOES NOT WORK.** Fails at `newuidmap` before anything else: + rootless podman runs the outer container inside *spike's* user namespace, whose `uid_map` + is `0 1000 1 / 1 100000 65536` — i.e. only 65536 subuids exist *inside* the outer userns. + The nested `builder` then asks to map its own `builder:100000:65536` range, which does not + fit → `newuidmap: write to uid_map failed: Operation not permitted`. This is the classic + rootless-in-rootless subuid-exhaustion problem; it would need a vastly larger host subuid + allocation **and** nested-range planning. Not viable as-is. (The seccomp advantage is moot + because the chain breaks earlier.) +- **Rootful podman as outer (`sudo podman run`): WORKS, with a SMALLER security-flag set.** + Real-root model (like docker) so `newuidmap` is fine. 
`podman info` + inner `podman run` + creation succeed with **only** `--device /dev/net/tun --security-opt apparmor=unconfined`: + - **No `seccomp=` override needed** — podman's *default* seccomp profile allows `mount(2)` + (confirms the brief's premise). This is podman-outer's real advantage. + - **No `systempaths=unconfined` needed** — podman does not mask `/proc` the way docker + does, so the inner `proc` mount is not blocked. + - Still needs `apparmor=unconfined` (podman's default container AppArmor profile also + blocks the overlay `mount`) and `--device /dev/net/tun`. + - **Caveat (new delta):** podman's default network gave the outer container **no working + DNS** (even the outer could not resolve `registry-1.docker.io`), so image *pulls* fail + until DNS is configured (`--dns`, or host `aardvark-dns`/`netavark` setup). Docker's + default bridge ships an embedded resolver (`127.0.0.11`), so docker-outer has DNS for + free. +- **Recommendation:** docker-outer is the proven, complete, lowest-friction path and is what + `run-outer.sh` uses. Rootful-podman-outer is a viable alternative that trades two security + flags (`seccomp`, `systempaths`) for (a) running the supervisor as root and (b) a DNS-config + requirement. If `system-setup.sh` later prefers podman-on-host for a smaller flag surface, + it must run podman **rootful** and configure container DNS. Rootless-podman-outer is a dead + end without large nested-subuid provisioning. + +## Host prerequisites + +**None required for the docker-outer path.** This is the headline operational result: + +- `kernel.apparmor_restrict_unprivileged_userns` was left at its hardened default **`1`**. + (It was toggled to 0 *during diagnosis only* and restored; the final green smoke runs with + it `=1`.) The file-cap `newuidmap` fix is what makes nested userns work, not a host sysctl. +- No host AppArmor profile added. +- No host sysctl changes persisted. +- Only host package needed: **docker** (already required). 
The image installs its own + `podman`, `uidmap`, `slirp4netns`, `fuse-overlayfs`, `libcap2-bin`. +- `podman` 5.7.0 was installed on the host **only to answer the T2 outer-runtime + sub-question**; it is NOT needed for the docker-outer path and can be removed. + +So `system-setup.sh` needs nothing beyond a docker install for the docker-outer design. + +### Host changes log (everything touched on the box) + +| Change | Persisted? | Purpose | Needed for the solution? | +| --- | --- | --- | --- | +| `sysctl kernel.apparmor_restrict_unprivileged_userns` toggled 1↔0 | **No** — restored to `1` | Diagnosis only (proved the blocker was NOT this sysctl) | No | +| `sysctl kernel.apparmor_restrict_unprivileged_unconfined` toggled 1↔0 | **No** — restored to `1` | Diagnosis only | No | +| `sysctl kernel.unprivileged_userns_apparmor_policy` toggled 1↔0 | **No** — restored to `1` | Diagnosis only | No | +| `apt-get install podman` (5.7.0) | Yes (removable) | Answer the T2 outer-runtime sub-question | No (docker-outer path) | +| `apt-get install bpftrace` | Yes (removable) | Trace the `newuidmap` EPERM to `cap_capable cap=21` | No | +| `apt-get install strace gcc libc6-dev` **inside the outer container** | container-only | Diagnosis of the setuid/cap behaviour | No | + +Final host sysctl state verified: all three `= 1` (hardened defaults). The green smoke run +uses **zero** persisted host changes beyond the pre-existing docker install. + +## Storage driver (T4) + +- **fuse-overlayfs:** works. Needs `--device /dev/fuse` and the `fuse-overlayfs` binary + + `mount_program` in `storage.conf`. Build benchmark (300-file image): **~1281 ms**. +- **native rootless overlayfs (no `mount_program`):** works on this kernel 7.0 host (kernel + ≥ 5.13 supports rootless native overlay). Needs **no** `/dev/fuse` device. Build benchmark: + **~582 ms** — **~2.2× faster** than fuse-overlayfs. 
+- **vfs (last-resort fallback):** not needed and not pinned — native overlay works, so the + slow full-copy VFS driver was not required. +- **Pinned choice:** **native rootless overlay** (`storage.conf` = `[storage] driver = + "overlay"` with no `mount_program`). Faster *and* lets us drop `--device /dev/fuse`. + `fuse-overlayfs` is left installed as a documented fallback only. + +## Warmup timing (T3) + +- Cold first `podman info` (fresh storage volume): **~0.25 s** (`time` logged by entrypoint). +- Warmed (entrypoint already ran / after restart): **~0.16–0.23 s**. +- Negligible either way with native overlay; no warmup optimisation needed for Stage 2. + +## Restart behaviour (T3) + +- **Workspace volume:** survives `docker restart` (marker file intact). ✓ +- **Podman image store:** survives `docker restart` (built image still present). ✓ +- **Running inner containers after `docker restart`:** stop — they come back in state + `created` (not `running`). Expected: a `docker restart` kills all inner processes. +- **`podman start --all` viable as the Stage 4 recovery mechanism? YES**, but only after the + entrypoint wipes the **stale transient runtime state** on each boot. `XDG_RUNTIME_DIR` + (`/tmp/runtime-builder`) lives in the container FS and *survives* `docker restart`, but the + rootless-podman **pause process** and **crun** state it references do not. Left stale, + podman fails with `invalid internal status, try resetting the pause process with "podman + system migrate"` and `podman start` fails with `crun: container already exists`. The + entrypoint now `rm -rf`s `$XDG_RUNTIME_DIR/{libpod,containers,netns,crun}` on every boot; + after that `podman start --all` cleanly resurrects the inner container **with its port + forwarding** (host curl succeeds again). The smoke test exercises exactly this path. + +## Port chain notes + +- Full chain works: host `127.0.0.1:10000` → docker publish → outer netns → rootless + **slirp4netns** → inner nginx `:80`. 
No latency surprises on loopback. +- **slirp4netns requires `/dev/net/tun`** in the outer container (`--device /dev/net/tun`); + this is the only device the docker-outer path needs. +- IPv6: harmless warnings only (`failed to set net.ipv6.conf.default.accept_dad ...`); IPv4 + forwarding unaffected. +- The read-only `net.ipv4.ping_group_range` sysctl that crun tries to set on container + create is suppressed by `default_sysctls = []` in `containers.conf` (baked into the image). +- DNS *inside inner containers* under the docker-outer path works (docker bridge resolver). + Under podman-outer it does not (see T2 sub-question). + +## Recommendations for Stage 2 + +The Stage 2 image and appx's Stage 3 supervisor should transcribe this verbatim: + +1. **Keep the four `docker run` knobs** exactly: `--device /dev/net/tun`, + `--security-opt seccomp=<absolute path to seccomp-builder.json>`, `--security-opt apparmor=unconfined`, + `--security-opt systempaths=unconfined`. No `--privileged`, no `--cap-add`. +2. **Keep the `newuidmap`/`newgidmap` file-cap fix** in the Dockerfile — it is the linchpin. + If Stage 2 switches the base to `quay.io/podman/stable`, that image already does this. +3. **Pin native overlay** storage (no `mount_program`); do not re-add `--device /dev/fuse`. +4. **Keep the entrypoint runtime-state wipe** — it is what makes `docker restart` + + `podman start --all` a reliable Stage 4 recovery mechanism. +5. Ship `seccomp-builder.json` alongside the deploy scripts and reference it by absolute path; + `gen-seccomp.sh` regenerates it if the base podman version changes. +6. Replace the spike `CMD ["sleep","infinity"]` with the agent-server process; publish 4001 + and the app port range; add `AGENT_SERVER_*` env. The security flags are unaffected. +7. **Deferred TODO:** replace `apparmor=unconfined` with a tailored AppArmor profile that + permits the overlay `mount` (mirrors what we did for seccomp). Bounded containment loss + for now (seccomp + userns + cap-bounding still apply).
+ +## Open questions / blockers + +- **None blocking.** The chain works unprivileged with hardened host defaults. +- Re-verify on a genuine **Ubuntu 24.04** host (this box is 26.04, though the image is 24.04) + before production — expected to pass, but the host kernel/apparmor build differs. +- Tailored AppArmor profile (item 7 above) is the one remaining hardening refinement. +- If appx ever wants podman-on-host, settle the rootful-podman DNS configuration + (`aardvark-dns`/`netavark` or `--dns`) noted in the T2 sub-question. + diff --git a/container/entrypoint.sh b/container/entrypoint.sh new file mode 100755 index 0000000..aed9caf --- /dev/null +++ b/container/entrypoint.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Outer-container entrypoint (Stage 0 spike). +# +# 1. Provision the runtime dir rootless podman expects (no systemd-logind here). +# 2. Warm up podman storage so the first real build/run isn't slow and so a +# broken nested environment is visible in `docker logs` immediately. +# 3. Exec the CMD (spike: sleep infinity; Stage 2: agent-server). +set -euo pipefail + +mkdir -p "${XDG_RUNTIME_DIR:-/tmp/runtime-$(id -un)}" + +# XDG_RUNTIME_DIR is supposed to be ephemeral (tmpfs, wiped on boot). Here it +# lives in the container filesystem, so it SURVIVES `docker restart` — but the +# rootless-podman pause process it points at does NOT. The stale pause-pid then +# makes every podman call fail with: +# "invalid internal status, try resetting the pause process with +# 'podman system migrate': could not find any running process" +# Wiping the transient runtime state on each boot restores clean-start +# semantics; persistent state (images/containers metadata) lives in the +# ~/.local/share/containers named volume and is untouched. 
+rm -rf "${XDG_RUNTIME_DIR:?}/libpod" "${XDG_RUNTIME_DIR:?}/containers" \ + "${XDG_RUNTIME_DIR:?}/netns" "${XDG_RUNTIME_DIR:?}/crun" 2>/dev/null || true + +echo "[entrypoint] podman warmup starting ($(date -Is))" +if time podman info > /tmp/podman-info.log 2>&1; then + echo "[entrypoint] podman warmup OK" +else + # Don't die: keep the container alive so the spike agent can exec in and debug. + echo "[entrypoint] WARNING: podman info FAILED — see /tmp/podman-info.log:" + tail -n 20 /tmp/podman-info.log || true +fi + +exec "$@" diff --git a/container/gen-seccomp.sh b/container/gen-seccomp.sh new file mode 100755 index 0000000..baaadea --- /dev/null +++ b/container/gen-seccomp.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Regenerate container/seccomp-builder.json from podman's stock seccomp profile. +# +# Why this profile exists (Stage 0 spike, task T2): docker's DEFAULT seccomp +# profile blocks mount(2), which rootless podman needs even for unprivileged +# overlay/bind mounts, so the naive fix is seccomp=unconfined. Podman ships a +# profile that allows mount, but it gates a handful of syscalls behind +# "CAP_SYS_ADMIN" via the runtime's `includes.caps` mechanism. Our OUTER +# container is unprivileged (no CAP_SYS_ADMIN), so those rules are dropped and +# the gated syscalls fall through to ERRNO. Inner-container setup then dies at +# `sethostname: Operation not permitted`. +# +# This profile = podman's stock profile with the CAP_SYS_ADMIN gate removed +# from ONLY the namespace-setup syscalls the nested runtime needs +# (sethostname, setdomainname, setns). The genuinely dangerous gated syscalls +# (bpf, perf_event_open, quotactl, fanotify_init, lookup_dcookie) stay denied. +# Net result: a tailored profile that is strictly tighter than `unconfined`. +set -euo pipefail +cd "$(dirname "$0")" +docker build -t builder-outer . 
>/dev/null +cid=$(docker create builder-outer) +docker cp "$cid:/usr/share/containers/seccomp.json" /tmp/stock-seccomp.json +docker rm "$cid" >/dev/null +python3 - <<'PY' +import json +d=json.load(open('/tmp/stock-seccomp.json')) +NEED={'sethostname','setdomainname','setns'} +for s in d['syscalls']: + inc=s.get('includes',{}) + if s['action']=='SCMP_ACT_ALLOW' and inc.get('caps')==['CAP_SYS_ADMIN']: + s['names']=[n for n in s['names'] if n in NEED] + s.pop('includes',None) + # Podman's stock profile also ships a complementary SCMP_ACT_ERRNO (deny) + # rule gated on excludes.caps=[CAP_SYS_ADMIN] that ALSO names these three + # syscalls. If left in place the generated profile both ALLOWs and ERRNOs + # the same syscalls; which rule wins is libseccomp/runtime-version-defined. + # Strip the names from the deny rule so the ALLOW above is unambiguous. + if s['action']=='SCMP_ACT_ERRNO' and s.get('excludes',{}).get('caps')==['CAP_SYS_ADMIN']: + s['names']=[n for n in s['names'] if n not in NEED] +json.dump(d,open('seccomp-builder.json','w'),indent=1) +print("wrote seccomp-builder.json") +PY diff --git a/container/run-outer.sh b/container/run-outer.sh new file mode 100755 index 0000000..ddb4531 --- /dev/null +++ b/container/run-outer.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Build and (re)start the outer builder container — Stage 0 spike. +# +# The flag set below is the FINAL PROVEN minimal set (task T2 complete): each +# flag was deletion-tested and carries a one-line justification below and in +# SPIKE-FINDINGS.md. `./smoke.sh` exits 0 with exactly these flags. +# +# Hard constraints honoured: no --privileged, no --cap-add SYS_ADMIN, non-root +# user (outer main process is uid 1000 'builder'). +set -euo pipefail +cd "$(dirname "$0")" + +readonly IMAGE="builder-outer" +readonly NAME="builder-outer" +readonly SECCOMP="$(pwd)/seccomp-builder.json" + +docker build -t "$IMAGE" . 
+docker rm -f "$NAME" 2>/dev/null || true + +docker run -d --name "$NAME" \ + --device /dev/net/tun \ + --security-opt seccomp="$SECCOMP" \ + --security-opt apparmor=unconfined \ + --security-opt systempaths=unconfined \ + -v builder-workspace:/workspace \ + -v builder-podman-storage:/home/builder/.local/share/containers \ + -p 127.0.0.1:10000-10009:10000-10009 \ + "$IMAGE" + +# Final proven flag set (deletion-tested in T2; see SPIKE-FINDINGS.md): +# --device /dev/net/tun rootless slirp4netns networking opens /dev/net/tun; +# without it: 'open("/dev/net/tun"): No such file' +# seccomp=seccomp-builder.json tailored profile (podman's stock + ungated +# sethostname/setdomainname/setns). Docker's +# DEFAULT seccomp blocks mount(2) -> 'cannot +# re-exec process'. Strictly tighter than +# unconfined; see gen-seccomp.sh for provenance +# apparmor=unconfined docker-default apparmor blocks the overlay +# mount(2): 'mount ...overlay...: permission +# denied'. (Host apparmor_restrict_unprivileged_ +# userns is handled by the file-cap newuidmap fix, +# NOT by this flag.) TODO: tailored apparmor profile +# systempaths=unconfined docker masks /proc submounts; kernel +# mount_too_revealing() then blocks the inner +# container's fresh proc mount: 'mount proc to +# proc: Operation not permitted'. No caps/privilege +# builder-workspace volume project files must survive container recreate +# builder-podman-storage vol inner images/containers must survive recreate +# -p 127.0.0.1:10000-10009 app port range, loopback-only (appx proxies in) + +sleep 2 +docker logs "$NAME" +echo +echo "outer container '$NAME' is up. 
Try: docker exec -it $NAME podman info" diff --git a/container/seccomp-builder.json b/container/seccomp-builder.json new file mode 100644 index 0000000..c5a34e8 --- /dev/null +++ b/container/seccomp-builder.json @@ -0,0 +1,1038 @@ +{ + "defaultAction": "SCMP_ACT_ERRNO", + "defaultErrnoRet": 38, + "defaultErrno": "ENOSYS", + "archMap": [ + { + "architecture": "SCMP_ARCH_X86_64", + "subArchitectures": [ + "SCMP_ARCH_X86", + "SCMP_ARCH_X32" + ] + }, + { + "architecture": "SCMP_ARCH_AARCH64", + "subArchitectures": [ + "SCMP_ARCH_ARM" + ] + }, + { + "architecture": "SCMP_ARCH_MIPS64", + "subArchitectures": [ + "SCMP_ARCH_MIPS", + "SCMP_ARCH_MIPS64N32" + ] + }, + { + "architecture": "SCMP_ARCH_MIPS64N32", + "subArchitectures": [ + "SCMP_ARCH_MIPS", + "SCMP_ARCH_MIPS64" + ] + }, + { + "architecture": "SCMP_ARCH_MIPSEL64", + "subArchitectures": [ + "SCMP_ARCH_MIPSEL", + "SCMP_ARCH_MIPSEL64N32" + ] + }, + { + "architecture": "SCMP_ARCH_MIPSEL64N32", + "subArchitectures": [ + "SCMP_ARCH_MIPSEL", + "SCMP_ARCH_MIPSEL64" + ] + }, + { + "architecture": "SCMP_ARCH_S390X", + "subArchitectures": [ + "SCMP_ARCH_S390" + ] + } + ], + "syscalls": [ + { + "names": [ + "bdflush", + "io_pgetevents", + "kexec_file_load", + "kexec_load", + "migrate_pages", + "move_pages", + "nfsservctl", + "nice", + "oldfstat", + "oldlstat", + "oldolduname", + "oldstat", + "olduname", + "pciconfig_iobase", + "pciconfig_read", + "pciconfig_write", + "sgetmask", + "ssetmask", + "swapcontext", + "swapoff", + "swapon", + "sysfs", + "uselib", + "userfaultfd", + "ustat", + "vm86", + "vm86old", + "vmsplice" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": {}, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "_llseek", + "_newselect", + "accept", + "accept4", + "access", + "adjtimex", + "alarm", + "bind", + "brk", + "capget", + "capset", + "chdir", + "chmod", + "chown", + "chown32", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + 
"clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "clone", + "clone3", + "close", + "close_range", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "dup3", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_pwait2", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsconfig", + "fsetxattr", + "fsmount", + "fsopen", + "fspick", + "fstat", + "fstat64", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_time64", + "futimesat", + "get_mempolicy", + "get_robust_list", + "get_thread_area", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + "geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + "getresuid32", + "getrlimit", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "inotify_add_watch", + "inotify_init", + "inotify_init1", + "inotify_rm_watch", + "io_cancel", + "io_destroy", + "io_getevents", + "io_setup", + "io_submit", + "ioctl", + "ioprio_get", + "ioprio_set", + "ipc", + "keyctl", + "kill", + "landlock_add_rule", + "landlock_create_ruleset", + "landlock_restrict_self", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listxattr", + "llistxattr", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + 
"lstat64", + "madvise", + "mbind", + "membarrier", + "memfd_create", + "memfd_secret", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", + "mount", + "mount_setattr", + "move_mount", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", + "msgctl", + "msgget", + "msgrcv", + "msgsnd", + "msync", + "munlock", + "munlockall", + "munmap", + "name_to_handle_at", + "nanosleep", + "newfstatat", + "open", + "open_tree", + "openat", + "openat2", + "pause", + "pidfd_getfd", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "pivot_root", + "pkey_alloc", + "pkey_free", + "pkey_mprotect", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "process_mrelease", + "process_vm_readv", + "process_vm_writev", + "pselect6", + "pselect6_time64", + "ptrace", + "pwrite64", + "pwritev", + "pwritev2", + "read", + "readahead", + "readdir", + "readlink", + "readlinkat", + "readv", + "reboot", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_get_priority_max", + "sched_get_priority_min", + "sched_getaffinity", + "sched_getattr", + "sched_getparam", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", + "send", + "sendfile", + "sendfile64", + "sendmmsg", + "sendmsg", + 
"sendto", + "set_mempolicy", + "set_robust_list", + "set_thread_area", + "set_tid_address", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setns", + "setpgid", + "setpriority", + "setregid", + "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "setsid", + "setsockopt", + "setuid", + "setuid32", + "setxattr", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaction", + "sigaltstack", + "signal", + "signalfd", + "signalfd4", + "sigpending", + "sigprocmask", + "sigreturn", + "sigsuspend", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "syscall", + "sysinfo", + "syslog", + "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + "timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "umount", + "umount2", + "uname", + "unlink", + "unlinkat", + "unshare", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "wait4", + "waitid", + "waitpid", + "write", + "writev" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 0, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 8, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { 
+ "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 131072, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 131080, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 4294967295, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "sync_file_range2" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "arches": [ + "ppc64le" + ] + }, + "excludes": {} + }, + { + "names": [ + "arm_fadvise64_64", + "arm_sync_file_range", + "breakpoint", + "cacheflush", + "set_tls", + "sync_file_range2" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "arches": [ + "arm", + "arm64" + ] + }, + "excludes": {} + }, + { + "names": [ + "arch_prctl" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "arches": [ + "amd64", + "x32" + ] + }, + "excludes": {} + }, + { + "names": [ + "modify_ldt" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "arches": [ + "amd64", + "x32", + "x86" + ] + }, + "excludes": {} + }, + { + "names": [ + "s390_pci_mmio_read", + "s390_pci_mmio_write", + "s390_runtime_instr" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "arches": [ + "s390", + "s390x" + ] + }, + "excludes": {} + }, + { + "names": [ + "open_by_handle_at" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_DAC_READ_SEARCH" + ] + }, + "excludes": {} + }, + { + "names": [ + "open_by_handle_at" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + 
"comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_DAC_READ_SEARCH" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "setdomainname", + "sethostname", + "setns" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "excludes": {} + }, + { + "names": [ + "bpf", + "fanotify_init", + "lookup_dcookie", + "perf_event_open", + "quotactl" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "chroot" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_CHROOT" + ] + }, + "excludes": {} + }, + { + "names": [ + "chroot" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_CHROOT" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "delete_module", + "finit_module", + "init_module", + "query_module" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_MODULE" + ] + }, + "excludes": {} + }, + { + "names": [ + "delete_module", + "finit_module", + "init_module", + "query_module" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_MODULE" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "acct" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_PACCT" + ] + }, + "excludes": {} + }, + { + "names": [ + "acct" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_PACCT" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "kcmp", + "process_madvise" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_PTRACE" + ] + }, + 
"excludes": {} + }, + { + "names": [ + "kcmp", + "process_madvise" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_PTRACE" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "ioperm", + "iopl" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_RAWIO" + ] + }, + "excludes": {} + }, + { + "names": [ + "ioperm", + "iopl" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_RAWIO" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "clock_settime", + "clock_settime64", + "settimeofday", + "stime" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_TIME" + ] + }, + "excludes": {} + }, + { + "names": [ + "clock_settime", + "clock_settime64", + "settimeofday", + "stime" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_TIME" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "vhangup" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "comment": "", + "includes": { + "caps": [ + "CAP_SYS_TTY_CONFIG" + ] + }, + "excludes": {} + }, + { + "names": [ + "vhangup" + ], + "action": "SCMP_ACT_ERRNO", + "args": [], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_SYS_TTY_CONFIG" + ] + }, + "errnoRet": 1, + "errno": "EPERM" + }, + { + "names": [ + "socket" + ], + "action": "SCMP_ACT_ERRNO", + "args": [ + { + "index": 0, + "value": 16, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + }, + { + "index": 2, + "value": 9, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_AUDIT_WRITE" + ] + }, + "errnoRet": 22, + "errno": "EINVAL" + }, + { + "names": [ + "socket" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 2, + "value": 
9, + "valueTwo": 0, + "op": "SCMP_CMP_NE" + } + ], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_AUDIT_WRITE" + ] + } + }, + { + "names": [ + "socket" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 16, + "valueTwo": 0, + "op": "SCMP_CMP_NE" + } + ], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_AUDIT_WRITE" + ] + } + }, + { + "names": [ + "socket" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 2, + "value": 9, + "valueTwo": 0, + "op": "SCMP_CMP_NE" + } + ], + "comment": "", + "includes": {}, + "excludes": { + "caps": [ + "CAP_AUDIT_WRITE" + ] + } + }, + { + "names": [ + "socket" + ], + "action": "SCMP_ACT_ALLOW", + "args": null, + "comment": "", + "includes": { + "caps": [ + "CAP_AUDIT_WRITE" + ] + }, + "excludes": {} + } + ] +} \ No newline at end of file diff --git a/container/smoke.sh b/container/smoke.sh new file mode 100755 index 0000000..e5d47a0 --- /dev/null +++ b/container/smoke.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# Stage 0 acceptance test — exits 0 iff every REQUIRED check passes. +# +# This is the spike's definition of done (docs/plans/stage0-spike-brief.md). +# It validates the full nested chain on a fresh box: +# host → docker port publish → outer container → rootless podman → inner app +# +# Checks marked [observe] never fail the run; their outcome is recorded for +# SPIKE-FINDINGS.md (e.g. whether inner containers survive an outer restart). 
+set -uo pipefail +cd "$(dirname "$0")" + +readonly NAME="builder-outer" +readonly APP_PORT=10000 +PASS_COUNT=0 +FAIL_COUNT=0 + +# ── helpers ────────────────────────────────────────────────────────────────── + +pass() { echo " PASS: $1"; PASS_COUNT=$((PASS_COUNT + 1)); } +fail() { echo " FAIL: $1"; FAIL_COUNT=$((FAIL_COUNT + 1)); } + +check() { # check + local description="$1" + shift + if "$@" > /tmp/smoke-last.log 2>&1; then + pass "$description" + else + fail "$description" + sed 's/^/ | /' /tmp/smoke-last.log | tail -n 15 + fi +} + +outer_exec() { docker exec "$NAME" "$@"; } + +curl_app() { + curl -fsS --retry 10 --retry-delay 1 --retry-connrefused --retry-all-errors \ + "http://127.0.0.1:${APP_PORT}" > /dev/null +} + +# ── 1. fresh outer container ───────────────────────────────────────────────── + +echo "[1] build + start outer container" +check "run-outer.sh brings up the outer container" ./run-outer.sh + +echo "[2] outer container is unprivileged" +check "main process uid is 1000 (builder)" \ + bash -c "[ \"\$(docker exec $NAME id -u)\" = '1000' ]" +check "container is not privileged" \ + bash -c "[ \"\$(docker inspect -f '{{.HostConfig.Privileged}}' $NAME)\" = 'false' ]" + +echo "[3] podman works inside (warmup ran in entrypoint; see 'docker logs $NAME' for timing)" +check "podman info succeeds" outer_exec podman info + +# ── 2. inner run: pull + serve + full port chain ───────────────────────────── + +echo "[4] inner container serves through both forwarding layers" +outer_exec podman rm -f spike-web > /dev/null 2>&1 +check "podman run nginx publishing :${APP_PORT}" \ + outer_exec podman run -d --name spike-web \ + -p "${APP_PORT}:80" docker.io/library/nginx:alpine +check "host curl 127.0.0.1:${APP_PORT} reaches the inner nginx" curl_app + +# ── 3. 
inner build: storage driver + build path ────────────────────────────── + +echo "[5] podman build works inside" +check "podman build of a trivial image" outer_exec bash -c ' + build_dir=$(mktemp -d) && + printf "FROM docker.io/library/alpine:3.20\nRUN echo built-ok > /built\n" \ + > "$build_dir/Dockerfile" && + podman build -q -t spike-build-test "$build_dir" +' +check "built image runs and contains its layer" outer_exec bash -c \ + '[ "$(podman run --rm spike-build-test cat /built)" = "built-ok" ]' + +# ── 4. restart semantics ───────────────────────────────────────────────────── + +echo "[6] outer restart: persistence + recovery" +outer_exec bash -c "echo persists > /workspace/spike-marker" > /dev/null 2>&1 +docker restart "$NAME" > /dev/null +sleep 3 + +check "workspace volume survived restart" outer_exec bash -c \ + '[ "$(cat /workspace/spike-marker)" = "persists" ]' +check "podman image store survived restart" outer_exec bash -c \ + 'podman images --format "{{.Repository}}" | grep -q spike-build-test' + +inner_state=$(outer_exec podman inspect -f '{{.State.Status}}' spike-web 2>/dev/null || echo "gone") +echo " [observe] inner container state after outer restart: ${inner_state}" +if outer_exec podman start spike-web > /dev/null 2>&1 && curl_app; then + echo " [observe] 'podman start' resurrected the inner app (good for Stage 4: podman start --all)" + pass "app reachable again after restart (via podman start)" +else + echo " [observe] 'podman start' did NOT resurrect it — record in findings; trying re-create" + outer_exec podman rm -f spike-web > /dev/null 2>&1 + check "app reachable again after restart (via re-create)" outer_exec \ + podman run -d --name spike-web -p "${APP_PORT}:80" docker.io/library/nginx:alpine + check "host curl after re-create" curl_app +fi + +# ── summary ────────────────────────────────────────────────────────────────── + +echo +echo "──────────────────────────────────────────" +echo "smoke result: ${PASS_COUNT} passed, ${FAIL_COUNT} 
failed" +if [ "$FAIL_COUNT" -eq 0 ]; then + echo "STAGE 0 SMOKE: PASS" + exit 0 +fi +echo "STAGE 0 SMOKE: FAIL" +exit 1 diff --git a/docs/architecture/important/builder-container-architecture.md b/docs/architecture/important/builder-container-architecture.md index f48edc6..346c9cb 100644 --- a/docs/architecture/important/builder-container-architecture.md +++ b/docs/architecture/important/builder-container-architecture.md @@ -75,6 +75,26 @@ Build a system where: • Inner containers: untrusted, run LLM-generated code, no creds ``` +## Decision: appx terminates and proxies app traffic + +The two early container rigs published ports differently — the Stage 0 spike +(`container/`) bound `-p 127.0.0.1:10000-10009` (loopback only), while the +production draft (`docker/builder/`) published ports directly to the host +(`-p 4001`, `-p 3000-3010`). This is a real design decision, not an +implementation detail, so it is recorded here as the single source of truth: + +**appx terminates app traffic and proxies it into the outer container.** The +outer container publishes its ports on loopback only (`127.0.0.1:...`); appx is +the edge that accepts external traffic and forwards it to those loopback ports. +App ports are therefore **not** published directly to the host's external +interfaces. + +Rationale: keeping the outer container loopback-only means a single trusted +edge (appx) owns TLS termination, routing, and access control, and no +agent-spawned inner app is ever directly reachable from outside the host. Run +scripts and compose files should follow this contract; direct host publishing +is only for local manual testing. + ## Component Mapping | Concept | What it maps to in code | @@ -204,7 +224,7 @@ These aren't blockers for the stated case, just worth knowing: 2. **All projects share the outer container's filesystem quota.** One project filling `/workspace` affects everyone. Disk quota or per-project mount points if it matters. 3. 
**No process-level isolation between projects.** A bug in agent-server affects all projects. For single-admin, fine. 4. **First-time podman storage init is slow.** Add `podman info` to the entrypoint to warm up. -5. **Inner container ports must be allocated.** Either expose a port range (`-p 3000-3010:3000-3010`) and let the agent pick, or have a registry that hands out ports. The latter scales better. +5. **Inner container ports must be allocated.** Either expose a port range (`-p 127.0.0.1:3000-3010:3000-3010`) and let the agent pick, or have a registry that hands out ports. The latter scales better. Per the decision above, the outer container binds these on loopback only and appx proxies external traffic in. 6. **Outer container restart kills inner containers.** Inner Podman state lives in the outer container's filesystem. If you `docker restart builder`, all running apps die. Mount Podman storage as a volume if you want persistence: `-v podman-storage:/home/builder/.local/share/containers`. None of these are dealbreakers; just trade-offs to be aware of. diff --git a/docs/architecture/other/containerisation-plan.md b/docs/architecture/other/containerisation-plan.md new file mode 100644 index 0000000..3a3c7c1 --- /dev/null +++ b/docs/architecture/other/containerisation-plan.md @@ -0,0 +1,111 @@ +I've read through both codebases. Before proposing the plan, here's the grounding — what already exists and what the real gaps are. 
+ +## Current state (gap analysis) + +**Already built (more than the doc's "What Needs to Be Built" assumes):** + +| Piece | Status | +|---|---| +| agent-server multi-project registry, `/v1/projects` lifecycle, per-project sessions, shared AuthStorage | ✅ done | +| appx → agent-server registration (`agentserver.Client`, `EnsureProject`/`DeleteProject`, startup reconcile) | ✅ done | +| appx port allocation (10000–10999 in SQLite, atomic) | ✅ done | +| appx subdomain proxy `<project>.<base-domain>` → `127.0.0.1:<port>` | ✅ done | +| Chat proxy/mirror (`/api/pi/...`) with allow-list | ✅ done | +| Bearer token seam (`AGENT_SERVER_TOKEN`) | ✅ done | + +**Actual gaps:** + +1. **Port → agent handshake.** appx allocates the port but agent-server never learns it. The builder agent has no way to know it must run `podman run -p 10007:3000`. There's no metadata field on `POST /v1/projects`. +2. **Builder deploy conventions** — system prompt / skill telling the agent: podman exists, here's your port, here's the redeploy procedure, name containers `<project>-app`. +3. **Outer container image** — Dockerfile with nested rootless podman (the draft in `rootless-podman-isolation.md` is untested). +4. **appx managing the outer container** — today the deploy is two systemd units on the host; nothing in Go touches docker. +5. **Ops glue** — egress proxy reachability from inside the container, restart semantics (inner containers die with outer), volumes, deploy script rewrite. + +## Key design decisions to lock first + +These determine how cleanly the stages compose, so I'd decide them up front: + +**D1 — Port strategy: publish the appx port range on the outer container at create time.** +`docker run -p 127.0.0.1:10000-10099:10000-10099`. The decisive argument: **appx's subdomain proxy already targets `127.0.0.1:<port>` and needs zero changes** whether agent-server runs on host (early stages) or in the container (later stages).
Caveats: +- Docker spawns a `docker-proxy` process per published port — shrink the range to ~100 ports (single admin doesn't need 1000 projects; the DB range constant can stay, just cap allocation). +- Escalation path if it ever hurts: a single in-container reverse proxy (agent-server itself or caddy) on one published port, with appx sending `X-Appx-Port`. Don't build it now; it's a clean swap later because appx's routing is already centralized in one handler. +- Reject `--network=host` — it throws away the network isolation boundary the whole architecture exists for. + +**D2 — Deployment metadata handshake: extend `POST /v1/projects` with a `deployment` object.** +appx sends `{name, deployment: {port: 10007, url: "https://eventx.example.com"}}`; agent-server persists it in `projects.json` **and materializes it as `/.pi/deployment.json`** so the agent can `cat` it, plus injects a short deployment section into the system prompt context. File + prompt beats prompt-only: survives context compaction, agent can re-read it. + +**D3 — Outer container management in Go: shell out to the `docker` CLI behind an interface, not the Docker SDK.** +A `containerruntime` package with an interface (matching the existing `AgentRegistrar` fake-based testing pattern), implemented by exec-ing `docker` with `--format json`. Rationale: one container's lifecycle (ensure image, create, start, inspect, health-wait) doesn't justify the Docker SDK's dependency tree, and CLI-compat means the host runtime can be docker *or* podman for free. Industry practice is split here; for this scope CLI wins on simplicity and testability. + +**D4 — Builder agent's container runtime is config, not hardcoded.** +In dev on macOS you'll run agent-server on the host where the "podman" might be podman-machine or Docker Desktop. Make the deploy skill reference `$APP_CONTAINER_RUNTIME` (default `podman`) so stage-1 dev work transfers untouched to the nested setup. 
+ +## Staging: yes — podman-first, outer container later, plus one early de-risking spike + +Your instinct is right, with one amendment. The user-visible flow (steps 2–6 of your list) is ~80% of the work and is **completely independent of the outer container** — prompt engineering, the port handshake, redeploy UX all behave identically whether agent-server is containerized. The outer container is packaging/ops. *But* there's one real risk in deferring it: "works on host, breaks nested" — nested rootless podman has fragile flags (`/dev/fuse`, fuse-overlayfs, uidmap, seccomp). So: + +### Stage 0 — Nested-podman spike (timeboxed, ~1 day, parallel to everything) +Pure validation, no feature code. On a Linux box (Hetzner/Lima/OrbStack VM — **not** macOS-native): +- Build the draft Dockerfile from `rootless-podman-isolation.md`, fix it until `podman run -d -p 10000:80 nginx` works inside, and `curl 127.0.0.1:10000` works from the host through both forwarding layers. +- Verify: volume-mounted `/workspace` and podman storage survive container restart; `podman info` warmup time; behavior after `docker restart`. +- **Deliverable:** a known-good `Dockerfile` + `run-outer.sh` checked into `agent-server/` (or `appx/deploy/`), and a short findings note. This de-risks stages 3–4 and informs the stage-1 skill conventions. + +### Stage 1 — Full user flow, agent-server on host ("podman without outer container") +The whole loop from your list (create project → agent builds app → container → subdomain → refine → redeploy), minimal moving parts: + +- **agent-server:** `deployment` metadata on project create (contract + `ProjectStore` record + `deployment.json` materialization + system-prompt injection). All unit-testable with the existing `node:test` suites — extend `projectLifecycle.test.ts`. +- **appx:** `agentserver.Client.EnsureProject` gains the deployment payload (port from the store record, URL from `BaseDomain`); `Manager.Create` threads it through. 
Unit-test with the existing fake-registrar pattern. +- **Builder deploy skill/prompt:** conventions — read `.pi/deployment.json`, `$APP_CONTAINER_RUNTIME build -t <project>-app .`, run with `-p <port>:<container-port>`, named containers, redeploy = `stop && rm && run`, health-check with curl before declaring success. This is where iteration time goes. +- **Dev environment:** `task local` on macOS + Docker Desktop/podman-machine as the agent's runtime. The appx health checker (`AppRunning` TCP dial) already gives the UI deploy status for free. + +**Acceptance:** your steps 2–6 work end-to-end on `*.127.0.0.1.sslip.io` locally, including the refinement/redeploy cycle. + +### Stage 2 — agent-server inside the outer container, started manually +Take the Stage 0 image, make it real: install agent-server in it, entrypoint (podman warmup, env-key injection — already supported via `ANTHROPIC_API_KEY` config), volumes for `/workspace` + podman storage, publish `127.0.0.1:4001` + the app port range. Run it via script/compose; point host-run appx at it via `APPX_AGENT_SERVER_URL` — **no appx code changes**, because of D1. + +**Acceptance:** the exact Stage 1 e2e passes with agent-server containerized. This isolates "does the nested environment break the flow" from "does appx manage containers correctly". + +### Stage 3 — appx creates/supervises the outer container at startup +- `internal/containerruntime`: interface + docker-CLI implementation + fake; ensure-image / create-if-missing / start / health-wait (poll agent-server `/`), reconcile on appx boot. +- Config: `APPX_AGENT_CONTAINER=true`, image ref, port range, key passthrough, generated `AGENT_SERVER_TOKEN` (now **mandatory** — the port is published, and OWASP A01 says don't rely on loopback alone once another party could reach it).
+- Egress: outer container needs `--add-host=host.docker.internal:host-gateway` and `HTTPS_PROXY` pointed at the host egress proxy; the proxy must listen on the docker bridge, not just loopback — this is a real change to check in `egress.ProxyAddr`. +- Rewrite `deploy/`: drop `agent-server.service`, system-setup installs docker, appx.service gains docker group access. + +**Acceptance:** fresh VM → bootstrap → appx up → container exists → full e2e. + +### Stage 4 — Hardening +- **Restart semantics:** outer restart kills inner apps (doc limitation #6). Cheapest fix: entrypoint runs `podman start --all` after warmup; appx `AppRunning` already reflects reality in the UI either way. +- bash-tool `spawnHook` stripping `*_API_KEY` from child env (defense in depth per the architecture doc). +- `--memory`/`--cpus` on the outer container; idle-runtime eviction only if memory proves a problem. +- Security review pass (you have a precedent format in `docs/security/`). + +## Testing strategy (the cross-service/networking question) + +Apply the test pyramid, with an explicit rule for what each layer is allowed to touch: + +**1. Unit tests (every PR, no docker, both repos).** Everything behind seams: deployment-metadata plumbing (node:test), port-capped allocation, `containerruntime` logic against a fake CLI runner, client payloads against `httptest` fake agent-server. The repos already do exactly this style — extend, don't invent. + +**2. Deterministic infra smoke test — the key idea: bypass the LLM.** The expensive, flaky part of e2e is the agent; but the agent only ever runs bash commands. 
So script the *exact commands the deploy skill prescribes*: +- `scripts/smoke-deploy.sh`: start agent-server (host or container per stage) + `appx --http`, create a project via `POST /api/projects`, assert agent-server has it with the right port in `deployment.json`, then deploy a canned hello-world app by running the skill's literal command sequence (via `docker exec` into the outer container in stages 2+), and `curl http://<project>.127.0.0.1.sslip.io:8080` expecting 200. Then redeploy a modified version and assert the change is visible. +- This validates ports, volumes, proxy chain, registration — everything except LLM judgment — deterministically and in seconds-to-minutes. + +**3. Nested-container integration (Linux only, CI nightly or on-demand).** Stages 0/2/3 artifacts get a GitHub Actions job (ubuntu runners are full VMs; `--device /dev/fuse` works there — the podman project itself tests podman-in-docker this way): build outer image, run, podman hello-world inside, full port-chain curl. Don't attempt this on macOS at all; accept that macOS dev covers stage-1-shaped flows and Linux covers nesting. + +**4. LLM e2e — manual checklist + one optional scripted "golden prompt".** ("Build a single-page todo app and deploy it.") Asserts HTTP 200 on the subdomain within N minutes. Run before releases, not in CI — it's for catching prompt/skill regressions, and it's the only layer where prompt quality is visible. + +The principle threading through: **every networking boundary gets tested by a real connection at exactly one layer, and faked everywhere else.** No mocked-docker unit tests pretending to verify port forwarding, and no LLM in the loop for infrastructure verification. + +## Risk register (worth keeping visible) + +1. **Nested podman flags on target OS** — mitigated by Stage 0 spike before feature investment. +2. **Port-range publish overhead** — mitigated by capping at ~100; escalation to in-container proxy is pre-designed (D1). +3.
**macOS/Linux divergence** — accepted and bounded: macOS = flow/prompt dev, Linux = nesting truth. +4. **Outer restart kills apps** — Stage 4 `podman start --all`; UI already shows honest status. +5. **Egress proxy reachability from container** — explicitly scoped into Stage 3; easy to forget and "works in dev" (where agent-server is on host). +6. **Token becomes security-critical at Stage 3** — published port means bearer auth is no longer optional. + +--- + +Rough effort: Stage 0 ~1 day, Stage 1 ~2–3 days (mostly skill iteration), Stage 2 ~1 day, Stage 3 ~2–3 days, Stage 4 ~1–2 days. + +Want me to write this up as `docs/plans/` entries in the respective repos (the agent-server contract change and the appx container-supervisor design each deserve a short ADR), and/or create todos for the stages so work can be claimed per-stage? \ No newline at end of file diff --git a/docs/architecture/other/orchestrator-comparison.md b/docs/architecture/other/orchestrator-comparison.md new file mode 100644 index 0000000..2e02d0c --- /dev/null +++ b/docs/architecture/other/orchestrator-comparison.md @@ -0,0 +1,108 @@ +# appx-orchestrator vs. agent-server + appx + +**Date:** 2026-06-12 +**Context:** `appx-orchestrator` is a parallel implementation of the same +`docs/architecture/important/builder-container-architecture.md`. It is further +along on product surface (template, preview, theme) but stops at the network +edge. This doc is the concise "what to borrow, what to avoid, how appx closes +the gap" reference. + +Both share the same core: one agent-server process, per-project builder agents, +one outer container, rootless podman, inner app-only containers. They diverge on +**how the app reaches the user**, **who owns the app's port**, and **how +hardened the outer container is**. + +--- + +## 1. 
Good things worth adopting into agent-server + +| Feature | What it is | Adoption note | +| --- | --- | --- | +| **App template seeding** | A runnable Next.js app baked into the outer image; copied into `/workspace/` on first `POST /v1/projects`. Projects start as a working app, not an empty dir. | Highest-value borrow. agent-server needs the `templateDir` + `cpSync` seam (orchestrator already has it; ours doesn't yet). | +| **Iframe preview UX** | Live app embedded next to the chat panel. | The product surface users love. appx makes it work remotely (§3). | +| **DEV/PROD container split** | Multi-stage template Dockerfile: `dev` (hot-reload, preview) vs `prod` (built, served). | Good lifecycle model; adopt the multi-stage template. | +| **Theme bridge** | Shell theme propagates into the app via `--app-*` CSS tokens + origin-checked `postMessage`; app can push `appx:start-prompt`. | Nice-to-have polish; adopt once the core loop works. | +| **Global builder `AGENTS.md`** | Entrypoint installs a shared builder system prompt into `.pi-global/AGENTS.md`. | Complements our planned `deploy-app` skill — use both (prompt for "you're a builder", skill for deploy mechanics). | +| **Headless backend / client-only UI** | `agent-client` is imported only by the frontend; agent-server has zero UI deps. | We already do this (appx embeds the React UI). Confirms the boundary is right. | + +--- + +## 2. Issues with the orchestrator approach + +| # | Issue | Why it bites | +| --- | --- | --- | +| 1 | **Dangerously weakened outer container (the security boundary).** `run.sh` uses `--cap-add SYS_ADMIN` + `seccomp=unconfined` + `apparmor=unconfined` (+ `--device /dev/fuse`). | `--cap-add SYS_ADMIN` gives the outer container nearly-root powers over the host kernel (mounting, namespaces — the classic container-escape toolkit). 
Combined with their disabled seccomp+AppArmor, it badly weakens the very sandbox the whole design depends on — and our spike showed it's unnecessary: a 3-line `newuidmap` fix removes the need for it entirely (§3), with ~2× faster builds too. | +| 2 | **Localhost-only preview — unusable on a deployed server.** The iframe `src` is `http://127.0.0.1:<port>`, and an iframe URL is resolved by the *viewer's browser*, not the server. | `127.0.0.1` always means "the machine asking" — i.e. the viewer's own laptop. It only works when the browser and the outer container are the **same machine** (a dev laptop). Host the orchestrator on a server (e.g. Hetzner) and open it from your laptop: the shell + chat still work (chat is proxied server-side), but the preview iframe asks *your laptop* for `127.0.0.1:<port>`, where nothing is running → blank. Pointing it at the server's IP instead doesn't save it: that's an unauthenticated, non-TLS, port-collision-prone exposure, and an `http://` iframe inside an HTTPS shell is blocked as mixed content. **Net: the orchestrator preview is a single-machine dev tool; it cannot show a hosted app to a remote user.** | +| 3 | **Agent-authoritative ports + `app.json` discovery.** The agent picks a port from a fixed 3000–3010 range and writes `.pi/app.json`; the shell reads (and must sanitize) that agent-supplied URL. | See §2.3 — fragile, insecure, and inverts the authority we want. ~10 ports also caps concurrent apps and invites collisions. | +| 4 | **No external exposure.** No reverse proxy, no public URL, no per-user routing. | You can't hand anyone a link to a built app. The "PROD container" in the docs has no mechanism behind it — aspirational. | +| 5 | **No TLS, no auth in front of apps.** | Any attempt to expose the raw port range is unauthenticated + unencrypted. An HTTPS shell also can't embed an `http://` iframe (mixed-content block).
| +| 6 | **Thin control plane.** The Next.js `app-shell` has no egress control, no durable per-project/user records, no health model beyond a preview ping. | No allowlist on outbound agent traffic; no multi-tenant accounting. | + +**Root cause of 2–5:** the preview is built on the assumption that the viewer +sits at the machine running the app. The moment it's hosted, that assumption is +false. + +### 2.3 Why `app.json` is an anti-pattern for us (the port-authority point) + +orchestrator is **agent-authoritative** on the port: the agent chooses it, so +the shell has no other way to learn it — hence the agent writes `.pi/app.json` +and the shell reads it back. That forces: + +- **Fragility** — routing depends on an LLM correctly writing a file: it can be + missing, malformed, or stale after a redeploy. +- **A security smell** — the control plane consumes **agent-authored data for + routing**, so it must sanitize an agent-supplied URL (orchestrator literally + has `sanitizePreviewUrl`). If you have to sanitize it, you shouldn't be + trusting it as a source of truth. +- **Inverted authority** — it reintroduces, through the back door, the agent + control over ports that a deterministic design removes. + +We want the opposite direction of authorship: + +> **`deployment.json` (ours): written by the control plane, read by the agent — an instruction.** +> **`app.json` (theirs): written by the agent, read by the control plane — a discovery.** + +In our model **appx allocates the port** and already knows the URL at allocation +time. There is nothing to discover. The agent's only obligation is to **publish +the app on the port appx assigned** (`podman run -p <assigned-port>:<container-port>`). +Readiness — the one useful thing `app.json` gave orchestrator — is covered +better by appx's own **TCP health check on the assigned port** (the existing +`AppRunning`): deterministic, ground-truth, agent-independent. So `app.json` is +dropped entirely. + +--- + +## 3.
How appx unblocks each issue + +appx is the **public edge + control plane** that orchestrator lacks. + +| Issue (§2) | appx capability that resolves it | +| --- | --- | +| 1 — weakened outer container | Our **Stage 0 spike** outer image: file-cap `newuidmap` (drops `SYS_ADMIN`), **tailored seccomp** (replaces `unconfined`), **native overlay** (drops `/dev/fuse`, ~2× faster). Restores all three sandbox barriers — strictly more secure + faster. Adopt into the outer image; orchestrator's builder should too. | +| 2 — localhost-only preview | **Subdomain reverse proxy.** Each project gets `https://<project>.<base-domain>`, resolved server-side to the inner container's loopback port → the iframe loads from **any** browser, anywhere. | +| 3 — agent ports + app.json | appx **allocates a dedicated port per project** (durable store) and **pushes it down** as `deployment.json` + prompt context. The agent *must* publish on that port; it never chooses one. No agent-written routing file; readiness via appx's port health check. Stable, collision-free, deterministic URLs. | +| 4 — no external exposure | appx **is** the public origin. It routes `Host: <project>.<base-domain>` → `127.0.0.1:<port>` inside the host. "PROD" stops being aspirational: the subdomain *is* the production surface. | +| 5 — no TLS/auth | appx **terminates TLS** (Let's Encrypt via CertMagic) and runs **auth middleware in front of every app and the chat**. HTTPS end-to-end means the iframe embeds cleanly; auth means apps aren't open to the world. | +| 6 — thin control plane | appx adds the **egress allowlist proxy**, durable per-project records, password auth, and a real health model. | + +**Iframe caveat appx turns into a non-issue:** because appx is the doorman in +front of every app, it can set/strip `X-Frame-Options` / CSP `frame-ancestors` +at the proxy, so framing works over HTTPS regardless of what the generated app +emits. + +--- + +## 4.
Target shape (the synthesis) + +> Take orchestrator's **product surface** — template seeding, iframe preview, +> DEV/PROD split, theme bridge — and run it on **our agent-server**, with **appx +> providing the edge** (subdomain + TLS + auth + **deterministic port +> allocation**) and the **hardened outer container** from the Stage 0 spike. + +Result: *build on a server, view the live app from anywhere — embedded in an +iframe or at its own URL — securely.* That is precisely what orchestrator cannot +do today and what the appx control plane exists to enable. + +**Port authority — decided, not open:** appx allocates → the agent must bind the +app to that port via podman → appx routes the subdomain to it and health-checks +it. No agent-authored `app.json`; the control plane owns the address end to end. diff --git a/docs/architecture/other/sidecar-vs-orchestrator.md b/docs/architecture/other/sidecar-vs-orchestrator.md new file mode 100644 index 0000000..83e0f87 --- /dev/null +++ b/docs/architecture/other/sidecar-vs-orchestrator.md @@ -0,0 +1,90 @@ +I now have a clear picture of both. Note: the sidecar app (`create-appx-app`) actually _consumes_ `@appx/agent-server` — but it wires it in a fundamentally different **topology** than what agent-server was designed for. Here's the comparison. + +## First, what the two approaches actually are + +**Approach A — Sidecar (create-appx-app):** The builder is _embedded inside the app it builds_. One app = one agent-server pointed at `PROJECT_DIR = its own repo`. The preview pane and the chat dock are sibling React components in the _same_ running app, and the agent edits the _same source files_ that are being served. 1:1:1 — one app, one agent, one project dir. + +**Approach B — Orchestrator (agent-server's intended design):** One long-lived agent-server process is a _central control plane_ that manages **many** isolated projects. `ProjectRegistry` → lazily-built `ProjectRuntime` per project → `Map`. 
The builder is **separate** from the apps it builds; agents modify code in `WORKSPACE_DIR//` and spin up the actual apps as **inner containers** (rootless podman) inside an unprivileged outer container. The builder never _is_ the app. + +The key structural difference: in A the agent edits the process that hosts the agent UI; in B the agent and the app it produces are different processes (indeed different containers). + +--- + +## Dimension-by-dimension + +### 1. Isolation & security + +- **Sidecar:** Weak. Agent runs with `PROJECT_DIR` = the live app repo, same filesystem, same trust zone as the served UI. LLM-generated code and the control plane share one process boundary. Fine for a trusted single developer locally; not safe for running untrusted generated code or hostile prompts. +- **Orchestrator:** Strong, by design. Three trust zones (host / trusted outer container with creds / untrusted inner containers running LLM code with **no creds**). Generated apps run nested in podman, can't touch the host, never see LLM keys. This is the whole point of `builder-container-architecture.md`. + +**Winner: Orchestrator** (clear, for anything beyond local single-user dev). + +### 2. Preview / feedback loop + +- **Sidecar:** Tightest possible loop. Agent writes a file → Next.js Fast Refresh → preview updates in-place in milliseconds, _in the same window as the chat_. No build, no container, no port allocation. This is the "build and preview simultaneously" magic. +- **Orchestrator:** Looser. Agent runs `podman build` + `podman run`, allocates a port, you hit the app on a separate URL. Seconds-to-minutes per iteration, and preview is a separate surface from the builder UI. + +**Winner: Sidecar** (for iteration speed and the unified build-while-previewing UX). + +### 3. Multi-tenancy & scale + +- **Sidecar:** None. One app, one agent. To build N apps you run N independent app+agent pairs, each with its own credentials wiring. 
No shared model registry, no central registry of projects. +- **Orchestrator:** Built for this. Durable `projects.json` registry, lazy per-project runtimes, **one shared `AuthStorage`/`ModelRegistry`** across all projects (set keys once at boot), idempotent project creation, restart-safe via the mounted volume. + +**Winner: Orchestrator.** + +### 4. Blast radius / self-modification risk + +- **Sidecar:** The dangerous one. The agent edits the _same code that renders the agent's own chat UI_. As we discussed, an edit to `app-shell.tsx`, `layout.tsx`, or `pi-runtime-provider.tsx` can break the refresh boundary or introduce a syntax error that takes down the builder UI itself — a chicken-and-egg failure. The `AGENTS.md` prompt explicitly begs the agent to "keep the chat shell working," which is a soft guardrail around a hard structural hazard. +- **Orchestrator:** The builder control plane is physically separate from the built app. A broken generated app crashes its inner container; the agent-server, its UI, and other projects are untouched. (Caveat: all projects share _one_ agent-server process, so an agent-server bug — not an app bug — affects everyone. Documented limitation #3.) + +**Winner: Orchestrator** for control-plane safety. + +### 5. Credentials handling + +- **Sidecar:** Token injected server-side by the Next proxy (`AGENT_SERVER_TOKEN`), keys via env / LiteLLM extension. Decent, but the agent and the LLM keys live in the same trust zone as the served app. +- **Orchestrator:** Keys live only in agent-server's heap in the trusted outer container; inner app containers never receive them unless explicitly passed; defense-in-depth via bash `spawnHook`. Cleaner separation. + +**Winner: Orchestrator.** + +### 6. Simplicity & developer experience + +- **Sidecar:** Dead simple to grasp and run: `pnpm agent:dev` + `pnpm dev`, or one Docker container with a bind mount. No podman, no nested containers, no port broker, no project registry. 
The repo _is_ the template — clone, point the agent at it, go. Great for "scaffold me an app and let me watch it form." +- **Orchestrator:** More moving parts: outer container, rootless podman, FUSE device, port allocation, project provisioning, registry persistence. More to operate and debug. + +**Winner: Sidecar** for low-friction local/single-app use. + +### 7. Resource management & failure modes + +- **Sidecar:** No shared-resource contention because there's nothing shared — but also no limits and no orchestration; you manage each pair by hand. +- **Orchestrator:** Central place to apply `--memory`/`--cpus`, idle-eviction sweeps, disk quotas. But projects share the outer container's CPU/mem/disk, so a runaway build in one project can starve another (documented limitations #1, #2). + +**Roughly even** — different trade-offs. + +### 8. Deployment lifecycle + +- **Sidecar:** There isn't really one — the "app" is the dev server. Great for prototyping, awkward as a path to a deployable artifact (you'd graduate it out of the sidecar shell). +- **Orchestrator:** Produces real container images via podman — a natural path to deploy/scale (escalates cleanly to per-user containers, k8s pods, Sysbox/microVMs per the escalation table). + +**Winner: Orchestrator** for productionization. 
+ +--- + +## How to think about it + +They're not really competitors — they're optimized for different points in the lifecycle, and the sidecar is essentially **one constrained configuration** of the orchestrator (single project, `projectDir` = the app, no inner containers, hot-reload instead of podman): + +| | Sidecar (create-appx-app) | Orchestrator (agent-server) | +| -------------------- | ----------------------------------------------- | ------------------------------------------- | +| Best for | Single dev, single app, fast in-place iteration | Many projects, isolation, deployable apps | +| Preview | Same window, instant Fast Refresh | Separate URL, container build | +| Isolation | Same process/trust zone | Outer + nested inner containers | +| Multi-project | No | Yes (registry, shared auth) | +| Control-plane safety | Agent can break its own UI | Builder separate from built app | +| Credentials | Same zone as served app | Isolated in outer container heap | +| Ops complexity | Minimal | Higher (podman, ports, registry) | +| Scale path | Re-architect | Layered escalation (per-user, k8s, microVM) | + +**Bottom line:** The sidecar buys an unbeatable build-while-previewing feedback loop and trivial setup, at the cost of isolation, multi-tenancy, and the structural hazard of an agent that can crash its own UI. The orchestrator buys security boundaries, multi-project scale, shared credentials, deployable artifacts, and a clean separation between builder and built — at the cost of a slower loop and more operational machinery. + +A pragmatic product often wants **both**: the orchestrator as the backbone (isolation, multi-project, deploy), with a sidecar-style hot-reload preview surface bolted on for the inner-loop UX during active editing. 
diff --git a/docs/superpowers/plans/2026-05-27-credentials-extraction.md b/docs/plans/2026-05-27-credentials-extraction.md similarity index 100% rename from docs/superpowers/plans/2026-05-27-credentials-extraction.md rename to docs/plans/2026-05-27-credentials-extraction.md diff --git a/docs/superpowers/plans/2026-06-02-pi-conventions-alignment.md b/docs/plans/2026-06-02-pi-conventions-alignment.md similarity index 100% rename from docs/superpowers/plans/2026-06-02-pi-conventions-alignment.md rename to docs/plans/2026-06-02-pi-conventions-alignment.md diff --git a/docs/plans/builder-containers-plan.md b/docs/plans/builder-containers-plan.md new file mode 100644 index 0000000..ff5bc1d --- /dev/null +++ b/docs/plans/builder-containers-plan.md @@ -0,0 +1,306 @@ +# Plan: Containerised Apps — agent-server Side + +**Date:** 2026-06-11 +**Status:** Draft +**Scope:** Deployment metadata contract (dev + prod), app template seeding, two-container (dev/prod) deploy model, builder deploy skill/prompt, outer container image (nested rootless podman), smoke tests +**Canonical architecture:** `docs/architecture/important/builder-container-architecture.md` +**Sibling plan:** appx repo, `docs/plans/phase_9_plan.md` (control plane: port allocation, container supervision, subdomain routing) + +--- + +## Goal + +Implement agent-server's half of the containerised apps architecture: + +1. appx starts ONE outer container at boot (agent-server + rootless podman inside). +2. User creates a project in the appx UI; appx allocates **two ports** (a DEV and a PROD port) and registers the project here **with deployment metadata** (both ports + their public URLs). +3. New projects are **seeded from a baked-in app template**, so they start as a runnable, containerised app (a lean multi-stage Dockerfile — no framework dev-server). +4. The builder agent builds **one image** and runs it as **two inner podman containers** — DEV (iterate) and PROD (stable/shared) — each publishing its reserved port. 
DEV and PROD are the **same build** ("what you see is what ships"). +5. The user iterates against the DEV URL; refinements rebuild + redeploy DEV. When happy, the agent **promotes** (rebuilds PROD from current source). Both are real `https://…` URLs exposed by appx. + +## Division of labour + +| Concern | Owner | +|---|---| +| Port allocation (**two ports/project**), subdomain proxy (**dev + prod**), outer container lifecycle, host deploy scripts | appx | +| Project registry + deployment metadata persistence, surfacing metadata to the agent, deploy skill, outer container **image**, podman-in-container validation | agent-server (this plan) | + +agent-server stays appx-agnostic: it receives a generic `deployment` object (dev + prod `{port, url}` pairs) on project create and makes it available to the agent. It never knows how appx mints ports or subdomains — only that two pairs were handed to it. + +> **appx-side implication (track in `phase_9_plan.md`):** appx must allocate a **pair** of ports per project and route **two** subdomains (prod `…`, dev e.g. `…-dev.`). The 100-port publish cap therefore means ~50 projects, not 100 — revisit the cap there. + +--- + +## Design decisions + +### D1 — Deployment metadata rides on `POST /v1/projects` (dev + prod) + +Extend the create-project contract with an optional `deployment` object carrying +**two environments**, each a `{port, url}` pair: + +```jsonc +POST /v1/projects +{ + "name": "eventx", + "deployment": { + "dev": { "port": 10006, "url": "https://eventx-dev.example.com" }, + "prod": { "port": 10007, "url": "https://eventx.example.com" } + } +} +``` + +- Both `dev` and `prod` (and their fields) are optional, so a single-port local dev setup or a partial registration is still valid. +- Persisted on the `ProjectRecord` in `projects.json`. +- **Idempotent re-POST with the same name updates `deployment`** (today it returns the existing record untouched). This lets appx's startup reconcile heal drift — e.g. 
a project that predates this feature gets its ports on the next boot. +- Returned by `GET /v1/projects` / `GET /v1/projects/:id` so the control plane can verify registration. +- agent-server never *reads a port back* from the agent — the pairs flow one way (appx → metadata → agent). Readiness is appx's concern (a health check on each port), never an agent-authored file. (Contrast: orchestrator's `.pi/app.json` — see `docs/architecture/other/orchestrator-comparison.md` §2.3.) + +### D2 — Metadata is surfaced to the agent as file + prompt section + +Two mechanisms, both generated from the same record: + +1. **`<project-dir>/.pi/deployment.json`** — materialised on create/update. The agent can `cat` it any time; it survives context compaction and session restarts. +2. **System prompt injection** — `ProjectRuntime` appends a short generated "Deployment" section to the resolved system prompt when metadata exists (after the `.pi/AGENTS.md` content, never replacing it): + +``` +## Deployment +This project runs as TWO containers from the SAME build (two instances, not two builds): +- DEV (iterate here): host port 10006 → https://eventx-dev.example.com (container <project>-app-dev) +- PROD (stable, shared): host port 10007 → https://eventx.example.com (container <project>-app-prod) +Refinements rebuild + redeploy DEV; PROD changes only when you "promote". +The app listens on its container port; map it with -p <host-port>:<container-port>. +Container runtime: podman. See the deploy-app skill for build/run/redeploy/promote conventions. +Machine-readable copy: .pi/deployment.json +``` + +File-only would risk the agent never reading it; prompt-only would risk loss on compaction. Both is cheap. + +### D3 — Container runtime for the agent is env config, not hardcoded + +`APP_CONTAINER_RUNTIME` env var (default `podman`), threaded into the prompt section and the skill. In the outer container it is always `podman`; in local macOS dev (Stage 1, agent-server on host) it may be `docker` (Docker Desktop) or podman-machine.
The skill references `$APP_CONTAINER_RUNTIME` so Stage 1 prompt iteration transfers untouched to the nested setup. + +### D4 — Deploy conventions live in a skill, not only in AGENTS.md + +Ship a `deploy-app` skill in this repo (`skills/deploy-app/SKILL.md`), loaded via `PI_SKILL_PATHS` in the outer image. Skills are versioned with agent-server, independent of any one project's `.pi/`, and the prompt section stays short (conventions load only when the agent deploys). + +### D5 — New projects are seeded from a baked-in app template + +`POST /v1/projects` copies a configured template dir into a freshly-created +`WORKSPACE_DIR/{id}/` (only when the dir did not already exist; existing projects +are untouched). A seeded, runnable starting point means the deploy skill's +`build` + `run` commands work from the very first prompt instead of +relying on the LLM to scaffold a correct app + Dockerfile from scratch. The +**seeding mechanism** (a `templateDir` + recursive copy with a skip filter for +`node_modules`/build output) is lifted from appx-orchestrator (comparison §1). + +- Config: `APPX_TEMPLATE_DIR` (absent ⇒ projects start empty, as today). +- The outer image bakes the template at a fixed path and points `APPX_TEMPLATE_DIR` at it. + +**The template *content* is deliberately undecided.** We do not yet know the +target stack — it may not even be a JS frontend (a Python backend, a full-stack +framework, or several selectable templates are all on the table). Consequences: + +- **Start minimal:** a small **Vite SPA** is the provisional default, not Next.js + — fewer moving parts, faster builds under nested podman, easier to reason about + while the core loop is proven. Treat it as a throwaway placeholder, not a commitment. +- **No framework assumptions leak outward.** agent-server, the deployment metadata, + and the deploy skill stay stack-agnostic — the only contract is "a Dockerfile + that builds a lean image serving on a port." 
Swapping the + template, or supporting several, must require zero changes outside `APPX_TEMPLATE_DIR`. +- **We author the template's Dockerfile; we don't copy orchestrator's.** + Orchestrator's is a useful *structural* reference for the multi-stage shape + (deps → build → lean runtime), but its final stage ships the whole build tree + (dev deps + source) and runs as root — not optimal. Ours has a lean runtime + stage and a non-root user. The inner-app spike (`container/INNER-APP-SPIKE.md`) + **proved this builds and serves under nested rootless podman + native overlay** + (a 64 MB non-root Vite image; a Python app worked identically). + +### D6 — Two containers per app: DEV and PROD (same build, two instances) + +Each project deploys as two inner containers built from the **same Dockerfile** +(one build target — no dev/prod divergence), on the two ports appx allocated (D1): + +| Env | Container | Image | Host port | Purpose | +|---|---|---|---|---| +| DEV | `-app-dev` | `-app:dev` | dev port | What the user iterates against; rebuilt + redeployed on each refinement. | +| PROD | `-app-prod` | `-app:prod` | prod port | The stable/shared URL; rebuilt only on **promote**. | + +- **DEV is built exactly like PROD** (real built image, real server) — no + framework dev-server, no hot-reload. "What you see in DEV is what ships," and + every stack is treated uniformly (build an image, run it). This deliberately + keeps app-specific dev-server quirks (Vite `allowedHosts`, HMR WebSockets) out + of the template, the skill, **and** appx. See *Potential improvements* for the + hot-reload escape hatch if rebuild latency ever bites. +- **Refinement loop (goal step 5):** rebuild the image and redeploy **DEV only** + (~seconds; the spike measured ~0.7 s warm rebuild + a fast `rm`/`run`). PROD's + URL stays stable while the user iterates. +- **Promote** = rebuild PROD from current source and restart its container, so + PROD matches what the user approved in DEV. 
+- Two image tags (`:dev`/`:prod`) keep the instances independent snapshots even + though they come from one Dockerfile. + +--- + +## Staging (shared with appx plan) + +| Stage | What | Repo focus | +|---|---|---| +| 0 | Nested rootless podman spike (timeboxed ~1 day) | agent-server | +| 1 | Full user flow with agent-server **on host** ("podman without outer container") | both | +| 2 | agent-server inside the outer container, started manually | agent-server | +| 3 | appx creates/supervises the outer container at startup | appx | +| 4 | Hardening (restarts, key stripping, resource limits) | both | + +Rationale: the user-visible flow (Stage 1) is ~80% of the value and is independent of the outer container; the outer container is packaging. The Stage 0 spike de-risks the one thing that could invalidate Stage 1 decisions — nested podman flag fragility ("works on host, breaks nested"). + +--- + +## Stage 0 — Nested-podman spike ✅ DONE + +Validated on a Linux host. Artifacts committed under `container/`; full writeup in +`container/SPIKE-FINDINGS.md`. Nested rootless podman works **unprivileged** with +hardened host defaults intact. + +**Proven outer-container recipe (supersedes the old draft flags):** +- **No `--privileged`, no `--cap-add SYS_ADMIN`**; the outer process runs as uid 1000. +- The linchpin fix: repackage `newuidmap`/`newgidmap` with **file capabilities** (not setuid-root) — this, not any host sysctl, is what makes nested userns work. +- Required `docker run` flags (each deletion-tested): `--device /dev/net/tun`, `--security-opt seccomp=container/seccomp-builder.json` (tailored, **stricter than `unconfined`**), `--security-opt apparmor=unconfined` (TODO: tailor), `--security-opt systempaths=unconfined`. +- **Native rootless overlay** storage — `/dev/fuse` dropped, ~2× faster builds. +- Restart recovery: entrypoint wipes stale `XDG_RUNTIME_DIR` runtime state, then `podman start --all` cleanly resurrects inner containers (informs Stage 4). 
+- No host changes needed beyond installing docker. + +**Committed:** `container/{Dockerfile, run-outer.sh, entrypoint.sh, smoke.sh, seccomp-builder.json, gen-seccomp.sh, SPIKE-FINDINGS.md}`. Stages 2+ transcribe this recipe verbatim. One caveat: re-verify on a genuine Ubuntu 24.04 host (the spike box was 26.04 / kernel 7.0; the in-image podman target is 24.04). + +--- + +## Stage 1 — Deployment metadata + deploy skill (agent-server on host) + +### Contract & registry + +- [ ] `src/contract`: add `deployment` (optional `{ dev?: {port?, url?}; prod?: {port?, url?} }`) to the create-project request and the `ProjectInfo` response schemas; regenerate `openapi.json` +- [ ] `src/runtime/projectStore.ts`: `ProjectRecord` gains optional `deployment`; loader tolerates records without it (backward compatible) +- [ ] `src/runtime/projectRegistry.ts`: + - `createProject({ name, deployment })` persists metadata; **same-name re-POST updates `deployment`** and rewrites the materialised file + - materialise `/.pi/deployment.json` (pretty-printed, stable key order) on create/update + - **template seeding (D5):** when the project dir is created fresh and `APPX_TEMPLATE_DIR` is set, recursively copy it in (skip `node_modules`/`.next`/`dist`/caches); leave existing dirs untouched. Lift orchestrator's `cpSync` + filter implementation +- [ ] `src/http/projectsRoutes.ts`: accept/return the new field; validation: each present port must be an integer in 1024–65535 (reject privileged/garbage values at the boundary — fail fast) +- [ ] `src/config.ts`: add `APPX_TEMPLATE_DIR` (optional; absent ⇒ no seeding) + +### Runtime / prompt + +- [ ] `src/config.ts`: add `APP_CONTAINER_RUNTIME` (default `"podman"`), validated non-empty string +- [ ] `src/runtime/projectRuntime.ts`: extend `resolveSystemPrompt` (or a sibling helper) to append the generated Deployment section when the project has metadata. 
Keep generation in one pure function (`buildDeploymentPromptSection(deployment, containerRuntime)`) so it is unit-testable without a runtime + +### Deploy skill + +- [ ] `skills/deploy-app/SKILL.md` with the conventions (DEV + PROD, per D6 — same build, two instances): + - read `.pi/deployment.json` for the dev/prod ports and URLs + - DEV (refine): `$APP_CONTAINER_RUNTIME build -t {id}-app:dev .` → `run -d --name {id}-app-dev -p {dev-port}:{container-port} {id}-app:dev` + - PROD (promote): `$APP_CONTAINER_RUNTIME build -t {id}-app:prod .` → `run -d --name {id}-app-prod -p {prod-port}:{container-port} {id}-app:prod` + - no `--target`: the template's Dockerfile has one final (lean, non-root) image; DEV and PROD differ only by tag/instance/port + - redeploy: `stop && rm && build && run` under the same `--name` (idempotent; never accumulate containers); refinements rebuild **DEV only**, promote rebuilds PROD + - `{container-port}` is a template detail (e.g. 8080); always map `-p {host-port}:{container-port}`, never assume they're equal + - multi-container apps (db etc.): suffix names `{id}-db`, only the app publishes the reserved port(s); inter-container traffic via a `{id}` podman network + - health check before declaring success: `curl -fsS 127.0.0.1:{host-port}` with retries; report the relevant public URL to the user + - **never** pass `*_API_KEY` env vars into app containers +- [ ] Wire the skill into local dev runs via `PI_SKILL_PATHS` (document in README); the outer image bakes it in at Stage 2 + +### Tests (Stage 1) + +- [ ] `test/projectLifecycle.test.ts`: deployment metadata (dev+prod) round-trips create → get → list; re-POST same name updates it; `.pi/deployment.json` written and rewritten; absent metadata ⇒ no file, no prompt section; **template seeding** copies into a fresh dir and skips an existing one +- [ ] New `test/deploymentPrompt.test.ts`: `buildDeploymentPromptSection` output for dev-only / prod-only / both / absent metadata +- [ ] Manual e2e (with appx running locally — see appx plan): create project in UI (seeded template runs immediately) → prompt a small change → DEV URL updates →
promote → PROD URL reflects it. This is where skill iteration happens. + +**Acceptance:** the full create → deploy → view → refine → redeploy loop works locally with agent-server run via `npm run dev` and Docker Desktop/podman as `APP_CONTAINER_RUNTIME`. + +--- + +## Stage 2 — Outer container image + +Promote the **committed Stage 0 artifacts** (`container/Dockerfile`, +`run-outer.sh`, `entrypoint.sh`, `seccomp-builder.json`) from "keeps the container +alive for exec" to "runs agent-server". Keep the proven flag set and the +`newuidmap` file-cap + native-overlay fixes **verbatim** — do not reintroduce +`/dev/fuse`, `SYS_ADMIN`, or `seccomp=unconfined`. + +- [ ] `container/Dockerfile` — extend the spike image: + - **multi-stage build** (lift orchestrator's pattern): a Node build stage that compiles agent-server, then copy the pruned runtime into the spike's ubuntu:24.04 stage (e.g. `npm ci && npm run build` then copy `dist/` + production deps; orchestrator uses `pnpm deploy --prod /app`) + - keep the spike's rootless-podman setup (file-cap helpers, native-overlay `storage.conf`, `containers.conf`, subuid/subgid) unchanged + - bake `skills/deploy-app` at a fixed path; set `PI_SKILL_PATHS` + - bake the **app template** (provisional: a minimal Vite SPA, see D5 — lean multi-stage, single runtime target, non-root) at a fixed path; set `APPX_TEMPLATE_DIR`.
`container-smoke.sh` builds it under nested rootless podman (proven in the inner-app spike; the smoke guards against regression) +- [ ] `container/entrypoint.sh` — extend the spike entrypoint: + - keep the stale-runtime-state wipe + `podman info` warmup + - replace `sleep infinity` with agent-server (env: `WORKSPACE_DIR=/workspace`, `ANTHROPIC_API_KEY`, `AGENT_SERVER_TOKEN`, `APP_CONTAINER_RUNTIME=podman`, `APPX_TEMPLATE_DIR=...`, `AGENT_SERVER_HOST=0.0.0.0` — the container boundary takes over loopback's role; the **publish** stays loopback-only on the host side) +- [ ] `container/run-outer.sh` — extend the spike script: + - add `-p 127.0.0.1:4001:4001` (API) alongside the existing app-port range publish (now a **pair-aware** range; see appx plan for the revised cap given two ports/project) + - keep volumes (workspace + named podman-storage volume) and the proven security flags +- [ ] Run the **same Stage 1 manual e2e** with host-run appx pointed at the container via `APPX_AGENT_SERVER_URL=http://127.0.0.1:4001` — zero appx code changes expected + +### Tests (Stage 2) + +- [ ] `scripts/container-smoke.sh` (Linux): build image → run → poll `GET /` until healthy → `POST /v1/projects` with deployment metadata → assert `deployment.json` inside the container → `docker exec` the skill's literal command sequence to build **the seeded template** once and run it as DEV + PROD instances on the two ports (a realistic multi-stage build under nested rootless podman — not just nginx) → `curl 127.0.0.1:{dev-port}` and `curl 127.0.0.1:{prod-port}` from the host → restart outer container → assert registry + workspace survived. + This deliberately **bypasses the LLM**: the agent only ever runs bash commands, so executing the skill's exact commands validates all infrastructure (ports, volumes, nesting) deterministically.
+- [ ] CI: nightly/on-demand GitHub Actions job (ubuntu runners are full VMs; `--device /dev/net/tun` works there) running `container-smoke.sh` + +**Acceptance:** Stage 1 e2e passes with agent-server containerised; `container-smoke.sh` green on Linux. + +--- + +## Stage 4 — Hardening (agent-server items) + +(Stage 3 is appx-side; see sibling plan.) + +- [ ] Entrypoint resurrects inner apps after an outer restart: **wipe stale `XDG_RUNTIME_DIR` runtime state first**, then `podman start --all` (the spike proved bare `podman start --all` fails without the wipe; `entrypoint.sh` already does this — confirm it covers both DEV and PROD containers). Architecture doc limitation #6 +- [ ] Bash tool `spawnHook` strips `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` / `*_API_KEY` from child process env — defence in depth so keys can't leak into `podman run -e`-style invocations even by accident (OWASP secrets-management alignment; keys live in the process heap, not child envs) +- [ ] Optional golden-prompt LLM smoke (manual, pre-release): "build a single-page todo app and deploy it" → assert HTTP 200 on the reserved port within N minutes. Catches prompt/skill regressions; not CI + +--- + +## Testing strategy summary + +Every networking boundary is tested by a real connection at exactly one layer and faked everywhere else: + +| Layer | What | Where | +|---|---|---| +| Unit (`node:test`, every PR) | metadata round-trip, prompt section builder, config validation | this repo | +| Infra smoke (deterministic, no LLM) | full port/volume/nesting chain via skill's literal commands | `scripts/container-smoke.sh`, Linux CI | +| Cross-service smoke | appx ↔ agent-server ↔ subdomain chain | appx repo, `scripts/smoke-deploy.sh` | +| LLM e2e | prompt/skill quality | manual golden-prompt checklist | + +## Potential improvements (deferred — not v1) + +Validated or low-risk upgrades we defer to keep v1 simple and uniform. None +require app-specific logic in appx.
+ +### Hot-reload DEV (instant refinements) + +The inner-app spike (`container/INNER-APP-SPIKE.md`, T3) **proved** a faster +refinement loop is feasible: bind-mount the project dir into the DEV container +(`-v :/app` plus an anonymous `-v /app/node_modules` so the mount +doesn't shadow installed deps) and run the framework's dev server. The agent then +edits files in `/workspace` and the running DEV app **hot-reloads with no rebuild +or redeploy** — HMR fired across the mount on native overlay, no polling needed. + +Deferred because it breaks v1's uniformity: +- It's **framework-specific** (needs a dev server with HMR; a built static app or + a Python service has no equivalent), so it can't be the universal model. +- It reintroduces dev-server quirks the template + skill must handle — Vite's + `server.allowedHosts` must include the project's dev subdomain (fed via env from + `.pi/deployment.json`), and the dev server's HMR WebSocket must traverse appx's + subdomain proxy. + +Safe to add later because it needs **no appx change specific to it**: +`allowedHosts` lives in the template + skill; WebSocket forwarding is a generic +proxy capability appx needs for user apps regardless. Adopt per-template if the +rebuild-redeploy latency (a few seconds) proves to be real friction. + +## Risks + +1. **Nested podman flags on target OS** — retired by Stage 0 (proven recipe committed); only residual is re-verifying on a genuine Ubuntu 24.04 host. +2. **"Works on host, breaks nested"** — mitigated by D3 (`APP_CONTAINER_RUNTIME`) + skill conventions written against `deployment.json`, not host assumptions. +3. **Skill quality** — the only part needing real-LLM iteration; isolated in Stage 1 where the feedback loop is fastest (no containers in the way). +4. **Outer restart kills inner apps** — addressed in Stage 4 (stale-state wipe + `podman start --all`); appx UI already shows honest per-port health. +5. 
**Two ports/project halves project density** under appx's published-port cap and doubles subdomains — tracked in `phase_9_plan.md`; revisit the cap. +6. **Refinement latency** — dev=prod means every refinement is a rebuild + redeploy (~seconds, not instant). Accepted for v1; hot-reload (see *Potential improvements*) is the escape hatch and needs no appx change. + +(Realistic multi-stage builds under nesting — once a risk — are now **validated** by `container/INNER-APP-SPIKE.md`: dev+prod instances on two ports, redeploy with layer cache, and a Python app all worked unprivileged; Stage 2 smoke guards against regression.) diff --git a/docs/superpowers/plans/project-runtime-and-session-split.md b/docs/plans/project-runtime-and-session-split.md similarity index 100% rename from docs/superpowers/plans/project-runtime-and-session-split.md rename to docs/plans/project-runtime-and-session-split.md diff --git a/docs/plans/stage0-spike-brief.md b/docs/plans/stage0-spike-brief.md new file mode 100644 index 0000000..a0ddd0a --- /dev/null +++ b/docs/plans/stage0-spike-brief.md @@ -0,0 +1,181 @@ +# Stage 0 Spike Brief — Nested Rootless Podman ("Outer Builder Container") + +**Date:** 2026-06-11 +**Parent plan:** `docs/plans/builder-containers-plan.md` (Stage 0) +**Architecture reference:** `docs/architecture/important/builder-container-architecture.md` +**Background reading:** `docs/misc/other/rootless-podman-isolation.md` (the untested draft this spike validates) + +This document has two audiences: + +- **Section 0** is the runbook for the human operator preparing the box. +- **Sections 1+** are the brief for the coding agent executing the spike. + +--- + +## 0. Operator runbook (human — do this before handing off) + +Target: a throwaway Linux cloud VM. **Ubuntu 24.04** (it is the assumed production host OS and ships the strictest user-namespace defaults — if the spike passes here, easier distros are free). 
Minimum 2 vCPU / 4 GB RAM / 40 GB disk; see the hardware discussion in the parent plan thread. + +```bash +# ── as root on the fresh server ────────────────────────────────────────────── +apt-get update && apt-get install -y git curl rsync tmux jq +curl -fsSL https://get.docker.com | sh # Docker CE from the official repo +curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && apt-get install -y nodejs + +# Work user for the coding agent. docker group is root-equivalent, and the +# agent additionally gets passwordless sudo because T2 requires testing +# host-level mitigations (sysctls, AppArmor profiles). Acceptable ONLY because +# this box is throwaway and holds nothing but the spike + a disposable API key. +adduser --disabled-password --gecos "" spike +usermod -aG docker spike +echo 'spike ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/spike && chmod 0440 /etc/sudoers.d/spike + +# SSH access for spike: the account has NO password (--disabled-password locks +# it), so reuse the key Hetzner provisioned for root. +mkdir -p /home/spike/.ssh +cp /root/.ssh/authorized_keys /home/spike/.ssh/authorized_keys +chown -R spike:spike /home/spike/.ssh +chmod 700 /home/spike/.ssh && chmod 600 /home/spike/.ssh/authorized_keys + +# Swap (mandatory on a 4 GB box; harmless on bigger ones) +fallocate -l 4G /swapfile && chmod 600 /swapfile && mkswap /swapfile && swapon /swapfile +echo '/swapfile none swap sw 0 0' >> /etc/fstab + +# ── ship the repo: public repo → plain HTTPS clone, no credentials needed ──── +# First, from your laptop: commit + push the spike files (container/, docs/plans/) +# so the clone includes them. 
Then, as spike on the box: +git clone https://github.com/appx-org/agent-server.git ~/agent-server +cd ~/agent-server && git switch -c stage0-spike +git config user.name "stage0 spike agent" && git config user.email spike@localhost +# Deliberately NO push credentials on the box (the agent has sudo); results +# come back via git-over-SSH fetch from the laptop — see acceptance below. + +# ── as spike user: install the coding agent + a DISPOSABLE API key ────────── +ssh spike@ +npm config set prefix ~/.npm-global && echo 'export PATH=$HOME/.npm-global/bin:$PATH' >> ~/.bashrc +npm install -g @earendil-works/pi-coding-agent # or however you install pi +echo 'export ANTHROPIC_API_KEY=sk-ant-...' >> ~/.bashrc # fresh key, revoke after spike +source ~/.bashrc + +# ── launch (inside tmux so it survives SSH drops) ─────────────────────────── +tmux new -s spike +cd ~/agent-server && pi +``` + +Kickoff prompt to paste into the agent: + +> Read `docs/plans/stage0-spike-brief.md` in this repo — section 0 is already +> done; execute sections 1–7. You have passwordless sudo for host-level +> changes; record every host change and every finding in +> `container/SPIKE-FINDINGS.md` as you go. Commit your work to the current +> `stage0-spike` branch in small, described steps (you cannot push — that's +> expected; the operator fetches from this box). The definition of done is +> `./container/smoke.sh` exiting 0 under the brief's hard constraints (no +> `--privileged`, no `SYS_ADMIN`, non-root outer user). 
+ +**Acceptance (operator, when the agent reports done):** + +```bash +# Re-verify from a clean slate — proves the findings, not the accumulated state: +cd ~/agent-server/container +docker rm -f builder-outer; docker volume rm -f builder-workspace builder-podman-storage +docker system prune -af +./smoke.sh # must exit 0 +``` + +Then: check `SPIKE-FINDINGS.md` is fully filled (every flag justified, host +prereqs listed), and pull the agent's branch straight off the box — commit +history included, still no credentials on the server: + +```bash +# from laptop, inside the agent-server repo +git remote add spikebox spike@:agent-server +git fetch spikebox stage0-spike +git switch stage0-spike # review, then merge/PR and push from the laptop +``` + +Finally: revoke the spike API key; destroy the server or keep it for Stage 1/2 +iteration (resizing up is easier than re-provisioning if you keep it). + +--- + +## 1. Mission (coding agent starts here) + +Prove that an **unprivileged** Docker container can run **rootless Podman** well enough to build and serve real apps, on this exact host. Produce a known-good, *minimal* configuration that later stages will copy verbatim. + +Success is binary: `./container/smoke.sh` exits 0 on this box, with a flag set you can justify line by line. + +You are NOT building agent-server integration, prompts, or anything product-shaped. Infrastructure validation only. + +## 2. What is on disk + +| Path | What it is | +|---|---| +| `container/Dockerfile` | Draft outer image (Ubuntu 24.04 + podman stack). Starting point — expect to fix it. | +| `container/entrypoint.sh` | Runtime-dir setup + podman warmup, then execs CMD. | +| `container/run-outer.sh` | Builds the image and (re)starts the outer container with the **candidate** flag set. | +| `container/smoke.sh` | The acceptance test. Your iteration loop is: edit → `./smoke.sh` → read failures → repeat. | +| `container/SPIKE-FINDINGS.md` | Findings template. 
Fill it in **as you go**, not at the end. | + +## 3. Hard constraints + +1. **No `--privileged`. Ever.** The outer container being unprivileged *is the security boundary of the whole architecture* — a privileged "pass" is worthless. +2. **No `--cap-add SYS_ADMIN`** unless you have exhausted alternatives; if you genuinely cannot avoid it, that is a major finding — document the exact error and stop to flag it. +3. The outer container's main process must run as a **non-root user** (uid 1000 `builder`). `--user 0` workarounds are failures. +4. Host-level changes (sysctls, apparmor profiles, packages) are **allowed but must be recorded** in findings — they become requirements for appx's deploy scripts (`system-setup.sh`). +5. Work only in `~/agent-server/container/` and on docker state. Don't touch the rest of the repo. + +## 4. Tasks and acceptance criteria + +### T1 — Make the nested chain work +- [ ] `./container/run-outer.sh` brings up the outer container; `docker exec builder-outer id -u` → `1000` +- [ ] Inside: `podman run -d -p 10000:80 docker.io/library/nginx:alpine` succeeds +- [ ] From the **host**: `curl -fsS http://127.0.0.1:10000` returns the nginx page (host → docker publish → outer netns → podman forward → inner container) +- [ ] Inside: `podman build` of a trivial image succeeds and the built image runs + +### T2 — Minimise and justify the flag set +- [ ] Deletion-test every `docker run` security flag and every host-level change: remove one, re-run `smoke.sh`, record the exact error it causes (or remove it permanently if nothing breaks) +- [ ] Try replacing `seccomp=unconfined` with a tailored profile (Podman ships one that allows `mount`; see hints). If it works, prefer it; if not, record why — `unconfined` is acceptable for now with a documented TODO +- [ ] **Outer-runtime sub-question (informs appx Stage 3):** the host runtime can be docker *or* podman. 
Podman's default seccomp profile allows `mount(2)` where docker's blocks it, so a podman *outer* may not need `seccomp=unconfined` at all. If podman is available on the box, run the same nested test with `podman run` as the outer command and record which flags become unnecessary. This decides whether `system-setup.sh` should prefer podman-on-host for a smaller attack surface +- [ ] Outcome: `run-outer.sh` contains only flags that each carry a one-line justification in findings + +### T3 — Persistence and restart semantics +- [ ] `docker restart builder-outer`: workspace volume content and podman images (named volume) survive +- [ ] Record what happens to *running* inner containers across the restart (expected: stopped). Test whether `podman start --all` resurrects them cleanly — this decides Stage 4's recovery mechanism +- [ ] Record first-`podman info` cold warmup time vs warmed (entrypoint logs it) + +### T4 — Storage driver determination +- [ ] The draft pins `fuse-overlayfs`. Test native rootless overlayfs (kernel ≥ 5.13 supports it; this host is 6.8+): remove `mount_program` from `storage.conf`, reset podman storage, re-run smoke. Record which works and which is faster; pin the winner +- [ ] Last-resort fallback if both overlay variants fail: `driver = "vfs"` — needs no FUSE device and no overlay nesting at all, at the cost of full-copy layers (slow, disk-hungry). If only VFS works, that's a major finding: record it and flag before Stage 2 builds on it + +### T5 — Findings +- [ ] `container/SPIKE-FINDINGS.md` fully filled in (template provided). The Stage 2 image and appx's Stage 3 container-supervisor transcribe your flag set verbatim — incomplete findings = repeated debugging later + +## 5. Known pitfalls (read before debugging blind) + +These are researched, not guessed — check them in this order when something EPERMs: + +1. 
**Ubuntu 24.04 blocks unprivileged user namespaces via AppArmor.** `kernel.apparmor_restrict_unprivileged_userns=1` is default; nested podman fails with `apparmor="DENIED" operation="userns_create"` (visible in host `dmesg`/`journalctl -k`). Candidate fixes, in preference order — test which is actually sufficient: + a. `--security-opt apparmor=unconfined` on the outer container (containment loss is acceptable: seccomp/userns remain); + b. a host AppArmor profile granting `userns` to the container runtime; + c. host sysctl `kernel.apparmor_restrict_unprivileged_userns=0` (bluntest; if this is the only thing that works, record it as a deploy-script requirement). +2. **Docker's default seccomp profile blocks `mount(2)`**, which rootless podman needs even for unprivileged FUSE/bind mounts. Hence `seccomp=unconfined` in the draft. The tailored alternative: Podman's own `seccomp.json` (in the `containers-common` package, `/usr/share/containers/seccomp.json`) allows `mount` — try `--security-opt seccomp=/path/on/host/seccomp.json`. +3. **`/etc/subuid` / `/etc/subgid`** entries for `builder` must exist *inside the image* (draft has them). Errors like `cannot find UID/GID for user builder` point here; `podman system migrate` after changing them. +4. **No systemd inside the container** → `cgroup_manager = "cgroupfs"` and `events_logger = "file"` (draft sets both in `containers.conf`). Resource limits inside the nest may be unavailable; that's fine, record it. +5. **`XDG_RUNTIME_DIR`** must exist and be writable (no systemd-logind to create `/run/user/1000`). Draft uses `/tmp/runtime-builder` via entrypoint. +6. **Use fully-qualified image names** (`docker.io/library/nginx:alpine`) — Ubuntu's podman has no unqualified-search registries configured and will error or prompt. +7. **`--userns=keep-id` is a podman flag, not docker.** The reference doc's draft run command mixes them up; ignore it. With docker, "unprivileged" = `USER builder` in the image + no added caps. +8. 
**Sanity-check trick:** `quay.io/podman/stable` is the upstream podman-in-container reference image. If our image fails mysteriously, run the same nested command in `podman/stable` with the same docker flags — if that also fails, the problem is host/flags; if it passes, the problem is our Dockerfile. Note the image only solves the *in-image* half (packages, subuid, conf); the docker-run flags and host prereqs are required with it too. +9. **Sanctioned fallback:** if our Ubuntu-based Dockerfile fights you past ~2 hours of in-image issues, switching the base to `quay.io/podman/stable` (adding the `builder` uid-1000 user on top) is an acceptable T1 outcome — record the trade-off (Fedora base, unpinned podman version) in findings and keep the rest of the constraints unchanged. Host-side flag minimisation (T2) is unaffected by the base choice. There is field evidence this matters: in-image config differences alone have made the difference between needing `--privileged` and not (stackoverflow.com/q/75244579). +10. **Canonical reference:** Dan Walsh's "How to use Podman inside of a container" (redhat.com/en/blog/podman-inside-container) is the authoritative walkthrough of every rootful/rootless nesting combination; our candidate flag set matches its non-privileged rootless-in-docker recipe. Consult it before inventing anything novel. + +## 6. Method + +- Iterate exclusively through `./container/smoke.sh` — it is the definition of done. Improve it if it misses something real (e.g. you discover DNS inside inner containers is broken — add a check), but never weaken a check to pass. +- One change at a time; record each finding immediately in `SPIKE-FINDINGS.md`. +- Host kernel logs are your AppArmor/seccomp oracle: `sudo journalctl -k --since -5min | grep -i -E 'apparmor|audit'`. +- Disk hygiene on a small box: `docker system prune -f` and `podman system prune -f` (inside) between heavy iterations. + +## 7. 
Timebox & escalation + +This spike is timeboxed to ~1 day of focused work. If the chain fundamentally cannot work unprivileged on Ubuntu 24.04 (i.e. the only ways to make it pass would violate hard constraints 1–2), stop and write up: the exact failure, kernel/audit evidence, and which of the architecture's escalation paths (Sysbox runtime, different host distro, host-level podman) looks cheapest. Do not silently downgrade the constraints to get a green smoke run. diff --git a/docs/superpowers/plans/use-agent-session-services.md b/docs/plans/use-agent-session-services.md similarity index 100% rename from docs/superpowers/plans/use-agent-session-services.md rename to docs/plans/use-agent-session-services.md