From 0033d9757a483df9afba0b49a59e8be69ba02400 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 22:58:34 -0700
Subject: [PATCH 01/14] wip: adjust e2e gating in release workflows

---
 .github/workflows/release-dev.yml | 2 +-
 .github/workflows/release-tag.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/release-dev.yml b/.github/workflows/release-dev.yml
index ff51540e..61a545d5 100644
--- a/.github/workflows/release-dev.yml
+++ b/.github/workflows/release-dev.yml
@@ -70,7 +70,7 @@ jobs:
 
   tag-ghcr-dev:
     name: Tag GHCR Images as Dev
-    needs: [build-gateway, build-cluster, e2e]
+    needs: [build-gateway, build-cluster]
     runs-on: build-amd64
     timeout-minutes: 10
     steps:
diff --git a/.github/workflows/release-tag.yml b/.github/workflows/release-tag.yml
index cda74d1e..84a66818 100644
--- a/.github/workflows/release-tag.yml
+++ b/.github/workflows/release-tag.yml
@@ -177,7 +177,7 @@ jobs:
 
   publish-python:
     name: Publish Python
-    needs: [build-python-wheels]
+    needs: [build-python-wheels, e2e]
     runs-on: [self-hosted, nv]
     timeout-minutes: 10
     env:

From 364965a6cc7289a938ea63fa3a2e0f66dbe7756d Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 23:01:13 -0700
Subject: [PATCH 02/14] ci(release): gate python wheels on e2e for tagged
 releases

- Add e2e to publish-python needs in release-tag.yml so wheels are not
  published to Artifactory until e2e passes
- Remove e2e gate from tag-ghcr-dev in release-dev.yml since dev Docker
  images do not need to wait for e2e
- Replace gitlab-master.nvidia.com references with generic example host
  in policy-advisor CTF example

Note: the two workflow changes listed above are contained in the diff of
PATCH 01/14; the diff of this commit only touches the policy-advisor
example files.
---
 examples/policy-advisor/README.md | 4 ++--
 examples/policy-advisor/ctf.py    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/policy-advisor/README.md b/examples/policy-advisor/README.md
index 9dbfcb0b..a90f0a72 100644
--- a/examples/policy-advisor/README.md
+++ b/examples/policy-advisor/README.md
@@ -47,7 +47,7 @@ the script progresses through each gate.
 | 4 | The Oracle | `api.github.com:443` | Concurrent with 5 and 6 |
 | 5 | The Jester | `icanhazdadjoke.com:443` | Concurrent with 4 and 6 |
 | 6 | The Sphinx | `catfact.ninja:443` | Concurrent with 4 and 5 |
-| 7 | The Vault | `gitlab-master.nvidia.com:443` | Internal IP -- mapper adds `allowed_ips` |
+| 7 | The Vault | `internal.corp.example.com:443` | Internal IP -- mapper adds `allowed_ips` |
 
 Gates 1-3 run sequentially so you can observe the single-approval flow.
 Gate 3 uses `curl` to hit `ifconfig.me:80` -- a different endpoint that only
@@ -123,7 +123,7 @@ Gate 3 uses `curl` to reach `ifconfig.me:80`.  You'll see a new rule for
 When Gates 4-6 start, all three denials arrive together.  Press `A` to approve
 all pending recommendations at once.
 
-Gate 7 requires `allowed_ips` because `gitlab-master.nvidia.com` resolves to a
+Gate 7 requires `allowed_ips` because `internal.corp.example.com` resolves to a
 private IP.  The mapper detects this automatically and includes the resolved IPs
 in the proposed rule.
 
diff --git a/examples/policy-advisor/ctf.py b/examples/policy-advisor/ctf.py
index d62c4504..5974268f 100644
--- a/examples/policy-advisor/ctf.py
+++ b/examples/policy-advisor/ctf.py
@@ -155,9 +155,9 @@ def log(level: str, msg: str, **kv: object) -> None:
     {
         "num": 7,
         "name": "The Vault",
-        "host": "gitlab-master.nvidia.com",
+        "host": "internal.corp.example.com",
         "port": 443,
-        "url": "https://gitlab-master.nvidia.com/",
+        "url": "https://internal.corp.example.com/",
         "method": "GET",
         "headers": {"User-Agent": "openshell-ctf"},
         "body": None,

From c33d7dc81b55045d4fad153d753c72b2cf71273e Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 23:25:46 -0700
Subject: [PATCH 03/14] ci(canary): switch release canary to Docker-in-Docker
 with single-command test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch the release canary from Docker-outside-of-Docker (host socket
mount) to true Docker-in-Docker. The CI container now starts its own
dockerd, so the gateway cluster container is a child process and
127.0.0.1 port bindings are reachable directly.

This enables testing the real zero-to-sandbox user path: a single
`openshell sandbox create` that auto-bootstraps the gateway, pulls the
cluster image, and creates a sandbox — no --gateway-host workaround.

Dockerfile.ci changes:
- Add iptables (required by dockerd for container networking)
- Extract full Docker daemon suite (dockerd, containerd, runc) instead
  of CLI only

release-canary.yml changes:
- Remove /var/run/docker.sock volume mount
- Add dockerd startup step
- Remove gateway host resolution and explicit gateway start steps
- Simplify canary to single auto-bootstrap sandbox create command
---
 .github/workflows/release-canary.yml | 41 +++++++++++++++++-----------
 deploy/docker/Dockerfile.ci          | 14 ++++++++--
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml
index 76b0d317..284e5fe2 100644
--- a/.github/workflows/release-canary.yml
+++ b/.github/workflows/release-canary.yml
@@ -35,14 +35,16 @@ jobs:
             target: aarch64-unknown-linux-musl
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 30
+    # Run with Docker-in-Docker: the CI container starts its own dockerd so
+    # the gateway cluster container is a child process. This means 127.0.0.1
+    # port bindings are reachable directly, matching the real user experience
+    # without needing --gateway-host workarounds.
     container:
       image: ghcr.io/nvidia/openshell/ci:latest
       credentials:
         username: ${{ github.actor }}
         password: ${{ secrets.GITHUB_TOKEN }}
       options: --privileged
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
     env:
       OPENSHELL_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
@@ -71,6 +73,18 @@ jobs:
             fi
           fi
 
+      - name: Start Docker daemon
+        run: |
+          # Start dockerd for Docker-in-Docker. The CI container runs
+          # --privileged so the daemon can create networks and cgroups.
+          # Using DinD means the gateway container is a child of this
+          # container's daemon, so 127.0.0.1 port bindings are reachable
+          # directly — no --gateway-host workaround needed.
+          dockerd &>/var/log/dockerd.log &
+          echo "Waiting for Docker daemon..."
+          timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
+          echo "Docker daemon ready"
+
       - name: Install CLI from GitHub Release
         run: ./install.sh
         env:
@@ -80,28 +94,23 @@ jobs:
       - name: Verify CLI installation
         run: openshell --version
 
-      - name: Resolve gateway host
-        run: |
-          # On Linux CI runners host.docker.internal is not set automatically
-          # (it's a Docker Desktop feature). Add it via the Docker bridge IP.
-          if ! getent hosts host.docker.internal >/dev/null 2>&1; then
-            BRIDGE_IP=$(docker network inspect bridge --format '{{(index .IPAM.Config 0).Gateway}}')
-            echo "Adding /etc/hosts entry: ${BRIDGE_IP} host.docker.internal"
-            echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts
-          fi
-
-      - name: Start gateway
-        run: openshell gateway start --gateway-host host.docker.internal
-
       - name: Run canary test
         run: |
           set -euo pipefail
 
-          echo "Creating sandbox and running 'echo hello world'..."
+          # Single-command canary: tests the full zero-to-sandbox path.
+          # With no gateway configured, `sandbox create` auto-bootstraps a
+          # gateway (pulls the cluster image from GHCR, starts k3s, deploys
+          # the control plane, generates mTLS PKI), then creates a sandbox
+          # and runs the command inside it.
+          echo "Creating sandbox (with auto-bootstrap) and running 'echo hello world'..."
           OUTPUT=$(openshell sandbox create --no-keep --no-tty -- echo "hello world" 2>&1) || {
             EXIT_CODE=$?
             echo "::error::openshell sandbox create failed with exit code ${EXIT_CODE}"
             echo "$OUTPUT"
+            echo ""
+            echo "--- dockerd logs ---"
+            cat /var/log/dockerd.log || true
             exit $EXIT_CODE
           }
 
diff --git a/deploy/docker/Dockerfile.ci b/deploy/docker/Dockerfile.ci
index 55ff8032..3ce19d8a 100644
--- a/deploy/docker/Dockerfile.ci
+++ b/deploy/docker/Dockerfile.ci
@@ -17,7 +17,9 @@ ENV MISE_DATA_DIR=/opt/mise
 ENV MISE_CACHE_DIR=/opt/mise/cache
 ENV PATH="/opt/mise/shims:/root/.cargo/bin:/root/.local/bin:$PATH"
 
-# Install system dependencies
+# Install system dependencies.
+# iptables is required for Docker-in-Docker networking (dockerd uses it for
+# container NAT and bridge rules).
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ca-certificates \
     curl \
@@ -34,16 +36,22 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     xz-utils \
     jq \
     rsync \
+    iptables \
     && rm -rf /var/lib/apt/lists/*
 
-# Install Docker CLI and buildx plugin used by CI jobs
+# Install Docker CLI, daemon, and buildx plugin.
+# The full daemon (dockerd + containerd + runc) is needed for Docker-in-Docker
+# workflows like the release canary test, where the CI container runs its own
+# Docker daemon instead of mounting the host socket.
 RUN case "$TARGETARCH" in \
       amd64) docker_arch=x86_64; buildx_arch=amd64 ;; \
       arm64) docker_arch=aarch64; buildx_arch=arm64 ;; \
       *) echo "Unsupported TARGETARCH: $TARGETARCH"; exit 1 ;; \
     esac \
     && curl -fsSL "https://download.docker.com/linux/static/stable/${docker_arch}/docker-${DOCKER_VERSION}.tgz" \
-    | tar xz --strip-components=1 -C /usr/local/bin docker/docker \
+    | tar xz --strip-components=1 -C /usr/local/bin \
+        docker/docker docker/dockerd docker/containerd \
+        docker/containerd-shim-runc-v2 docker/runc \
     && mkdir -p /usr/local/lib/docker/cli-plugins \
     && curl -fsSL "https://github.com/docker/buildx/releases/download/${BUILDX_VERSION}/buildx-${BUILDX_VERSION}.linux-${buildx_arch}" \
       -o /usr/local/lib/docker/cli-plugins/docker-buildx \

From 6cf4c4447a5ba951a800f2a7f11d2ae365307bd6 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 23:32:07 -0700
Subject: [PATCH 04/14] fix(ci): add docker-proxy binary and use dedicated DinD
 socket

The first canary run revealed two issues:

1. dockerd failed to start because docker-proxy was not extracted from
   the Docker static binary tarball. Add it to the extraction list.

2. The GitHub Actions runner injects its own Docker socket into job
   containers. Without an explicit DOCKER_HOST, the openshell CLI
   connected to the runner's host Docker daemon instead of our DinD
   daemon. Start dockerd on a dedicated socket (/var/run/dind.sock)
   and export DOCKER_HOST so all subsequent steps use it.
---
 .github/workflows/release-canary.yml | 15 +++++++++++----
 deploy/docker/Dockerfile.ci          |  2 +-
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml
index 284e5fe2..5d90e9f0 100644
--- a/.github/workflows/release-canary.yml
+++ b/.github/workflows/release-canary.yml
@@ -75,15 +75,22 @@ jobs:
 
       - name: Start Docker daemon
         run: |
-          # Start dockerd for Docker-in-Docker. The CI container runs
+          # Start our own dockerd for Docker-in-Docker. The CI container runs
           # --privileged so the daemon can create networks and cgroups.
           # Using DinD means the gateway container is a child of this
           # container's daemon, so 127.0.0.1 port bindings are reachable
           # directly — no --gateway-host workaround needed.
-          dockerd &>/var/log/dockerd.log &
-          echo "Waiting for Docker daemon..."
-          timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
+          #
+          # We listen on a dedicated socket and set DOCKER_HOST so that the
+          # openshell CLI (and bollard) connects to our DinD daemon rather
+          # than the GitHub Actions runner's injected host socket.
+          export DOCKER_HOST="unix:///var/run/dind.sock"
+          dockerd --host "$DOCKER_HOST" &>/var/log/dockerd.log &
+          echo "Waiting for Docker daemon on ${DOCKER_HOST}..."
+          timeout 30 sh -c 'until docker --host "$DOCKER_HOST" info >/dev/null 2>&1; do sleep 1; done'
           echo "Docker daemon ready"
+          # Persist DOCKER_HOST for subsequent steps
+          echo "DOCKER_HOST=${DOCKER_HOST}" >> "$GITHUB_ENV"
 
       - name: Install CLI from GitHub Release
         run: ./install.sh
diff --git a/deploy/docker/Dockerfile.ci b/deploy/docker/Dockerfile.ci
index 3ce19d8a..01b55e86 100644
--- a/deploy/docker/Dockerfile.ci
+++ b/deploy/docker/Dockerfile.ci
@@ -51,7 +51,7 @@ RUN case "$TARGETARCH" in \
     && curl -fsSL "https://download.docker.com/linux/static/stable/${docker_arch}/docker-${DOCKER_VERSION}.tgz" \
     | tar xz --strip-components=1 -C /usr/local/bin \
         docker/docker docker/dockerd docker/containerd \
-        docker/containerd-shim-runc-v2 docker/runc \
+        docker/containerd-shim-runc-v2 docker/runc docker/docker-proxy \
     && mkdir -p /usr/local/lib/docker/cli-plugins \
     && curl -fsSL "https://github.com/docker/buildx/releases/download/${BUILDX_VERSION}/buildx-${BUILDX_VERSION}.linux-${buildx_arch}" \
       -o /usr/local/lib/docker/cli-plugins/docker-buildx \

From fa89bf1c5b4c085dce06082e19816e93049a970f Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 23:35:35 -0700
Subject: [PATCH 05/14] fix(ci): start dockerd on default socket path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using a custom socket path and DOCKER_HOST breaks the GitHub Actions
runner's internal Docker operations (it uses docker exec to run steps
inside the container). Since we removed the host socket volume mount,
/var/run/docker.sock is free inside the container — just start dockerd
on the default path with no DOCKER_HOST override needed.
---
 .github/workflows/release-canary.yml | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml
index 5d90e9f0..b231d037 100644
--- a/.github/workflows/release-canary.yml
+++ b/.github/workflows/release-canary.yml
@@ -81,16 +81,12 @@ jobs:
           # container's daemon, so 127.0.0.1 port bindings are reachable
           # directly — no --gateway-host workaround needed.
           #
-          # We listen on a dedicated socket and set DOCKER_HOST so that the
-          # openshell CLI (and bollard) connects to our DinD daemon rather
-          # than the GitHub Actions runner's injected host socket.
-          export DOCKER_HOST="unix:///var/run/dind.sock"
-          dockerd --host "$DOCKER_HOST" &>/var/log/dockerd.log &
-          echo "Waiting for Docker daemon on ${DOCKER_HOST}..."
-          timeout 30 sh -c 'until docker --host "$DOCKER_HOST" info >/dev/null 2>&1; do sleep 1; done'
+          # We start on the default socket (/var/run/docker.sock). Since we
+          # don't mount the host socket, this path is free inside the container.
+          dockerd &>/var/log/dockerd.log &
+          echo "Waiting for Docker daemon..."
+          timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
           echo "Docker daemon ready"
-          # Persist DOCKER_HOST for subsequent steps
-          echo "DOCKER_HOST=${DOCKER_HOST}" >> "$GITHUB_ENV"
 
       - name: Install CLI from GitHub Release
         run: ./install.sh

From e62cfbbb382c5f9690e13dfc8b9163b2c33bfe0b Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 23:39:04 -0700
Subject: [PATCH 06/14] fix(ci): use dedicated DinD socket with per-step
 DOCKER_HOST

The GHA runner injects its own /var/run/docker.sock into the container
for management, so dockerd can't bind to the default path. Use a
dedicated socket (/var/run/dind.sock) and set DOCKER_HOST only on
steps that need it (via step-level env) to avoid breaking the runner.
---
 .github/workflows/release-canary.yml | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml
index b231d037..5c91cf34 100644
--- a/.github/workflows/release-canary.yml
+++ b/.github/workflows/release-canary.yml
@@ -81,10 +81,12 @@ jobs:
           # container's daemon, so 127.0.0.1 port bindings are reachable
           # directly — no --gateway-host workaround needed.
           #
-          # We start on the default socket (/var/run/docker.sock). Since we
-          # don't mount the host socket, this path is free inside the container.
-          dockerd &>/var/log/dockerd.log &
+          # We use a dedicated socket because the GHA runner injects its own
+          # /var/run/docker.sock for container management. DOCKER_HOST is set
+          # per-step (not via GITHUB_ENV) to avoid breaking the runner.
+          dockerd --host unix:///var/run/dind.sock &>/var/log/dockerd.log &
           echo "Waiting for Docker daemon..."
+          export DOCKER_HOST=unix:///var/run/dind.sock
           timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
           echo "Docker daemon ready"
 
@@ -98,6 +100,8 @@ jobs:
         run: openshell --version
 
       - name: Run canary test
+        env:
+          DOCKER_HOST: unix:///var/run/dind.sock
         run: |
           set -euo pipefail
 

From 45f3fd325586d2f9770c817c57fc0eb2c76352ca Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 23:40:34 -0700
Subject: [PATCH 07/14] fix(ci): use nohup for dockerd to survive between GHA
 steps

Each GHA step runs via docker exec, and backgrounded processes can
receive SIGHUP when the step's shell exits. Run dockerd under nohup so
it ignores SIGHUP and persists across steps.
---
 .github/workflows/release-canary.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml
index 5c91cf34..91795385 100644
--- a/.github/workflows/release-canary.yml
+++ b/.github/workflows/release-canary.yml
@@ -84,7 +84,7 @@ jobs:
           # We use a dedicated socket because the GHA runner injects its own
           # /var/run/docker.sock for container management. DOCKER_HOST is set
           # per-step (not via GITHUB_ENV) to avoid breaking the runner.
-          dockerd --host unix:///var/run/dind.sock &>/var/log/dockerd.log &
+          nohup dockerd --host unix:///var/run/dind.sock &>/var/log/dockerd.log &
           echo "Waiting for Docker daemon..."
           export DOCKER_HOST=unix:///var/run/dind.sock
           timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'

From 291f48fb571394b2a334a292c831f0b8152ed378 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 23:42:02 -0700
Subject: [PATCH 08/14] fix(ci): use setsid to fully detach dockerd from GHA
 step shell

setsid creates a new session and process group, ensuring dockerd
survives when the GHA runner's docker-exec shell exits between steps.
---
 .github/workflows/release-canary.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml
index 91795385..09b766b7 100644
--- a/.github/workflows/release-canary.yml
+++ b/.github/workflows/release-canary.yml
@@ -84,11 +84,15 @@ jobs:
           # We use a dedicated socket because the GHA runner injects its own
           # /var/run/docker.sock for container management. DOCKER_HOST is set
           # per-step (not via GITHUB_ENV) to avoid breaking the runner.
-          nohup dockerd --host unix:///var/run/dind.sock &>/var/log/dockerd.log &
+          # Use setsid to fully detach dockerd from the step's process group
+          # so it survives when the GHA runner's docker-exec shell exits.
+          setsid dockerd --host unix:///var/run/dind.sock </dev/null &>/var/log/dockerd.log &
           echo "Waiting for Docker daemon..."
           export DOCKER_HOST=unix:///var/run/dind.sock
           timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
           echo "Docker daemon ready"
+          # Verify the socket file exists
+          ls -la /var/run/dind.sock
 
       - name: Install CLI from GitHub Release
         run: ./install.sh

From d3502adb4551c98203543d26a9034a68c6ccbfa5 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 23:43:23 -0700
Subject: [PATCH 09/14] fix(ci): start dockerd in the same step as canary test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Background processes started via docker-exec don't persist across GHA
steps — each step gets a fresh docker-exec invocation. Move dockerd
startup into the canary test step itself so it shares the same shell
session and stays alive for the duration of the test.
---
 .github/workflows/release-canary.yml | 34 +++++++++-------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml
index 09b766b7..8da52e89 100644
--- a/.github/workflows/release-canary.yml
+++ b/.github/workflows/release-canary.yml
@@ -73,27 +73,6 @@ jobs:
             fi
           fi
 
-      - name: Start Docker daemon
-        run: |
-          # Start our own dockerd for Docker-in-Docker. The CI container runs
-          # --privileged so the daemon can create networks and cgroups.
-          # Using DinD means the gateway container is a child of this
-          # container's daemon, so 127.0.0.1 port bindings are reachable
-          # directly — no --gateway-host workaround needed.
-          #
-          # We use a dedicated socket because the GHA runner injects its own
-          # /var/run/docker.sock for container management. DOCKER_HOST is set
-          # per-step (not via GITHUB_ENV) to avoid breaking the runner.
-          # Use setsid to fully detach dockerd from the step's process group
-          # so it survives when the GHA runner's docker-exec shell exits.
-          setsid dockerd --host unix:///var/run/dind.sock </dev/null &>/var/log/dockerd.log &
-          echo "Waiting for Docker daemon..."
-          export DOCKER_HOST=unix:///var/run/dind.sock
-          timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
-          echo "Docker daemon ready"
-          # Verify the socket file exists
-          ls -la /var/run/dind.sock
-
       - name: Install CLI from GitHub Release
         run: ./install.sh
         env:
@@ -104,11 +83,20 @@ jobs:
         run: openshell --version
 
       - name: Run canary test
-        env:
-          DOCKER_HOST: unix:///var/run/dind.sock
         run: |
           set -euo pipefail
 
+          # Start our own dockerd for Docker-in-Docker. The GHA runner injects
+          # its own /var/run/docker.sock for container management, so we use a
+          # dedicated socket. We start dockerd in the same step as the canary
+          # test because background processes don't survive across GHA steps
+          # (each step runs via a separate docker-exec invocation).
+          export DOCKER_HOST=unix:///var/run/dind.sock
+          dockerd --host "$DOCKER_HOST" &>/var/log/dockerd.log &
+          echo "Waiting for Docker daemon..."
+          timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
+          echo "Docker daemon ready"
+
           # Single-command canary: tests the full zero-to-sandbox path.
           # With no gateway configured, `sandbox create` auto-bootstraps a
           # gateway (pulls the cluster image from GHCR, starts k3s, deploys

From cb49b3c2109de33f91f30626f890439cdb6bee20 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 23:45:09 -0700
Subject: [PATCH 10/14] fix(ci): use vfs storage driver for DinD to avoid
 overlay-on-overlay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The GHA container uses overlayfs, and the inner dockerd also defaults
to overlayfs. Overlay can't be stacked, causing container creation
to fail. Use --storage-driver=vfs which copies layers instead of
layering them — slower but reliable for DinD.
---
 .github/workflows/release-canary.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml
index 8da52e89..64c3bc79 100644
--- a/.github/workflows/release-canary.yml
+++ b/.github/workflows/release-canary.yml
@@ -92,7 +92,10 @@ jobs:
           # test because background processes don't survive across GHA steps
           # (each step runs via a separate docker-exec invocation).
           export DOCKER_HOST=unix:///var/run/dind.sock
-          dockerd --host "$DOCKER_HOST" &>/var/log/dockerd.log &
+          # Use vfs storage driver to avoid overlay-on-overlay failures
+          # (the GHA container already uses overlayfs, and overlayfs can't
+          # be stacked). VFS is slower but reliable for DinD.
+          dockerd --host "$DOCKER_HOST" --storage-driver=vfs &>/var/log/dockerd.log &
           echo "Waiting for Docker daemon..."
           timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
           echo "Docker daemon ready"

From 3d9e9b5853cc767e1f6815064e229b3c91979732 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sat, 14 Mar 2026 23:57:18 -0700
Subject: [PATCH 11/14] feat(cli): support OPENSHELL_GATEWAY_HOST env var in
 auto-bootstrap

Add OPENSHELL_GATEWAY_HOST environment variable support to the sandbox
create auto-bootstrap path. This mirrors the --gateway-host flag on
`gateway start` but works for the implicit bootstrap triggered by
`sandbox create` when no gateway exists.

In CI containers using Docker-outside-of-Docker (host socket mount),
127.0.0.1 inside the CI container doesn't reach sibling gateway
containers. Setting OPENSHELL_GATEWAY_HOST=host.docker.internal fixes
this without requiring the two-step gateway-start-then-sandbox-create
workflow.

Update release canary to use the single-command path: just
`openshell sandbox create` which auto-bootstraps everything. For
workflow_dispatch (branch testing), builds CLI from source to test the
current branch code. For workflow_run (release testing), installs the
published binary.
---
 .github/workflows/release-canary.yml  | 58 +++++++++++++++------------
 crates/openshell-cli/src/bootstrap.rs | 10 +++++
 2 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml
index 64c3bc79..6f38bf9b 100644
--- a/.github/workflows/release-canary.yml
+++ b/.github/workflows/release-canary.yml
@@ -35,16 +35,14 @@ jobs:
             target: aarch64-unknown-linux-musl
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 30
-    # Run with Docker-in-Docker: the CI container starts its own dockerd so
-    # the gateway cluster container is a child process. This means 127.0.0.1
-    # port bindings are reachable directly, matching the real user experience
-    # without needing --gateway-host workarounds.
     container:
       image: ghcr.io/nvidia/openshell/ci:latest
       credentials:
         username: ${{ github.actor }}
         password: ${{ secrets.GITHUB_TOKEN }}
       options: --privileged
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
     env:
       OPENSHELL_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
@@ -73,33 +71,44 @@ jobs:
             fi
           fi
 
-      - name: Install CLI from GitHub Release
-        run: ./install.sh
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          OPENSHELL_VERSION: ${{ steps.release.outputs.tag }}
+      - name: Install CLI
+        run: |
+          # For workflow_dispatch (manual/branch testing), build from source
+          # so we test the code on this branch. For workflow_run (release
+          # testing), install the published binary.
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "Building CLI from source..."
+            cargo build --release -p openshell-cli
+            cp target/release/openshell /usr/local/bin/openshell
+          else
+            GH_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
+            OPENSHELL_VERSION="${{ steps.release.outputs.tag }}" \
+              ./install.sh
+          fi
 
       - name: Verify CLI installation
         run: openshell --version
 
+      - name: Resolve gateway host
+        run: |
+          # On Linux CI runners host.docker.internal is not set automatically
+          # (it's a Docker Desktop feature). Add it via the Docker bridge IP.
+          if ! getent hosts host.docker.internal >/dev/null 2>&1; then
+            BRIDGE_IP=$(docker network inspect bridge --format '{{(index .IPAM.Config 0).Gateway}}')
+            echo "Adding /etc/hosts entry: ${BRIDGE_IP} host.docker.internal"
+            echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts
+          fi
+
       - name: Run canary test
+        env:
+          # The CI container uses Docker-outside-of-Docker (host socket mount),
+          # so the gateway container is a sibling on the host. 127.0.0.1 inside
+          # the CI container doesn't reach it — OPENSHELL_GATEWAY_HOST tells
+          # the auto-bootstrap to advertise host.docker.internal instead.
+          OPENSHELL_GATEWAY_HOST: host.docker.internal
         run: |
           set -euo pipefail
 
-          # Start our own dockerd for Docker-in-Docker. The GHA runner injects
-          # its own /var/run/docker.sock for container management, so we use a
-          # dedicated socket. We start dockerd in the same step as the canary
-          # test because background processes don't survive across GHA steps
-          # (each step runs via a separate docker-exec invocation).
-          export DOCKER_HOST=unix:///var/run/dind.sock
-          # Use vfs storage driver to avoid overlay-on-overlay failures
-          # (the GHA container already uses overlayfs, and overlayfs can't
-          # be stacked). VFS is slower but reliable for DinD.
-          dockerd --host "$DOCKER_HOST" --storage-driver=vfs &>/var/log/dockerd.log &
-          echo "Waiting for Docker daemon..."
-          timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
-          echo "Docker daemon ready"
-
           # Single-command canary: tests the full zero-to-sandbox path.
           # With no gateway configured, `sandbox create` auto-bootstraps a
           # gateway (pulls the cluster image from GHCR, starts k3s, deploys
@@ -110,9 +119,6 @@ jobs:
             EXIT_CODE=$?
             echo "::error::openshell sandbox create failed with exit code ${EXIT_CODE}"
             echo "$OUTPUT"
-            echo ""
-            echo "--- dockerd logs ---"
-            cat /var/log/dockerd.log || true
             exit $EXIT_CODE
           }
 
diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs
index ca81404f..e237688c 100644
--- a/crates/openshell-cli/src/bootstrap.rs
+++ b/crates/openshell-cli/src/bootstrap.rs
@@ -162,6 +162,16 @@ pub async fn run_bootstrap(
     {
         options = options.with_registry_token(token);
     }
+    // Read gateway host override from environment. In CI containers that use
+    // Docker-outside-of-Docker (socket mount), 127.0.0.1 inside the CI
+    // container doesn't reach the sibling gateway container. Setting
+    // OPENSHELL_GATEWAY_HOST=host.docker.internal (or the bridge IP) fixes
+    // this. The explicit `--gateway-host` flag is only on `gateway start`.
+    if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST")
+        && !host.trim().is_empty()
+    {
+        options = options.with_gateway_host(host);
+    }
     options = options.with_gpu(gpu);
 
     let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?;

From c66ba3f731663ed3b3eac4a6895310a6ed2f2cc8 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sun, 15 Mar 2026 00:03:15 -0700
Subject: [PATCH 12/14] ci(canary): use two-step canary with gateway-host flag
 for now

Use the explicit --gateway-host flag on gateway start (works with
current published CLI) while also setting OPENSHELL_GATEWAY_HOST env
var (will be picked up once the next release ships with env var
support). Once the env var support is released, the canary can switch
to the single-command sandbox create path.
---
 .github/workflows/release-canary.yml | 37 +++++++++-------------------
 1 file changed, 12 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml
index 6f38bf9b..90251610 100644
--- a/.github/workflows/release-canary.yml
+++ b/.github/workflows/release-canary.yml
@@ -71,20 +71,11 @@ jobs:
             fi
           fi
 
-      - name: Install CLI
-        run: |
-          # For workflow_dispatch (manual/branch testing), build from source
-          # so we test the code on this branch. For workflow_run (release
-          # testing), install the published binary.
-          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
-            echo "Building CLI from source..."
-            cargo build --release -p openshell-cli
-            cp target/release/openshell /usr/local/bin/openshell
-          else
-            GH_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
-            OPENSHELL_VERSION="${{ steps.release.outputs.tag }}" \
-              ./install.sh
-          fi
+      - name: Install CLI from GitHub Release
+        run: ./install.sh
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          OPENSHELL_VERSION: ${{ steps.release.outputs.tag }}
 
       - name: Verify CLI installation
         run: openshell --version
@@ -99,22 +90,18 @@ jobs:
             echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts
           fi
 
-      - name: Run canary test
+      - name: Start gateway
         env:
-          # The CI container uses Docker-outside-of-Docker (host socket mount),
-          # so the gateway container is a sibling on the host. 127.0.0.1 inside
-          # the CI container doesn't reach it — OPENSHELL_GATEWAY_HOST tells
-          # the auto-bootstrap to advertise host.docker.internal instead.
+          # Set OPENSHELL_GATEWAY_HOST for CLIs that read it (next release
+          # onward); also pass --gateway-host so older CLIs work today.
           OPENSHELL_GATEWAY_HOST: host.docker.internal
+        run: openshell gateway start --gateway-host host.docker.internal
+
+      - name: Run canary test
         run: |
           set -euo pipefail
 
-          # Single-command canary: tests the full zero-to-sandbox path.
-          # With no gateway configured, `sandbox create` auto-bootstraps a
-          # gateway (pulls the cluster image from GHCR, starts k3s, deploys
-          # the control plane, generates mTLS PKI), then creates a sandbox
-          # and runs the command inside it.
-          echo "Creating sandbox (with auto-bootstrap) and running 'echo hello world'..."
+          echo "Creating sandbox and running 'echo hello world'..."
           OUTPUT=$(openshell sandbox create --no-keep --no-tty -- echo "hello world" 2>&1) || {
             EXIT_CODE=$?
             echo "::error::openshell sandbox create failed with exit code ${EXIT_CODE}"

From 30792c37792a20d6b012475be45c0fb1ad4581d6 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sun, 15 Mar 2026 00:20:16 -0700
Subject: [PATCH 13/14] revert: remove DinD additions from Dockerfile.ci

The canary uses DooD (host socket mount), not DinD, so the dockerd,
containerd, runc, docker-proxy, and iptables additions are unnecessary.
---
 deploy/docker/Dockerfile.ci | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/deploy/docker/Dockerfile.ci b/deploy/docker/Dockerfile.ci
index 01b55e86..55ff8032 100644
--- a/deploy/docker/Dockerfile.ci
+++ b/deploy/docker/Dockerfile.ci
@@ -17,9 +17,7 @@ ENV MISE_DATA_DIR=/opt/mise
 ENV MISE_CACHE_DIR=/opt/mise/cache
 ENV PATH="/opt/mise/shims:/root/.cargo/bin:/root/.local/bin:$PATH"
 
-# Install system dependencies.
-# iptables is required for Docker-in-Docker networking (dockerd uses it for
-# container NAT and bridge rules).
+# Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ca-certificates \
     curl \
@@ -36,22 +34,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     xz-utils \
     jq \
     rsync \
-    iptables \
     && rm -rf /var/lib/apt/lists/*
 
-# Install Docker CLI, daemon, and buildx plugin.
-# The full daemon (dockerd + containerd + runc) is needed for Docker-in-Docker
-# workflows like the release canary test, where the CI container runs its own
-# Docker daemon instead of mounting the host socket.
+# Install Docker CLI and buildx plugin used by CI jobs
 RUN case "$TARGETARCH" in \
       amd64) docker_arch=x86_64; buildx_arch=amd64 ;; \
       arm64) docker_arch=aarch64; buildx_arch=arm64 ;; \
       *) echo "Unsupported TARGETARCH: $TARGETARCH"; exit 1 ;; \
     esac \
     && curl -fsSL "https://download.docker.com/linux/static/stable/${docker_arch}/docker-${DOCKER_VERSION}.tgz" \
-    | tar xz --strip-components=1 -C /usr/local/bin \
-        docker/docker docker/dockerd docker/containerd \
-        docker/containerd-shim-runc-v2 docker/runc docker/docker-proxy \
+    | tar xz --strip-components=1 -C /usr/local/bin docker/docker \
     && mkdir -p /usr/local/lib/docker/cli-plugins \
     && curl -fsSL "https://github.com/docker/buildx/releases/download/${BUILDX_VERSION}/buildx-${BUILDX_VERSION}.linux-${buildx_arch}" \
       -o /usr/local/lib/docker/cli-plugins/docker-buildx \

From c4f9475824eafce57ae74fc874301c424baa3940 Mon Sep 17 00:00:00 2001
From: Drew Newberry <anewberry@nvidia.com>
Date: Sun, 15 Mar 2026 00:30:02 -0700
Subject: [PATCH 14/14] docs: broaden gateway-host descriptions beyond CI use
 case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gateway host override is useful in any environment where the
client can't reach the Docker host at 127.0.0.1 — CI containers,
WSL, remote Docker hosts, etc. Update the CLI help text, DeployOptions
doc comment, and bootstrap env var comment to reflect this.
---
 crates/openshell-bootstrap/src/lib.rs |  4 ++--
 crates/openshell-cli/src/bootstrap.rs | 10 +++++-----
 crates/openshell-cli/src/main.rs      |  9 +++++----
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs
index bf6599b4..2da77f47 100644
--- a/crates/openshell-bootstrap/src/lib.rs
+++ b/crates/openshell-bootstrap/src/lib.rs
@@ -92,8 +92,8 @@ pub struct DeployOptions {
     /// Override the gateway host advertised in cluster metadata and passed to
     /// the server. When set, the metadata will use this host instead of
     /// `127.0.0.1` and the container will receive `SSH_GATEWAY_HOST`.
-    /// Useful in CI where `127.0.0.1` is not reachable from the test runner
-    /// (e.g., `host.docker.internal`).
+    /// Needed whenever the client cannot reach the Docker host at
+    /// `127.0.0.1` — CI containers, WSL, remote Docker hosts, etc.
     pub gateway_host: Option<String>,
     /// Disable TLS entirely — the server listens on plaintext HTTP.
     pub disable_tls: bool,
diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs
index e237688c..294995f1 100644
--- a/crates/openshell-cli/src/bootstrap.rs
+++ b/crates/openshell-cli/src/bootstrap.rs
@@ -162,11 +162,11 @@ pub async fn run_bootstrap(
     {
         options = options.with_registry_token(token);
     }
-    // Read gateway host override from environment. In CI containers that use
-    // Docker-outside-of-Docker (socket mount), 127.0.0.1 inside the CI
-    // container doesn't reach the sibling gateway container. Setting
-    // OPENSHELL_GATEWAY_HOST=host.docker.internal (or the bridge IP) fixes
-    // this. The explicit `--gateway-host` flag is only on `gateway start`.
+    // Read gateway host override from environment. Needed whenever the
+    // client cannot reach the Docker host at 127.0.0.1 — CI containers,
+    // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag
+    // is only on `gateway start`; this env var covers the auto-bootstrap
+    // path triggered by `sandbox create`.
     if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST")
         && !host.trim().is_empty()
     {
diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs
index 7c379c2b..0099e477 100644
--- a/crates/openshell-cli/src/main.rs
+++ b/crates/openshell-cli/src/main.rs
@@ -731,10 +731,11 @@ enum GatewayCommands {
 
         /// Override the gateway host written into cluster metadata.
         ///
-        /// By default, local clusters advertise 127.0.0.1. In environments
-        /// where the test runner cannot reach 127.0.0.1 on the Docker host
-        /// (e.g., CI containers), set this to a reachable hostname such as
-        /// `host.docker.internal`.
+        /// By default, local clusters advertise 127.0.0.1. Set this when
+        /// the client cannot reach the Docker host at 127.0.0.1 — for
+        /// example in CI containers, WSL, or when Docker runs on a
+        /// remote host. Common values: `host.docker.internal`, a LAN IP,
+        /// or a hostname.
         #[arg(long)]
         gateway_host: Option<String>,