From 0033d9757a483df9afba0b49a59e8be69ba02400 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 22:58:34 -0700 Subject: [PATCH 01/14] wip --- .github/workflows/release-dev.yml | 2 +- .github/workflows/release-tag.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-dev.yml b/.github/workflows/release-dev.yml index ff51540e..61a545d5 100644 --- a/.github/workflows/release-dev.yml +++ b/.github/workflows/release-dev.yml @@ -70,7 +70,7 @@ jobs: tag-ghcr-dev: name: Tag GHCR Images as Dev - needs: [build-gateway, build-cluster, e2e] + needs: [build-gateway, build-cluster] runs-on: build-amd64 timeout-minutes: 10 steps: diff --git a/.github/workflows/release-tag.yml b/.github/workflows/release-tag.yml index cda74d1e..84a66818 100644 --- a/.github/workflows/release-tag.yml +++ b/.github/workflows/release-tag.yml @@ -177,7 +177,7 @@ jobs: publish-python: name: Publish Python - needs: [build-python-wheels] + needs: [build-python-wheels, e2e] runs-on: [self-hosted, nv] timeout-minutes: 10 env: From 364965a6cc7289a938ea63fa3a2e0f66dbe7756d Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 23:01:13 -0700 Subject: [PATCH 02/14] ci(release): gate python wheels on e2e for tagged releases - Add e2e to publish-python needs in release-tag.yml so wheels are not published to Artifactory until e2e passes - Remove e2e gate from tag-ghcr-dev in release-dev.yml since dev Docker images do not need to wait for e2e - Replace gitlab-master.nvidia.com references with generic example host in policy-advisor CTF example --- examples/policy-advisor/README.md | 4 ++-- examples/policy-advisor/ctf.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/policy-advisor/README.md b/examples/policy-advisor/README.md index 9dbfcb0b..a90f0a72 100644 --- a/examples/policy-advisor/README.md +++ b/examples/policy-advisor/README.md @@ -47,7 +47,7 @@ the script progresses through each gate. 
| 4 | The Oracle | `api.github.com:443` | Concurrent with 5 and 6 | | 5 | The Jester | `icanhazdadjoke.com:443` | Concurrent with 4 and 6 | | 6 | The Sphinx | `catfact.ninja:443` | Concurrent with 4 and 5 | -| 7 | The Vault | `gitlab-master.nvidia.com:443` | Internal IP -- mapper adds `allowed_ips` | +| 7 | The Vault | `internal.corp.example.com:443` | Internal IP -- mapper adds `allowed_ips` | Gates 1-3 run sequentially so you can observe the single-approval flow. Gate 3 uses `curl` to hit `ifconfig.me:80` -- a different endpoint that only @@ -123,7 +123,7 @@ Gate 3 uses `curl` to reach `ifconfig.me:80`. You'll see a new rule for When Gates 4-6 start, all three denials arrive together. Press `A` to approve all pending recommendations at once. -Gate 7 requires `allowed_ips` because `gitlab-master.nvidia.com` resolves to a +Gate 7 requires `allowed_ips` because `internal.corp.example.com` resolves to a private IP. The mapper detects this automatically and includes the resolved IPs in the proposed rule. diff --git a/examples/policy-advisor/ctf.py b/examples/policy-advisor/ctf.py index d62c4504..5974268f 100644 --- a/examples/policy-advisor/ctf.py +++ b/examples/policy-advisor/ctf.py @@ -155,9 +155,9 @@ def log(level: str, msg: str, **kv: object) -> None: { "num": 7, "name": "The Vault", - "host": "gitlab-master.nvidia.com", + "host": "internal.corp.example.com", "port": 443, - "url": "https://gitlab-master.nvidia.com/", + "url": "https://internal.corp.example.com/", "method": "GET", "headers": {"User-Agent": "openshell-ctf"}, "body": None, From c33d7dc81b55045d4fad153d753c72b2cf71273e Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 23:25:46 -0700 Subject: [PATCH 03/14] ci(canary): switch release canary to Docker-in-Docker with single-command test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch the release canary from Docker-outside-of-Docker (host socket mount) to true Docker-in-Docker. 
The CI container now starts its own dockerd, so the gateway cluster container is a child process and 127.0.0.1 port bindings are reachable directly. This enables testing the real zero-to-sandbox user path: a single `openshell sandbox create` that auto-bootstraps the gateway, pulls the cluster image, and creates a sandbox — no --gateway-host workaround. Dockerfile.ci changes: - Add iptables (required by dockerd for container networking) - Extract full Docker daemon suite (dockerd, containerd, runc) instead of CLI only release-canary.yml changes: - Remove /var/run/docker.sock volume mount - Add dockerd startup step - Remove gateway host resolution and explicit gateway start steps - Simplify canary to single auto-bootstrap sandbox create command --- .github/workflows/release-canary.yml | 41 +++++++++++++++++----------- deploy/docker/Dockerfile.ci | 14 ++++++++-- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 76b0d317..284e5fe2 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -35,14 +35,16 @@ jobs: target: aarch64-unknown-linux-musl runs-on: ${{ matrix.runner }} timeout-minutes: 30 + # Run with Docker-in-Docker: the CI container starts its own dockerd so + # the gateway cluster container is a child process. This means 127.0.0.1 + # port bindings are reachable directly, matching the real user experience + # without needing --gateway-host workarounds. container: image: ghcr.io/nvidia/openshell/ci:latest credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --privileged - volumes: - - /var/run/docker.sock:/var/run/docker.sock env: OPENSHELL_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: @@ -71,6 +73,18 @@ jobs: fi fi + - name: Start Docker daemon + run: | + # Start dockerd for Docker-in-Docker. The CI container runs + # --privileged so the daemon can create networks and cgroups. 
+ # Using DinD means the gateway container is a child of this + # container's daemon, so 127.0.0.1 port bindings are reachable + # directly — no --gateway-host workaround needed. + dockerd &>/var/log/dockerd.log & + echo "Waiting for Docker daemon..." + timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' + echo "Docker daemon ready" + - name: Install CLI from GitHub Release run: ./install.sh env: @@ -80,28 +94,23 @@ jobs: - name: Verify CLI installation run: openshell --version - - name: Resolve gateway host - run: | - # On Linux CI runners host.docker.internal is not set automatically - # (it's a Docker Desktop feature). Add it via the Docker bridge IP. - if ! getent hosts host.docker.internal >/dev/null 2>&1; then - BRIDGE_IP=$(docker network inspect bridge --format '{{(index .IPAM.Config 0).Gateway}}') - echo "Adding /etc/hosts entry: ${BRIDGE_IP} host.docker.internal" - echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts - fi - - - name: Start gateway - run: openshell gateway start --gateway-host host.docker.internal - - name: Run canary test run: | set -euo pipefail - echo "Creating sandbox and running 'echo hello world'..." + # Single-command canary: tests the full zero-to-sandbox path. + # With no gateway configured, `sandbox create` auto-bootstraps a + # gateway (pulls the cluster image from GHCR, starts k3s, deploys + # the control plane, generates mTLS PKI), then creates a sandbox + # and runs the command inside it. + echo "Creating sandbox (with auto-bootstrap) and running 'echo hello world'..." OUTPUT=$(openshell sandbox create --no-keep --no-tty -- echo "hello world" 2>&1) || { EXIT_CODE=$? 
echo "::error::openshell sandbox create failed with exit code ${EXIT_CODE}" echo "$OUTPUT" + echo "" + echo "--- dockerd logs ---" + cat /var/log/dockerd.log || true exit $EXIT_CODE } diff --git a/deploy/docker/Dockerfile.ci b/deploy/docker/Dockerfile.ci index 55ff8032..3ce19d8a 100644 --- a/deploy/docker/Dockerfile.ci +++ b/deploy/docker/Dockerfile.ci @@ -17,7 +17,9 @@ ENV MISE_DATA_DIR=/opt/mise ENV MISE_CACHE_DIR=/opt/mise/cache ENV PATH="/opt/mise/shims:/root/.cargo/bin:/root/.local/bin:$PATH" -# Install system dependencies +# Install system dependencies. +# iptables is required for Docker-in-Docker networking (dockerd uses it for +# container NAT and bridge rules). RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ curl \ @@ -34,16 +36,22 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ xz-utils \ jq \ rsync \ + iptables \ && rm -rf /var/lib/apt/lists/* -# Install Docker CLI and buildx plugin used by CI jobs +# Install Docker CLI, daemon, and buildx plugin. +# The full daemon (dockerd + containerd + runc) is needed for Docker-in-Docker +# workflows like the release canary test, where the CI container runs its own +# Docker daemon instead of mounting the host socket. 
RUN case "$TARGETARCH" in \ amd64) docker_arch=x86_64; buildx_arch=amd64 ;; \ arm64) docker_arch=aarch64; buildx_arch=arm64 ;; \ *) echo "Unsupported TARGETARCH: $TARGETARCH"; exit 1 ;; \ esac \ && curl -fsSL "https://download.docker.com/linux/static/stable/${docker_arch}/docker-${DOCKER_VERSION}.tgz" \ - | tar xz --strip-components=1 -C /usr/local/bin docker/docker \ + | tar xz --strip-components=1 -C /usr/local/bin \ + docker/docker docker/dockerd docker/containerd \ + docker/containerd-shim-runc-v2 docker/runc \ && mkdir -p /usr/local/lib/docker/cli-plugins \ && curl -fsSL "https://github.com/docker/buildx/releases/download/${BUILDX_VERSION}/buildx-${BUILDX_VERSION}.linux-${buildx_arch}" \ -o /usr/local/lib/docker/cli-plugins/docker-buildx \ From 6cf4c4447a5ba951a800f2a7f11d2ae365307bd6 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 23:32:07 -0700 Subject: [PATCH 04/14] fix(ci): add docker-proxy binary and use dedicated DinD socket The first canary run revealed two issues: 1. dockerd failed to start because docker-proxy was not extracted from the Docker static binary tarball. Add it to the extraction list. 2. The GitHub Actions runner injects its own Docker socket into job containers. Without an explicit DOCKER_HOST, the openshell CLI connected to the runner's host Docker daemon instead of our DinD daemon. Start dockerd on a dedicated socket (/var/run/dind.sock) and export DOCKER_HOST so all subsequent steps use it. --- .github/workflows/release-canary.yml | 15 +++++++++++---- deploy/docker/Dockerfile.ci | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 284e5fe2..5d90e9f0 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -75,15 +75,22 @@ jobs: - name: Start Docker daemon run: | - # Start dockerd for Docker-in-Docker. The CI container runs + # Start our own dockerd for Docker-in-Docker. 
The CI container runs # --privileged so the daemon can create networks and cgroups. # Using DinD means the gateway container is a child of this # container's daemon, so 127.0.0.1 port bindings are reachable # directly — no --gateway-host workaround needed. - dockerd &>/var/log/dockerd.log & - echo "Waiting for Docker daemon..." - timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' + # + # We listen on a dedicated socket and set DOCKER_HOST so that the + # openshell CLI (and bollard) connects to our DinD daemon rather + # than the GitHub Actions runner's injected host socket. + export DOCKER_HOST="unix:///var/run/dind.sock" + dockerd --host "$DOCKER_HOST" &>/var/log/dockerd.log & + echo "Waiting for Docker daemon on ${DOCKER_HOST}..." + timeout 30 sh -c 'until docker --host "$DOCKER_HOST" info >/dev/null 2>&1; do sleep 1; done' echo "Docker daemon ready" + # Persist DOCKER_HOST for subsequent steps + echo "DOCKER_HOST=${DOCKER_HOST}" >> "$GITHUB_ENV" - name: Install CLI from GitHub Release run: ./install.sh diff --git a/deploy/docker/Dockerfile.ci b/deploy/docker/Dockerfile.ci index 3ce19d8a..01b55e86 100644 --- a/deploy/docker/Dockerfile.ci +++ b/deploy/docker/Dockerfile.ci @@ -51,7 +51,7 @@ RUN case "$TARGETARCH" in \ && curl -fsSL "https://download.docker.com/linux/static/stable/${docker_arch}/docker-${DOCKER_VERSION}.tgz" \ | tar xz --strip-components=1 -C /usr/local/bin \ docker/docker docker/dockerd docker/containerd \ - docker/containerd-shim-runc-v2 docker/runc \ + docker/containerd-shim-runc-v2 docker/runc docker/docker-proxy \ && mkdir -p /usr/local/lib/docker/cli-plugins \ && curl -fsSL "https://github.com/docker/buildx/releases/download/${BUILDX_VERSION}/buildx-${BUILDX_VERSION}.linux-${buildx_arch}" \ -o /usr/local/lib/docker/cli-plugins/docker-buildx \ From fa89bf1c5b4c085dce06082e19816e93049a970f Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 23:35:35 -0700 Subject: [PATCH 05/14] fix(ci): start dockerd on 
default socket path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using a custom socket path and DOCKER_HOST breaks the GitHub Actions runner's internal Docker operations (it uses docker exec to run steps inside the container). Since we removed the host socket volume mount, /var/run/docker.sock is free inside the container — just start dockerd on the default path with no DOCKER_HOST override needed. --- .github/workflows/release-canary.yml | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 5d90e9f0..b231d037 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -81,16 +81,12 @@ jobs: # container's daemon, so 127.0.0.1 port bindings are reachable # directly — no --gateway-host workaround needed. # - # We listen on a dedicated socket and set DOCKER_HOST so that the - # openshell CLI (and bollard) connects to our DinD daemon rather - # than the GitHub Actions runner's injected host socket. - export DOCKER_HOST="unix:///var/run/dind.sock" - dockerd --host "$DOCKER_HOST" &>/var/log/dockerd.log & - echo "Waiting for Docker daemon on ${DOCKER_HOST}..." - timeout 30 sh -c 'until docker --host "$DOCKER_HOST" info >/dev/null 2>&1; do sleep 1; done' + # We start on the default socket (/var/run/docker.sock). Since we + # don't mount the host socket, this path is free inside the container. + dockerd &>/var/log/dockerd.log & + echo "Waiting for Docker daemon..." 
+ timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' echo "Docker daemon ready" - # Persist DOCKER_HOST for subsequent steps - echo "DOCKER_HOST=${DOCKER_HOST}" >> "$GITHUB_ENV" - name: Install CLI from GitHub Release run: ./install.sh From e62cfbbb382c5f9690e13dfc8b9163b2c33bfe0b Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 23:39:04 -0700 Subject: [PATCH 06/14] fix(ci): use dedicated DinD socket with per-step DOCKER_HOST The GHA runner injects its own /var/run/docker.sock into the container for management, so dockerd can't bind to the default path. Use a dedicated socket (/var/run/dind.sock) and set DOCKER_HOST only on steps that need it (via step-level env) to avoid breaking the runner. --- .github/workflows/release-canary.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index b231d037..5c91cf34 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -81,10 +81,12 @@ jobs: # container's daemon, so 127.0.0.1 port bindings are reachable # directly — no --gateway-host workaround needed. # - # We start on the default socket (/var/run/docker.sock). Since we - # don't mount the host socket, this path is free inside the container. - dockerd &>/var/log/dockerd.log & + # We use a dedicated socket because the GHA runner injects its own + # /var/run/docker.sock for container management. DOCKER_HOST is set + # per-step (not via GITHUB_ENV) to avoid breaking the runner. + dockerd --host unix:///var/run/dind.sock &>/var/log/dockerd.log & echo "Waiting for Docker daemon..." 
+ export DOCKER_HOST=unix:///var/run/dind.sock timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' echo "Docker daemon ready" @@ -98,6 +100,8 @@ jobs: run: openshell --version - name: Run canary test + env: + DOCKER_HOST: unix:///var/run/dind.sock run: | set -euo pipefail From 45f3fd325586d2f9770c817c57fc0eb2c76352ca Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 23:40:34 -0700 Subject: [PATCH 07/14] fix(ci): use nohup for dockerd to survive between GHA steps Each GHA step runs via docker exec which sends SIGHUP to backgrounded processes when the shell exits. Use nohup to detach dockerd from the step's process group so it persists across steps. --- .github/workflows/release-canary.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 5c91cf34..91795385 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -84,7 +84,7 @@ jobs: # We use a dedicated socket because the GHA runner injects its own # /var/run/docker.sock for container management. DOCKER_HOST is set # per-step (not via GITHUB_ENV) to avoid breaking the runner. - dockerd --host unix:///var/run/dind.sock &>/var/log/dockerd.log & + nohup dockerd --host unix:///var/run/dind.sock &>/var/log/dockerd.log & echo "Waiting for Docker daemon..." export DOCKER_HOST=unix:///var/run/dind.sock timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' From 291f48fb571394b2a334a292c831f0b8152ed378 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 23:42:02 -0700 Subject: [PATCH 08/14] fix(ci): use setsid to fully detach dockerd from GHA step shell setsid creates a new session and process group, ensuring dockerd survives when the GHA runner's docker-exec shell exits between steps. 
--- .github/workflows/release-canary.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 91795385..09b766b7 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -84,11 +84,15 @@ jobs: # We use a dedicated socket because the GHA runner injects its own # /var/run/docker.sock for container management. DOCKER_HOST is set # per-step (not via GITHUB_ENV) to avoid breaking the runner. - nohup dockerd --host unix:///var/run/dind.sock &>/var/log/dockerd.log & + # Use setsid to fully detach dockerd from the step's process group + # so it survives when the GHA runner's docker-exec shell exits. + setsid dockerd --host unix:///var/run/dind.sock </dev/null &>/var/log/dockerd.log & echo "Waiting for Docker daemon..." export DOCKER_HOST=unix:///var/run/dind.sock timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' echo "Docker daemon ready" + # Verify the socket file exists + ls -la /var/run/dind.sock - name: Install CLI from GitHub Release run: ./install.sh From d3502adb4551c98203543d26a9034a68c6ccbfa5 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 23:43:23 -0700 Subject: [PATCH 09/14] fix(ci): start dockerd in the same step as canary test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background processes started via docker-exec don't persist across GHA steps — each step gets a fresh docker-exec invocation. Move dockerd startup into the canary test step itself so it shares the same shell session and stays alive for the duration of the test. 
--- .github/workflows/release-canary.yml | 34 +++++++++------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 09b766b7..8da52e89 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -73,27 +73,6 @@ jobs: fi fi - - name: Start Docker daemon - run: | - # Start our own dockerd for Docker-in-Docker. The CI container runs - # --privileged so the daemon can create networks and cgroups. - # Using DinD means the gateway container is a child of this - # container's daemon, so 127.0.0.1 port bindings are reachable - # directly — no --gateway-host workaround needed. - # - # We use a dedicated socket because the GHA runner injects its own - # /var/run/docker.sock for container management. DOCKER_HOST is set - # per-step (not via GITHUB_ENV) to avoid breaking the runner. - # Use setsid to fully detach dockerd from the step's process group - # so it survives when the GHA runner's docker-exec shell exits. - setsid dockerd --host unix:///var/run/dind.sock </dev/null &>/var/log/dockerd.log & - echo "Waiting for Docker daemon..." - export DOCKER_HOST=unix:///var/run/dind.sock - timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' - echo "Docker daemon ready" - # Verify the socket file exists - ls -la /var/run/dind.sock - - name: Install CLI from GitHub Release run: ./install.sh env: @@ -104,11 +83,20 @@ jobs: run: openshell --version - name: Run canary test - env: - DOCKER_HOST: unix:///var/run/dind.sock run: | set -euo pipefail + # Start our own dockerd for Docker-in-Docker. The GHA runner injects + # its own /var/run/docker.sock for container management, so we use a + # dedicated socket. We start dockerd in the same step as the canary + # test because background processes don't survive across GHA steps + # (each step runs via a separate docker-exec invocation). 
+ export DOCKER_HOST=unix:///var/run/dind.sock + dockerd --host "$DOCKER_HOST" &>/var/log/dockerd.log & + echo "Waiting for Docker daemon..." + timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' + echo "Docker daemon ready" + # Single-command canary: tests the full zero-to-sandbox path. # With no gateway configured, `sandbox create` auto-bootstraps a # gateway (pulls the cluster image from GHCR, starts k3s, deploys From cb49b3c2109de33f91f30626f890439cdb6bee20 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 23:45:09 -0700 Subject: [PATCH 10/14] fix(ci): use vfs storage driver for DinD to avoid overlay-on-overlay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GHA container uses overlayfs, and the inner dockerd also defaults to overlayfs. Overlay can't be stacked, causing container creation to fail. Use --storage-driver=vfs which copies layers instead of layering them — slower but reliable for DinD. --- .github/workflows/release-canary.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 8da52e89..64c3bc79 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -92,7 +92,10 @@ jobs: # test because background processes don't survive across GHA steps # (each step runs via a separate docker-exec invocation). export DOCKER_HOST=unix:///var/run/dind.sock - dockerd --host "$DOCKER_HOST" &>/var/log/dockerd.log & + # Use vfs storage driver to avoid overlay-on-overlay failures + # (the GHA container already uses overlayfs, and overlayfs can't + # be stacked). VFS is slower but reliable for DinD. + dockerd --host "$DOCKER_HOST" --storage-driver=vfs &>/var/log/dockerd.log & echo "Waiting for Docker daemon..." 
timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' echo "Docker daemon ready" From 3d9e9b5853cc767e1f6815064e229b3c91979732 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 23:57:18 -0700 Subject: [PATCH 11/14] feat(cli): support OPENSHELL_GATEWAY_HOST env var in auto-bootstrap Add OPENSHELL_GATEWAY_HOST environment variable support to the sandbox create auto-bootstrap path. This mirrors the --gateway-host flag on `gateway start` but works for the implicit bootstrap triggered by `sandbox create` when no gateway exists. In CI containers using Docker-outside-of-Docker (host socket mount), 127.0.0.1 inside the CI container doesn't reach sibling gateway containers. Setting OPENSHELL_GATEWAY_HOST=host.docker.internal fixes this without requiring the two-step gateway-start-then-sandbox-create workflow. Update release canary to use the single-command path: just `openshell sandbox create` which auto-bootstraps everything. For workflow_dispatch (branch testing), builds CLI from source to test the current branch code. For workflow_run (release testing), installs the published binary. --- .github/workflows/release-canary.yml | 58 +++++++++++++++------------ crates/openshell-cli/src/bootstrap.rs | 10 +++++ 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 64c3bc79..6f38bf9b 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -35,16 +35,14 @@ jobs: target: aarch64-unknown-linux-musl runs-on: ${{ matrix.runner }} timeout-minutes: 30 - # Run with Docker-in-Docker: the CI container starts its own dockerd so - # the gateway cluster container is a child process. This means 127.0.0.1 - # port bindings are reachable directly, matching the real user experience - # without needing --gateway-host workarounds. 
container: image: ghcr.io/nvidia/openshell/ci:latest credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} options: --privileged + volumes: + - /var/run/docker.sock:/var/run/docker.sock env: OPENSHELL_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: @@ -73,33 +71,44 @@ jobs: fi fi - - name: Install CLI from GitHub Release - run: ./install.sh - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - OPENSHELL_VERSION: ${{ steps.release.outputs.tag }} + - name: Install CLI + run: | + # For workflow_dispatch (manual/branch testing), build from source + # so we test the code on this branch. For workflow_run (release + # testing), install the published binary. + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "Building CLI from source..." + cargo build --release -p openshell-cli + cp target/release/openshell /usr/local/bin/openshell + else + GH_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ + OPENSHELL_VERSION="${{ steps.release.outputs.tag }}" \ + ./install.sh + fi - name: Verify CLI installation run: openshell --version + - name: Resolve gateway host + run: | + # On Linux CI runners host.docker.internal is not set automatically + # (it's a Docker Desktop feature). Add it via the Docker bridge IP. + if ! getent hosts host.docker.internal >/dev/null 2>&1; then + BRIDGE_IP=$(docker network inspect bridge --format '{{(index .IPAM.Config 0).Gateway}}') + echo "Adding /etc/hosts entry: ${BRIDGE_IP} host.docker.internal" + echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts + fi + - name: Run canary test + env: + # The CI container uses Docker-outside-of-Docker (host socket mount), + # so the gateway container is a sibling on the host. 127.0.0.1 inside + # the CI container doesn't reach it — OPENSHELL_GATEWAY_HOST tells + # the auto-bootstrap to advertise host.docker.internal instead. + OPENSHELL_GATEWAY_HOST: host.docker.internal run: | set -euo pipefail - # Start our own dockerd for Docker-in-Docker. 
The GHA runner injects - # its own /var/run/docker.sock for container management, so we use a - # dedicated socket. We start dockerd in the same step as the canary - # test because background processes don't survive across GHA steps - # (each step runs via a separate docker-exec invocation). - export DOCKER_HOST=unix:///var/run/dind.sock - # Use vfs storage driver to avoid overlay-on-overlay failures - # (the GHA container already uses overlayfs, and overlayfs can't - # be stacked). VFS is slower but reliable for DinD. - dockerd --host "$DOCKER_HOST" --storage-driver=vfs &>/var/log/dockerd.log & - echo "Waiting for Docker daemon..." - timeout 30 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' - echo "Docker daemon ready" - # Single-command canary: tests the full zero-to-sandbox path. # With no gateway configured, `sandbox create` auto-bootstraps a # gateway (pulls the cluster image from GHCR, starts k3s, deploys @@ -110,9 +119,6 @@ jobs: EXIT_CODE=$? echo "::error::openshell sandbox create failed with exit code ${EXIT_CODE}" echo "$OUTPUT" - echo "" - echo "--- dockerd logs ---" - cat /var/log/dockerd.log || true exit $EXIT_CODE } diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index ca81404f..e237688c 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -162,6 +162,16 @@ pub async fn run_bootstrap( { options = options.with_registry_token(token); } + // Read gateway host override from environment. In CI containers that use + // Docker-outside-of-Docker (socket mount), 127.0.0.1 inside the CI + // container doesn't reach the sibling gateway container. Setting + // OPENSHELL_GATEWAY_HOST=host.docker.internal (or the bridge IP) fixes + // this. The explicit `--gateway-host` flag is only on `gateway start`. 
+ if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") + && !host.trim().is_empty() + { + options = options.with_gateway_host(host); + } options = options.with_gpu(gpu); let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?; From c66ba3f731663ed3b3eac4a6895310a6ed2f2cc8 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 15 Mar 2026 00:03:15 -0700 Subject: [PATCH 12/14] ci(canary): use two-step canary with gateway-host flag for now Use the explicit --gateway-host flag on gateway start (works with current published CLI) while also setting OPENSHELL_GATEWAY_HOST env var (will be picked up once the next release ships with env var support). Once the env var support is released, the canary can switch to the single-command sandbox create path. --- .github/workflows/release-canary.yml | 37 +++++++++------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 6f38bf9b..90251610 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -71,20 +71,11 @@ jobs: fi fi - - name: Install CLI - run: | - # For workflow_dispatch (manual/branch testing), build from source - # so we test the code on this branch. For workflow_run (release - # testing), install the published binary. - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - echo "Building CLI from source..." 
- cargo build --release -p openshell-cli - cp target/release/openshell /usr/local/bin/openshell - else - GH_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ - OPENSHELL_VERSION="${{ steps.release.outputs.tag }}" \ - ./install.sh - fi + - name: Install CLI from GitHub Release + run: ./install.sh + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + OPENSHELL_VERSION: ${{ steps.release.outputs.tag }} - name: Verify CLI installation run: openshell --version @@ -99,22 +90,18 @@ jobs: echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts fi - - name: Run canary test + - name: Start gateway env: - # The CI container uses Docker-outside-of-Docker (host socket mount), - # so the gateway container is a sibling on the host. 127.0.0.1 inside - # the CI container doesn't reach it — OPENSHELL_GATEWAY_HOST tells - # the auto-bootstrap to advertise host.docker.internal instead. + # Use OPENSHELL_GATEWAY_HOST when supported (CLI >= next release), + # fall back to the explicit --gateway-host flag for older CLIs. OPENSHELL_GATEWAY_HOST: host.docker.internal + run: openshell gateway start --gateway-host host.docker.internal + + - name: Run canary test run: | set -euo pipefail - # Single-command canary: tests the full zero-to-sandbox path. - # With no gateway configured, `sandbox create` auto-bootstraps a - # gateway (pulls the cluster image from GHCR, starts k3s, deploys - # the control plane, generates mTLS PKI), then creates a sandbox - # and runs the command inside it. - echo "Creating sandbox (with auto-bootstrap) and running 'echo hello world'..." + echo "Creating sandbox and running 'echo hello world'..." OUTPUT=$(openshell sandbox create --no-keep --no-tty -- echo "hello world" 2>&1) || { EXIT_CODE=$? 
echo "::error::openshell sandbox create failed with exit code ${EXIT_CODE}" From 30792c37792a20d6b012475be45c0fb1ad4581d6 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 15 Mar 2026 00:20:16 -0700 Subject: [PATCH 13/14] revert: remove DinD additions from Dockerfile.ci The canary uses DooD (host socket mount), not DinD, so the dockerd, containerd, runc, docker-proxy, and iptables additions are unnecessary. --- deploy/docker/Dockerfile.ci | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/deploy/docker/Dockerfile.ci b/deploy/docker/Dockerfile.ci index 01b55e86..55ff8032 100644 --- a/deploy/docker/Dockerfile.ci +++ b/deploy/docker/Dockerfile.ci @@ -17,9 +17,7 @@ ENV MISE_DATA_DIR=/opt/mise ENV MISE_CACHE_DIR=/opt/mise/cache ENV PATH="/opt/mise/shims:/root/.cargo/bin:/root/.local/bin:$PATH" -# Install system dependencies. -# iptables is required for Docker-in-Docker networking (dockerd uses it for -# container NAT and bridge rules). +# Install system dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ curl \ @@ -36,22 +34,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ xz-utils \ jq \ rsync \ - iptables \ && rm -rf /var/lib/apt/lists/* -# Install Docker CLI, daemon, and buildx plugin. -# The full daemon (dockerd + containerd + runc) is needed for Docker-in-Docker -# workflows like the release canary test, where the CI container runs its own -# Docker daemon instead of mounting the host socket. 
+# Install Docker CLI and buildx plugin used by CI jobs RUN case "$TARGETARCH" in \ amd64) docker_arch=x86_64; buildx_arch=amd64 ;; \ arm64) docker_arch=aarch64; buildx_arch=arm64 ;; \ *) echo "Unsupported TARGETARCH: $TARGETARCH"; exit 1 ;; \ esac \ && curl -fsSL "https://download.docker.com/linux/static/stable/${docker_arch}/docker-${DOCKER_VERSION}.tgz" \ - | tar xz --strip-components=1 -C /usr/local/bin \ - docker/docker docker/dockerd docker/containerd \ - docker/containerd-shim-runc-v2 docker/runc docker/docker-proxy \ + | tar xz --strip-components=1 -C /usr/local/bin docker/docker \ && mkdir -p /usr/local/lib/docker/cli-plugins \ && curl -fsSL "https://github.com/docker/buildx/releases/download/${BUILDX_VERSION}/buildx-${BUILDX_VERSION}.linux-${buildx_arch}" \ -o /usr/local/lib/docker/cli-plugins/docker-buildx \ From c4f9475824eafce57ae74fc874301c424baa3940 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 15 Mar 2026 00:30:02 -0700 Subject: [PATCH 14/14] docs: broaden gateway-host descriptions beyond CI use case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gateway host override is useful in any environment where the client can't reach the Docker host at 127.0.0.1 — CI containers, WSL, remote Docker hosts, etc. Update the CLI help text, DeployOptions doc comment, and bootstrap env var comment to reflect this. --- crates/openshell-bootstrap/src/lib.rs | 4 ++-- crates/openshell-cli/src/bootstrap.rs | 10 +++++----- crates/openshell-cli/src/main.rs | 9 +++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index bf6599b4..2da77f47 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -92,8 +92,8 @@ pub struct DeployOptions { /// Override the gateway host advertised in cluster metadata and passed to /// the server. 
When set, the metadata will use this host instead of /// `127.0.0.1` and the container will receive `SSH_GATEWAY_HOST`. - /// Useful in CI where `127.0.0.1` is not reachable from the test runner - /// (e.g., `host.docker.internal`). + /// Needed whenever the client cannot reach the Docker host at 127.0.0.1 + /// — CI containers, WSL, remote Docker hosts, etc. pub gateway_host: Option<String>, /// Disable TLS entirely — the server listens on plaintext HTTP. pub disable_tls: bool, diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index e237688c..294995f1 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -162,11 +162,11 @@ pub async fn run_bootstrap( { options = options.with_registry_token(token); } - // Read gateway host override from environment. In CI containers that use - // Docker-outside-of-Docker (socket mount), 127.0.0.1 inside the CI - // container doesn't reach the sibling gateway container. Setting - // OPENSHELL_GATEWAY_HOST=host.docker.internal (or the bridge IP) fixes - // this. The explicit `--gateway-host` flag is only on `gateway start`. + // Read gateway host override from environment. Needed whenever the + // client cannot reach the Docker host at 127.0.0.1 — CI containers, + // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag + // is only on `gateway start`; this env var covers the auto-bootstrap + // path triggered by `sandbox create`. if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") && !host.trim().is_empty() { diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 7c379c2b..0099e477 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -731,10 +731,11 @@ enum GatewayCommands { /// Override the gateway host written into cluster metadata. /// - /// By default, local clusters advertise 127.0.0.1.
In environments - /// where the test runner cannot reach 127.0.0.1 on the Docker host - /// (e.g., CI containers), set this to a reachable hostname such as - /// `host.docker.internal`. + /// By default, local clusters advertise 127.0.0.1. Set this when + /// the client cannot reach the Docker host at 127.0.0.1 — for + /// example in CI containers, WSL, or when Docker runs on a + /// remote host. Common values: `host.docker.internal`, a LAN IP, + /// or a hostname. #[arg(long)] gateway_host: Option<String>,