From fc94f0cd03abe6100c1e94848f6ac76fc39c6024 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 16 Mar 2026 12:01:01 -0700 Subject: [PATCH 1/7] fix(ci): replace Docker Hub python images with CI image to avoid rate limits Unauthenticated pulls from Docker Hub hit 429 rate limits in E2E tests and wheel builds. Switch all python:* base image references to ghcr.io/nvidia/openshell/ci:latest which is already authenticated in every CI workflow. --- deploy/docker/Dockerfile.python-wheels | 16 ++-------------- deploy/docker/Dockerfile.python-wheels-macos | 10 ++-------- e2e/rust/tests/custom_image.rs | 2 +- e2e/rust/tests/forward_proxy_l7_bypass.rs | 2 +- e2e/rust/tests/host_gateway_alias.rs | 2 +- 5 files changed, 7 insertions(+), 25 deletions(-) diff --git a/deploy/docker/Dockerfile.python-wheels b/deploy/docker/Dockerfile.python-wheels index 0cfe17eb..460cbd65 100644 --- a/deploy/docker/Dockerfile.python-wheels +++ b/deploy/docker/Dockerfile.python-wheels @@ -4,22 +4,10 @@ # SPDX-License-Identifier: Apache-2.0 -ARG PYTHON_VERSION=3.12 +ARG CI_IMAGE=ghcr.io/nvidia/openshell/ci:latest -FROM --platform=$BUILDPLATFORM python:${PYTHON_VERSION}-slim AS base +FROM --platform=$BUILDPLATFORM ${CI_IMAGE} AS base -ENV PATH="/root/.cargo/bin:${PATH}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - curl \ - gcc \ - libc6-dev \ - pkg-config \ - libssl-dev \ - && rm -rf /var/lib/apt/lists/* - -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y RUN pip install --no-cache-dir maturin COPY deploy/docker/cross-build.sh /usr/local/bin/ diff --git a/deploy/docker/Dockerfile.python-wheels-macos b/deploy/docker/Dockerfile.python-wheels-macos index fb4199af..69a946c0 100644 --- a/deploy/docker/Dockerfile.python-wheels-macos +++ b/deploy/docker/Dockerfile.python-wheels-macos @@ -5,11 +5,11 @@ ARG OSXCROSS_IMAGE=crazymax/osxcross:latest -ARG PYTHON_VERSION=3.12 +ARG CI_IMAGE=ghcr.io/nvidia/openshell/ci:latest FROM ${OSXCROSS_IMAGE} AS osxcross -FROM python:${PYTHON_VERSION}-slim AS builder +FROM ${CI_IMAGE} AS builder ARG TARGETARCH ARG OPENSHELL_CARGO_VERSION @@ -22,19 +22,13 @@ ENV LD_LIBRARY_PATH="/osxcross/lib" COPY --from=osxcross /osxcross /osxcross RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ clang \ - curl \ - libssl-dev \ - pkg-config \ && rm -rf /var/lib/apt/lists/* # aws-lc-sys probes with --target=arm64-apple-macosx and clang then looks for # arm64-apple-macosx-ld. Provide a linker alias to osxcross ld64. RUN ln -sf /osxcross/bin/arm64-apple-darwin25.1-ld /usr/local/bin/arm64-apple-macosx-ld -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y RUN rustup target add aarch64-apple-darwin RUN pip install --no-cache-dir maturin diff --git a/e2e/rust/tests/custom_image.rs b/e2e/rust/tests/custom_image.rs index 30a61f85..9018dae1 100644 --- a/e2e/rust/tests/custom_image.rs +++ b/e2e/rust/tests/custom_image.rs @@ -15,7 +15,7 @@ use std::io::Write; use openshell_e2e::harness::output::strip_ansi; use openshell_e2e::harness::sandbox::SandboxGuard; -const DOCKERFILE_CONTENT: &str = r#"FROM python:3.13-slim +const DOCKERFILE_CONTENT: &str = r#"FROM ghcr.io/nvidia/openshell/ci:latest # iproute2 is required for sandbox network namespace isolation. RUN apt-get update && apt-get install -y --no-install-recommends iproute2 \ diff --git a/e2e/rust/tests/forward_proxy_l7_bypass.rs b/e2e/rust/tests/forward_proxy_l7_bypass.rs index 3e913607..5d08830e 100644 --- a/e2e/rust/tests/forward_proxy_l7_bypass.rs +++ b/e2e/rust/tests/forward_proxy_l7_bypass.rs @@ -17,7 +17,7 @@ use openshell_e2e::harness::sandbox::SandboxGuard; use tempfile::NamedTempFile; use tokio::time::{interval, timeout}; -const TEST_SERVER_IMAGE: &str = "python:3.13-alpine"; +const TEST_SERVER_IMAGE: &str = "ghcr.io/nvidia/openshell/ci:latest"; struct DockerServer { port: u16, diff --git a/e2e/rust/tests/host_gateway_alias.rs b/e2e/rust/tests/host_gateway_alias.rs index 547a9238..9464115d 100644 --- a/e2e/rust/tests/host_gateway_alias.rs +++ b/e2e/rust/tests/host_gateway_alias.rs @@ -17,7 +17,7 @@ use tokio::time::{interval, timeout}; const INFERENCE_PROVIDER_NAME: &str = "e2e-host-inference"; const INFERENCE_PROVIDER_UNREACHABLE_NAME: &str = "e2e-host-inference-unreachable"; -const TEST_SERVER_IMAGE: &str = "python:3.13-alpine"; +const TEST_SERVER_IMAGE: &str = "ghcr.io/nvidia/openshell/ci:latest"; static INFERENCE_ROUTE_LOCK: Mutex<()> = Mutex::new(()); async fn run_cli(args: &[&str]) -> Result { From d263c793d7ede993c80bd29677612cead2a16503 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 16 Mar 2026 12:16:53 -0700 Subject: [PATCH 2/7] fix(e2e): use community sandbox base image for E2E tests The CI image is heavyweight and not intended as a sandbox base. Use the community sandbox base image instead, which is purpose-built for running inside sandboxes. --- e2e/rust/tests/custom_image.rs | 2 +- e2e/rust/tests/forward_proxy_l7_bypass.rs | 2 +- e2e/rust/tests/host_gateway_alias.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/e2e/rust/tests/custom_image.rs b/e2e/rust/tests/custom_image.rs index 9018dae1..1564003a 100644 --- a/e2e/rust/tests/custom_image.rs +++ b/e2e/rust/tests/custom_image.rs @@ -15,7 +15,7 @@ use std::io::Write; use openshell_e2e::harness::output::strip_ansi; use openshell_e2e::harness::sandbox::SandboxGuard; -const DOCKERFILE_CONTENT: &str = r#"FROM ghcr.io/nvidia/openshell/ci:latest +const DOCKERFILE_CONTENT: &str = r#"FROM ghcr.io/nvidia/openshell-community/sandboxes/base:latest # iproute2 is required for sandbox network namespace isolation. RUN apt-get update && apt-get install -y --no-install-recommends iproute2 \ diff --git a/e2e/rust/tests/forward_proxy_l7_bypass.rs b/e2e/rust/tests/forward_proxy_l7_bypass.rs index 5d08830e..76b7da12 100644 --- a/e2e/rust/tests/forward_proxy_l7_bypass.rs +++ b/e2e/rust/tests/forward_proxy_l7_bypass.rs @@ -17,7 +17,7 @@ use openshell_e2e::harness::sandbox::SandboxGuard; use tempfile::NamedTempFile; use tokio::time::{interval, timeout}; -const TEST_SERVER_IMAGE: &str = "ghcr.io/nvidia/openshell/ci:latest"; +const TEST_SERVER_IMAGE: &str = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest"; struct DockerServer { port: u16, diff --git a/e2e/rust/tests/host_gateway_alias.rs b/e2e/rust/tests/host_gateway_alias.rs index 9464115d..ba4dcb6e 100644 --- a/e2e/rust/tests/host_gateway_alias.rs +++ b/e2e/rust/tests/host_gateway_alias.rs @@ -17,7 +17,7 @@ use tokio::time::{interval, timeout}; const INFERENCE_PROVIDER_NAME: &str = "e2e-host-inference"; const INFERENCE_PROVIDER_UNREACHABLE_NAME: &str = "e2e-host-inference-unreachable"; -const TEST_SERVER_IMAGE: &str = "ghcr.io/nvidia/openshell/ci:latest"; +const TEST_SERVER_IMAGE: &str = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest"; static INFERENCE_ROUTE_LOCK: Mutex<()> = Mutex::new(()); async fn run_cli(args: &[&str]) -> Result { From 63496ac729746bc3272aa21acd3ead1ee4199a3e Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 16 Mar 2026 12:22:30 -0700 Subject: [PATCH 3/7] fix(e2e): remove redundant setup from custom_image test Dockerfile The community sandbox base image already includes iproute2 and the sandbox user/group (UID/GID 1000). The groupadd command failed because GID 1000 already exists in the base image. --- e2e/rust/tests/custom_image.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/e2e/rust/tests/custom_image.rs b/e2e/rust/tests/custom_image.rs index 1564003a..fc1d28fd 100644 --- a/e2e/rust/tests/custom_image.rs +++ b/e2e/rust/tests/custom_image.rs @@ -17,14 +17,6 @@ use openshell_e2e::harness::sandbox::SandboxGuard; const DOCKERFILE_CONTENT: &str = r#"FROM ghcr.io/nvidia/openshell-community/sandboxes/base:latest -# iproute2 is required for sandbox network namespace isolation. -RUN apt-get update && apt-get install -y --no-install-recommends iproute2 \ - && rm -rf /var/lib/apt/lists/* - -# Create the sandbox user/group so the supervisor can switch to it. -RUN groupadd -g 1000 sandbox && \ - useradd -m -u 1000 -g sandbox sandbox - # Write a marker file so we can verify this is our custom image. RUN echo "custom-image-e2e-marker" > /opt/marker.txt From 79a51ca35de91b4a8b5e5fbdd4ac66f16fc89e5b Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 16 Mar 2026 12:41:59 -0700 Subject: [PATCH 4/7] fix(e2e): use ECR public mirror for custom_image test base This test validates building a truly custom image from scratch, so it needs a generic base (not the sandbox base image). Use the ECR public mirror of python:3.13-slim to avoid Docker Hub rate limits. --- e2e/rust/tests/custom_image.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/e2e/rust/tests/custom_image.rs b/e2e/rust/tests/custom_image.rs index fc1d28fd..14fc3f47 100644 --- a/e2e/rust/tests/custom_image.rs +++ b/e2e/rust/tests/custom_image.rs @@ -15,7 +15,15 @@ use std::io::Write; use openshell_e2e::harness::output::strip_ansi; use openshell_e2e::harness::sandbox::SandboxGuard; -const DOCKERFILE_CONTENT: &str = r#"FROM ghcr.io/nvidia/openshell-community/sandboxes/base:latest +const DOCKERFILE_CONTENT: &str = r#"FROM public.ecr.aws/docker/library/python:3.13-slim + +# iproute2 is required for sandbox network namespace isolation. +RUN apt-get update && apt-get install -y --no-install-recommends iproute2 \ + && rm -rf /var/lib/apt/lists/* + +# Create the sandbox user/group so the supervisor can switch to it. +RUN groupadd -g 1000 sandbox && \ + useradd -m -u 1000 -g sandbox sandbox # Write a marker file so we can verify this is our custom image. RUN echo "custom-image-e2e-marker" > /opt/marker.txt From a2dc4a1173e5292831909ae93a21ecb248b8d0a0 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 16 Mar 2026 12:52:07 -0700 Subject: [PATCH 5/7] revert: restore python:3.12-slim for wheel builds The CI image's mise shims don't resolve Python tools (pip, maturin) outside /opt/mise. Revert wheel Dockerfiles to the original python:3.12-slim base until a proper fix is in place. --- deploy/docker/Dockerfile.python-wheels | 16 ++++++++++++++-- deploy/docker/Dockerfile.python-wheels-macos | 10 ++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/deploy/docker/Dockerfile.python-wheels b/deploy/docker/Dockerfile.python-wheels index 460cbd65..0cfe17eb 100644 --- a/deploy/docker/Dockerfile.python-wheels +++ b/deploy/docker/Dockerfile.python-wheels @@ -4,10 +4,22 @@ # SPDX-License-Identifier: Apache-2.0 -ARG CI_IMAGE=ghcr.io/nvidia/openshell/ci:latest +ARG PYTHON_VERSION=3.12 -FROM --platform=$BUILDPLATFORM ${CI_IMAGE} AS base +FROM --platform=$BUILDPLATFORM python:${PYTHON_VERSION}-slim AS base +ENV PATH="/root/.cargo/bin:${PATH}" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + gcc \ + libc6-dev \ + pkg-config \ + libssl-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y RUN pip install --no-cache-dir maturin COPY deploy/docker/cross-build.sh /usr/local/bin/ diff --git a/deploy/docker/Dockerfile.python-wheels-macos b/deploy/docker/Dockerfile.python-wheels-macos index 69a946c0..fb4199af 100644 --- a/deploy/docker/Dockerfile.python-wheels-macos +++ b/deploy/docker/Dockerfile.python-wheels-macos @@ -5,11 +5,11 @@ ARG OSXCROSS_IMAGE=crazymax/osxcross:latest -ARG CI_IMAGE=ghcr.io/nvidia/openshell/ci:latest +ARG PYTHON_VERSION=3.12 FROM ${OSXCROSS_IMAGE} AS osxcross -FROM ${CI_IMAGE} AS builder +FROM python:${PYTHON_VERSION}-slim AS builder ARG TARGETARCH ARG OPENSHELL_CARGO_VERSION @@ -22,13 +22,19 @@ ENV LD_LIBRARY_PATH="/osxcross/lib" COPY --from=osxcross /osxcross /osxcross RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ clang \ + curl \ + libssl-dev \ + pkg-config \ && rm -rf /var/lib/apt/lists/* # aws-lc-sys probes with --target=arm64-apple-macosx and clang then looks for # arm64-apple-macosx-ld. Provide a linker alias to osxcross ld64. RUN ln -sf /osxcross/bin/arm64-apple-darwin25.1-ld /usr/local/bin/arm64-apple-macosx-ld +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y RUN rustup target add aarch64-apple-darwin RUN pip install --no-cache-dir maturin From c2d2638196fb531f4cea445a192b4737b7c1ab16 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 16 Mar 2026 13:12:40 -0700 Subject: [PATCH 6/7] fix(e2e): increase docker test server readiness timeout to 60s The larger sandbox base image takes longer to pull than the previous alpine image, leaving less headroom for the readiness check. --- e2e/rust/tests/forward_proxy_l7_bypass.rs | 4 ++-- e2e/rust/tests/host_gateway_alias.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/e2e/rust/tests/forward_proxy_l7_bypass.rs b/e2e/rust/tests/forward_proxy_l7_bypass.rs index 76b7da12..8ba218a7 100644 --- a/e2e/rust/tests/forward_proxy_l7_bypass.rs +++ b/e2e/rust/tests/forward_proxy_l7_bypass.rs @@ -79,7 +79,7 @@ HTTPServer(("0.0.0.0", 8000), Handler).serve_forever() async fn wait_until_ready(&self) -> Result<(), String> { let container_id = self.container_id.clone(); - timeout(Duration::from_secs(30), async move { + timeout(Duration::from_secs(60), async move { let mut tick = interval(Duration::from_millis(500)); loop { tick.tick().await; @@ -99,7 +99,7 @@ HTTPServer(("0.0.0.0", 8000), Handler).serve_forever() } }) .await - .map_err(|_| "docker test server did not become ready within 30s".to_string()) + .map_err(|_| "docker test server did not become ready within 60s".to_string()) } } diff --git a/e2e/rust/tests/host_gateway_alias.rs b/e2e/rust/tests/host_gateway_alias.rs index ba4dcb6e..a59b4073 100644 --- a/e2e/rust/tests/host_gateway_alias.rs +++ b/e2e/rust/tests/host_gateway_alias.rs @@ -118,7 +118,7 @@ HTTPServer(("0.0.0.0", 8000), Handler).serve_forever() async fn wait_until_ready(&self) -> Result<(), String> { let container_id = self.container_id.clone(); - timeout(Duration::from_secs(30), async move { + timeout(Duration::from_secs(60), async move { let mut tick = interval(Duration::from_millis(500)); loop { tick.tick().await; @@ -141,7 +141,7 @@ HTTPServer(("0.0.0.0", 8000), Handler).serve_forever() .await .map_err(|_| { format!( - "docker test server {} did not become ready within 30s", + "docker test server {} did not become ready within 60s", self.container_id ) })? From c77d2ada846b168d979a86d4d340589fbf8f1c90 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 16 Mar 2026 13:32:31 -0700 Subject: [PATCH 7/7] fix(e2e): use ECR python:3.13-alpine for test server images The sandbox base image is too large and times out during pull. Use the ECR public mirror of python:3.13-alpine which is small and fast. --- e2e/rust/tests/forward_proxy_l7_bypass.rs | 2 +- e2e/rust/tests/host_gateway_alias.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/rust/tests/forward_proxy_l7_bypass.rs b/e2e/rust/tests/forward_proxy_l7_bypass.rs index 8ba218a7..fb75176c 100644 --- a/e2e/rust/tests/forward_proxy_l7_bypass.rs +++ b/e2e/rust/tests/forward_proxy_l7_bypass.rs @@ -17,7 +17,7 @@ use openshell_e2e::harness::sandbox::SandboxGuard; use tempfile::NamedTempFile; use tokio::time::{interval, timeout}; -const TEST_SERVER_IMAGE: &str = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest"; +const TEST_SERVER_IMAGE: &str = "public.ecr.aws/docker/library/python:3.13-alpine"; struct DockerServer { port: u16, diff --git a/e2e/rust/tests/host_gateway_alias.rs b/e2e/rust/tests/host_gateway_alias.rs index a59b4073..fe6a15e7 100644 --- a/e2e/rust/tests/host_gateway_alias.rs +++ b/e2e/rust/tests/host_gateway_alias.rs @@ -17,7 +17,7 @@ use tokio::time::{interval, timeout}; const INFERENCE_PROVIDER_NAME: &str = "e2e-host-inference"; const INFERENCE_PROVIDER_UNREACHABLE_NAME: &str = "e2e-host-inference-unreachable"; -const TEST_SERVER_IMAGE: &str = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest"; +const TEST_SERVER_IMAGE: &str = "public.ecr.aws/docker/library/python:3.13-alpine"; static INFERENCE_ROUTE_LOCK: Mutex<()> = Mutex::new(()); async fn run_cli(args: &[&str]) -> Result {