diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index 1cb62b7b..6616b085 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -21,6 +21,7 @@ use bollard::query_parameters::{ use futures::StreamExt; use miette::{IntoDiagnostic, Result, WrapErr}; use std::collections::HashMap; +use tracing::debug; const REGISTRY_NAMESPACE_DEFAULT: &str = "openshell"; @@ -436,10 +437,36 @@ pub async fn ensure_image( ..Default::default() }; - let mut stream = docker.create_image(Some(options), None, credentials); + // Attempt the pull with credentials (if any), falling back to an + // anonymous pull when the registry rejects the credentials. This + // handles public ghcr.io repos where sending a token the caller + // doesn't have access with causes a 401/403, but an unauthenticated + // pull would succeed. + let has_credentials = credentials.is_some(); + let mut stream = docker.create_image(Some(options.clone()), None, credentials); + let mut auth_failed = false; while let Some(result) = stream.next().await { - result.into_diagnostic()?; + match result { + Ok(_info) => {} + Err(err) if has_credentials && image::is_auth_failure(&err) => { + debug!( + "Registry credentials rejected for {}, retrying as anonymous pull", + repo + ); + auth_failed = true; + break; + } + Err(err) => return Err(err).into_diagnostic(), + } } + + if auth_failed { + let mut stream = docker.create_image(Some(options), None, None); + while let Some(result) = stream.next().await { + result.into_diagnostic()?; + } + } + Ok(()) } diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 14284a90..03b0caae 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -240,8 +240,9 @@ fn diagnose_port_conflict(_gateway_name: &str) -> GatewayFailureDiagnosis { fn diagnose_image_pull_auth_failure(_gateway_name: &str) -> GatewayFailureDiagnosis { GatewayFailureDiagnosis { summary: "Registry authentication failed".to_string(), - explanation: "Could not authenticate with the container registry. The image may not \ - exist, or you may not have permission to access it." + explanation: "Could not authenticate with the container registry and anonymous \ + pull also failed. The image may not exist, the repository may be private, \ + or there may be a network issue preventing access." .to_string(), recovery_steps: vec![ RecoveryStep::with_command( diff --git a/crates/openshell-bootstrap/src/image.rs b/crates/openshell-bootstrap/src/image.rs index 0bb091a0..0cdc7d3f 100644 --- a/crates/openshell-bootstrap/src/image.rs +++ b/crates/openshell-bootstrap/src/image.rs @@ -6,6 +6,7 @@ use crate::docker::{HostPlatform, get_host_platform}; use bollard::Docker; use bollard::auth::DockerCredentials; +use bollard::errors::Error as BollardError; use bollard::query_parameters::{CreateImageOptions, TagImageOptionsBuilder}; use futures::StreamExt; use miette::{IntoDiagnostic, Result, WrapErr}; @@ -222,15 +223,11 @@ pub async fn pull_remote_image( ..Default::default() }; - let mut stream = remote.create_image(Some(options), None, credentials); - while let Some(result) = stream.next().await { - let info = result - .into_diagnostic() - .wrap_err("failed to pull image on remote host")?; + // Attempt the pull with credentials, falling back to anonymous on auth failure. + let result = consume_pull_stream(remote, options.clone(), credentials, |info| { if let Some(ref status) = info.status { debug!("Remote pull: {}", status); } - // Report layer progress if let Some(ref status) = info.status && let Some(ref detail) = info.progress_detail && let (Some(current), Some(total)) = (detail.current, detail.total) @@ -239,6 +236,38 @@ pub async fn pull_remote_image( let total_mb = total / (1024 * 1024); on_progress(format!("[progress] {status}: {current_mb}/{total_mb} MB")); } + }) + .await; + + match result { + Ok(()) => {} + Err(err) if is_auth_failure(&err) => { + debug!( + "Registry credentials rejected for {}, retrying as anonymous pull", + registry_image + ); + consume_pull_stream(remote, options, None, |info| { + if let Some(ref status) = info.status { + debug!("Remote pull (anonymous): {}", status); + } + if let Some(ref status) = info.status + && let Some(ref detail) = info.progress_detail + && let (Some(current), Some(total)) = (detail.current, detail.total) + { + let current_mb = current / (1024 * 1024); + let total_mb = total / (1024 * 1024); + on_progress(format!("[progress] {status}: {current_mb}/{total_mb} MB")); + } + }) + .await + .into_diagnostic() + .wrap_err("failed to pull image on remote host (anonymous fallback)")?; + } + Err(err) => { + return Err(err) + .into_diagnostic() + .wrap_err("failed to pull image on remote host"); + } } // Tag the pulled image to the expected local image ref so downstream code @@ -304,6 +333,50 @@ pub(crate) fn is_local_image_ref(image_ref: &str) -> bool { !repo.contains('/') } +/// Check whether a bollard error indicates an authentication/authorization failure. +/// +/// These errors occur when credentials are rejected by the registry. For public +/// repos on ghcr.io, retrying without credentials (anonymous pull) may succeed. +pub(crate) fn is_auth_failure(err: &BollardError) -> bool { + match err { + BollardError::DockerResponseServerError { + status_code: 401 | 403, + .. + } => true, + BollardError::DockerResponseServerError { message, .. } => { + let msg = message.to_lowercase(); + msg.contains("pull access denied") + || msg.contains("unauthorized") + || msg.contains("denied: access forbidden") + } + BollardError::DockerStreamError { error } => { + let msg = error.to_lowercase(); + msg.contains("pull access denied") + || msg.contains("unauthorized") + || msg.contains("denied: access forbidden") + } + _ => false, + } +} + +/// Consume a `create_image` pull stream to completion. +/// +/// Returns `Ok(())` on success, or the first [`BollardError`] on failure. +/// The `on_info` callback is invoked for each progress chunk. +async fn consume_pull_stream( + docker: &Docker, + options: CreateImageOptions, + credentials: Option, + mut on_info: impl FnMut(&bollard::models::CreateImageInfo), +) -> std::result::Result<(), BollardError> { + let mut stream = docker.create_image(Some(options), None, credentials); + while let Some(result) = stream.next().await { + let info = result?; + on_info(&info); + } + Ok(()) +} + #[cfg(test)] mod tests { use super::*; @@ -399,4 +472,58 @@ mod tests { "repo base should start with the registry host" ); } + + #[test] + fn is_auth_failure_status_401() { + let err = BollardError::DockerResponseServerError { + status_code: 401, + message: "Unauthorized".to_string(), + }; + assert!(is_auth_failure(&err)); + } + + #[test] + fn is_auth_failure_status_403() { + let err = BollardError::DockerResponseServerError { + status_code: 403, + message: "Forbidden".to_string(), + }; + assert!(is_auth_failure(&err)); + } + + #[test] + fn is_auth_failure_pull_access_denied_message() { + let err = BollardError::DockerResponseServerError { + status_code: 500, + message: "pull access denied for ghcr.io/foo/bar, repository does not exist" + .to_string(), + }; + assert!(is_auth_failure(&err)); + } + + #[test] + fn is_auth_failure_stream_error() { + let err = BollardError::DockerStreamError { + error: "unauthorized: unauthenticated: User cannot be authenticated".to_string(), + }; + assert!(is_auth_failure(&err)); + } + + #[test] + fn is_auth_failure_not_found_is_false() { + let err = BollardError::DockerResponseServerError { + status_code: 404, + message: "Not found".to_string(), + }; + assert!(!is_auth_failure(&err)); + } + + #[test] + fn is_auth_failure_generic_500_is_false() { + let err = BollardError::DockerResponseServerError { + status_code: 500, + message: "internal server error".to_string(), + }; + assert!(!is_auth_failure(&err)); + } } diff --git a/deploy/docker/Dockerfile.cluster b/deploy/docker/Dockerfile.cluster index 49e29a98..36fc373f 100644 --- a/deploy/docker/Dockerfile.cluster +++ b/deploy/docker/Dockerfile.cluster @@ -175,9 +175,11 @@ FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 # - mount/umount: needed by kubelet for volume mounts (provided by mount package) # - ca-certificates: TLS verification for registry pulls # - conntrack: k3s/kube-proxy uses conntrack for connection tracking +# - curl: used by entrypoint to validate registry credentials # - dnsutils: nslookup used by entrypoint/healthcheck for DNS probe RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ + curl \ iptables \ mount \ dnsutils \ diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index f2e768c6..9a21cfdb 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -208,8 +208,26 @@ REGEOF REGEOF fi + # Helper: test whether registry credentials are accepted by attempting + # a token exchange via the OCI distribution auth endpoint. Returns 0 + # when credentials work, non-zero otherwise. This lets us skip writing + # auth config for registries where the token is rejected — containerd + # will then fall back to anonymous pulls, which succeeds for public repos. + test_registry_credentials() { + local host="$1" user="$2" pass="$3" + # ghcr.io uses token-based auth; we test with an arbitrary scope. + curl -sf -o /dev/null -u "${user}:${pass}" \ + "https://${host}/token?service=${host}&scope=repository:nvidia/openshell/cluster:pull" \ + 2>/dev/null + } + + # Track whether the configs: YAML block has been started so subsequent + # registry entries can be appended without duplicating the key. + CONFIGS_STARTED=false + if [ -n "${REGISTRY_USERNAME:-}" ] && [ -n "${REGISTRY_PASSWORD:-}" ]; then - cat >> "$REGISTRIES_YAML" <> "$REGISTRIES_YAML" <> "$REGISTRIES_YAML" <> "$REGISTRIES_YAML" <> "$REGISTRIES_YAML" <> "$REGISTRIES_YAML" <