Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions crates/openshell-bootstrap/src/docker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use bollard::query_parameters::{
use futures::StreamExt;
use miette::{IntoDiagnostic, Result, WrapErr};
use std::collections::HashMap;
use tracing::debug;

const REGISTRY_NAMESPACE_DEFAULT: &str = "openshell";

Expand Down Expand Up @@ -436,10 +437,36 @@ pub async fn ensure_image(
..Default::default()
};

let mut stream = docker.create_image(Some(options), None, credentials);
// Attempt the pull with credentials (if any), falling back to an
// anonymous pull when the registry rejects the credentials. This
// handles public ghcr.io repos where sending a token that lacks
// access to the repository causes a 401/403, but an unauthenticated
// pull would succeed.
let has_credentials = credentials.is_some();
let mut stream = docker.create_image(Some(options.clone()), None, credentials);
let mut auth_failed = false;
while let Some(result) = stream.next().await {
result.into_diagnostic()?;
match result {
Ok(_info) => {}
Err(err) if has_credentials && image::is_auth_failure(&err) => {
debug!(
"Registry credentials rejected for {}, retrying as anonymous pull",
repo
);
auth_failed = true;
break;
}
Err(err) => return Err(err).into_diagnostic(),
}
}

if auth_failed {
let mut stream = docker.create_image(Some(options), None, None);
while let Some(result) = stream.next().await {
result.into_diagnostic()?;
}
}

Ok(())
}

Expand Down
5 changes: 3 additions & 2 deletions crates/openshell-bootstrap/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,9 @@ fn diagnose_port_conflict(_gateway_name: &str) -> GatewayFailureDiagnosis {
fn diagnose_image_pull_auth_failure(_gateway_name: &str) -> GatewayFailureDiagnosis {
GatewayFailureDiagnosis {
summary: "Registry authentication failed".to_string(),
explanation: "Could not authenticate with the container registry. The image may not \
exist, or you may not have permission to access it."
explanation: "Could not authenticate with the container registry and anonymous \
pull also failed. The image may not exist, the repository may be private, \
or there may be a network issue preventing access."
.to_string(),
recovery_steps: vec![
RecoveryStep::with_command(
Expand Down
139 changes: 133 additions & 6 deletions crates/openshell-bootstrap/src/image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use crate::docker::{HostPlatform, get_host_platform};
use bollard::Docker;
use bollard::auth::DockerCredentials;
use bollard::errors::Error as BollardError;
use bollard::query_parameters::{CreateImageOptions, TagImageOptionsBuilder};
use futures::StreamExt;
use miette::{IntoDiagnostic, Result, WrapErr};
Expand Down Expand Up @@ -222,15 +223,11 @@ pub async fn pull_remote_image(
..Default::default()
};

let mut stream = remote.create_image(Some(options), None, credentials);
while let Some(result) = stream.next().await {
let info = result
.into_diagnostic()
.wrap_err("failed to pull image on remote host")?;
// Attempt the pull with credentials, falling back to anonymous on auth failure.
let result = consume_pull_stream(remote, options.clone(), credentials, |info| {
if let Some(ref status) = info.status {
debug!("Remote pull: {}", status);
}
// Report layer progress
if let Some(ref status) = info.status
&& let Some(ref detail) = info.progress_detail
&& let (Some(current), Some(total)) = (detail.current, detail.total)
Expand All @@ -239,6 +236,38 @@ pub async fn pull_remote_image(
let total_mb = total / (1024 * 1024);
on_progress(format!("[progress] {status}: {current_mb}/{total_mb} MB"));
}
})
.await;

match result {
Ok(()) => {}
Err(err) if is_auth_failure(&err) => {
debug!(
"Registry credentials rejected for {}, retrying as anonymous pull",
registry_image
);
consume_pull_stream(remote, options, None, |info| {
if let Some(ref status) = info.status {
debug!("Remote pull (anonymous): {}", status);
}
if let Some(ref status) = info.status
&& let Some(ref detail) = info.progress_detail
&& let (Some(current), Some(total)) = (detail.current, detail.total)
{
let current_mb = current / (1024 * 1024);
let total_mb = total / (1024 * 1024);
on_progress(format!("[progress] {status}: {current_mb}/{total_mb} MB"));
}
})
.await
.into_diagnostic()
.wrap_err("failed to pull image on remote host (anonymous fallback)")?;
}
Err(err) => {
return Err(err)
.into_diagnostic()
.wrap_err("failed to pull image on remote host");
}
}

// Tag the pulled image to the expected local image ref so downstream code
Expand Down Expand Up @@ -304,6 +333,50 @@ pub(crate) fn is_local_image_ref(image_ref: &str) -> bool {
!repo.contains('/')
}

/// Check whether a bollard error indicates an authentication/authorization failure.
///
/// These errors occur when credentials are rejected by the registry. For public
/// repos on ghcr.io, retrying without credentials (anonymous pull) may succeed.
pub(crate) fn is_auth_failure(err: &BollardError) -> bool {
match err {
BollardError::DockerResponseServerError {
status_code: 401 | 403,
..
} => true,
BollardError::DockerResponseServerError { message, .. } => {
let msg = message.to_lowercase();
msg.contains("pull access denied")
|| msg.contains("unauthorized")
|| msg.contains("denied: access forbidden")
}
BollardError::DockerStreamError { error } => {
let msg = error.to_lowercase();
msg.contains("pull access denied")
|| msg.contains("unauthorized")
|| msg.contains("denied: access forbidden")
}
_ => false,
}
}

/// Drive a `create_image` pull on `docker` until its stream is exhausted.
///
/// Every successfully received progress chunk is handed to `on_info` by
/// reference. The first stream error ends the pull and is returned unchanged
/// as a [`BollardError`]; `Ok(())` is returned once the stream ends.
async fn consume_pull_stream(
    docker: &Docker,
    options: CreateImageOptions,
    credentials: Option<DockerCredentials>,
    mut on_info: impl FnMut(&bollard::models::CreateImageInfo),
) -> std::result::Result<(), BollardError> {
    let mut pull = docker.create_image(Some(options), None, credentials);
    loop {
        match pull.next().await {
            Some(Ok(chunk)) => on_info(&chunk),
            Some(Err(failure)) => return Err(failure),
            // Stream finished without reporting an error.
            None => return Ok(()),
        }
    }
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -399,4 +472,58 @@ mod tests {
"repo base should start with the registry host"
);
}

#[test]
fn is_auth_failure_status_401() {
    // An HTTP 401 server response must be classified as a credential rejection.
    assert!(is_auth_failure(&BollardError::DockerResponseServerError {
        status_code: 401,
        message: String::from("Unauthorized"),
    }));
}

#[test]
fn is_auth_failure_status_403() {
    // An HTTP 403 server response must be classified as a credential rejection.
    assert!(is_auth_failure(&BollardError::DockerResponseServerError {
        status_code: 403,
        message: String::from("Forbidden"),
    }));
}

#[test]
fn is_auth_failure_pull_access_denied_message() {
    // A non-401/403 status still counts when the message carries a denial phrase.
    let denied = String::from(
        "pull access denied for ghcr.io/foo/bar, repository does not exist",
    );
    assert!(is_auth_failure(&BollardError::DockerResponseServerError {
        status_code: 500,
        message: denied,
    }));
}

#[test]
fn is_auth_failure_stream_error() {
    // Stream errors are matched on message text only.
    assert!(is_auth_failure(&BollardError::DockerStreamError {
        error: String::from("unauthorized: unauthenticated: User cannot be authenticated"),
    }));
}

#[test]
fn is_auth_failure_not_found_is_false() {
    // A 404 whose message has no denial phrase is not an auth failure.
    assert!(!is_auth_failure(&BollardError::DockerResponseServerError {
        status_code: 404,
        message: String::from("Not found"),
    }));
}

#[test]
fn is_auth_failure_generic_500_is_false() {
    // A 500 whose message has no denial phrase is not an auth failure.
    assert!(!is_auth_failure(&BollardError::DockerResponseServerError {
        status_code: 500,
        message: String::from("internal server error"),
    }));
}
}
2 changes: 2 additions & 0 deletions deploy/docker/Dockerfile.cluster
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,11 @@ FROM nvcr.io/nvidia/base/ubuntu:noble-20251013
# - mount/umount: needed by kubelet for volume mounts (provided by mount package)
# - ca-certificates: TLS verification for registry pulls
# - conntrack: k3s/kube-proxy uses conntrack for connection tracking
# - curl: used by entrypoint to validate registry credentials
# - dnsutils: nslookup used by entrypoint/healthcheck for DNS probe
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
curl \
iptables \
mount \
dnsutils \
Expand Down
39 changes: 32 additions & 7 deletions deploy/docker/cluster-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -208,39 +208,64 @@ REGEOF
REGEOF
fi

# Helper: probe whether registry credentials are accepted by attempting a
# token exchange against the registry's /token endpoint with HTTP basic auth.
#
# Arguments: $1 = registry host, $2 = username, $3 = password.
# Returns 0 when the endpoint answers with a success status; non-zero when
# the request is rejected (curl -f turns HTTP >= 400 into a failure) or
# cannot be completed at all (DNS/connect failure, timeout).
#
# Callers skip writing auth config on a non-zero result so that containerd
# falls back to anonymous pulls, which succeed for public repos.
test_registry_credentials() {
    local host="$1" user="$2" pass="$3"
    # Bound the probe: curl applies no overall timeout by default, so a
    # blackholed or stalled registry would otherwise hang the entrypoint.
    # ghcr.io uses token-based auth; we test with an arbitrary scope.
    curl -sf -o /dev/null \
        --connect-timeout 10 --max-time 30 \
        -u "${user}:${pass}" \
        "https://${host}/token?service=${host}&scope=repository:nvidia/openshell/cluster:pull" \
        2>/dev/null
}

# Track whether the configs: YAML block has been started so subsequent
# registry entries can be appended without duplicating the key.
CONFIGS_STARTED=false

if [ -n "${REGISTRY_USERNAME:-}" ] && [ -n "${REGISTRY_PASSWORD:-}" ]; then
cat >> "$REGISTRIES_YAML" <<REGEOF
if test_registry_credentials "${REGISTRY_HOST}" "${REGISTRY_USERNAME}" "${REGISTRY_PASSWORD}"; then
cat >> "$REGISTRIES_YAML" <<REGEOF

configs:
"${REGISTRY_HOST}":
auth:
username: ${REGISTRY_USERNAME}
password: ${REGISTRY_PASSWORD}
REGEOF
CONFIGS_STARTED=true
else
echo "Registry credentials rejected for ${REGISTRY_HOST}, skipping auth config (anonymous pulls)"
fi
fi

# Add auth for the community registry when it differs from the
# primary registry (community sandbox images live there).
if [ -n "${COMMUNITY_REGISTRY_HOST:-}" ] && [ "${COMMUNITY_REGISTRY_HOST}" != "${REGISTRY_HOST}" ] \
&& [ -n "${COMMUNITY_REGISTRY_USERNAME:-}" ] && [ -n "${COMMUNITY_REGISTRY_PASSWORD:-}" ]; then
# Append to existing configs block or start a new one.
if [ -n "${REGISTRY_USERNAME:-}" ] && [ -n "${REGISTRY_PASSWORD:-}" ]; then
# configs: block already started above — just append the entry.
cat >> "$REGISTRIES_YAML" <<REGEOF
if test_registry_credentials "${COMMUNITY_REGISTRY_HOST}" "${COMMUNITY_REGISTRY_USERNAME}" "${COMMUNITY_REGISTRY_PASSWORD}"; then
if [ "$CONFIGS_STARTED" = "true" ]; then
# configs: block already started above — just append the entry.
cat >> "$REGISTRIES_YAML" <<REGEOF
"${COMMUNITY_REGISTRY_HOST}":
auth:
username: ${COMMUNITY_REGISTRY_USERNAME}
password: ${COMMUNITY_REGISTRY_PASSWORD}
REGEOF
else
cat >> "$REGISTRIES_YAML" <<REGEOF
else
cat >> "$REGISTRIES_YAML" <<REGEOF

configs:
"${COMMUNITY_REGISTRY_HOST}":
auth:
username: ${COMMUNITY_REGISTRY_USERNAME}
password: ${COMMUNITY_REGISTRY_PASSWORD}
REGEOF
fi
else
echo "Community registry credentials rejected for ${COMMUNITY_REGISTRY_HOST}, skipping auth config (anonymous pulls)"
fi
fi
else
Expand Down
Loading