From 48fca13dc5001543b137c8a6c2062e305a4e557c Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sat, 14 Mar 2026 14:44:26 -0700 Subject: [PATCH 1/2] refactor(bootstrap): remove XOR-obfuscated default registry token GHCR repos are now public and no longer require authentication by default. Remove the baked-in XOR-encoded PAT and all automatic credential injection. Image pulls are unauthenticated unless the user explicitly provides credentials via --registry-username/--registry-token flags or OPENSHELL_REGISTRY_USERNAME/OPENSHELL_REGISTRY_TOKEN env vars. Also adds a new --registry-username CLI flag on gateway start for private registry use cases, and removes the automatic community registry credential injection (COMMUNITY_REGISTRY_* env vars) since community sandbox images on GHCR are also public. --- .../skills/debug-openshell-cluster/SKILL.md | 2 +- architecture/gateway-single-node.md | 6 +- crates/openshell-bootstrap/src/docker.rs | 46 +++----- crates/openshell-bootstrap/src/errors.rs | 7 +- crates/openshell-bootstrap/src/image.rs | 100 ++++++------------ crates/openshell-bootstrap/src/lib.rs | 38 +++++-- crates/openshell-cli/src/bootstrap.rs | 12 ++- crates/openshell-cli/src/doctor_llm_prompt.md | 2 +- crates/openshell-cli/src/main.rs | 21 +++- crates/openshell-cli/src/run.rs | 4 + docs/sandboxes/manage-gateways.md | 10 ++ 11 files changed, 128 insertions(+), 120 deletions(-) diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 81b1b53e..115a2aa5 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -160,7 +160,7 @@ openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-op Common issues: - **Replicas 0/0**: The StatefulSet has been scaled to zero — no pods are running. This can happen after a failed deploy, manual scale-down, or Helm values misconfiguration. 
Fix: `openshell doctor exec -- kubectl -n openshell scale statefulset openshell --replicas=1` -- **ImagePullBackOff**: The component image failed to pull. In `internal` mode, verify internal registry readiness and pushed image tags (Step 5). In `external` mode, check `/etc/rancher/k3s/registries.yaml` credentials/endpoints and DNS (Step 8). Default external registry is `ghcr.io/nvidia/openshell/`. Ensure a valid `--registry-token` (or `OPENSHELL_REGISTRY_TOKEN`) was provided during deploy. +- **ImagePullBackOff**: The component image failed to pull. In `internal` mode, verify internal registry readiness and pushed image tags (Step 5). In `external` mode, check `/etc/rancher/k3s/registries.yaml` credentials/endpoints and DNS (Step 8). Default external registry is `ghcr.io/nvidia/openshell/` (public, no auth required). If using a private registry, ensure `--registry-username` and `--registry-token` (or `OPENSHELL_REGISTRY_USERNAME`/`OPENSHELL_REGISTRY_TOKEN`) were provided during deploy. - **CrashLoopBackOff**: The server is crashing. Check pod logs for the actual error. - **Pending**: Insufficient resources or scheduling constraints. diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 08ba5170..679bc338 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -264,7 +264,7 @@ Falls back to `8.8.8.8` / `8.8.4.4` if iptables detection fails. ### Registry configuration -Writes `/etc/rancher/k3s/registries.yaml` from `REGISTRY_HOST`, `REGISTRY_ENDPOINT`, `REGISTRY_USERNAME`, `REGISTRY_PASSWORD`, and `REGISTRY_INSECURE` environment variables so that k3s/containerd can authenticate when pulling component images at runtime. +Writes `/etc/rancher/k3s/registries.yaml` from `REGISTRY_HOST`, `REGISTRY_ENDPOINT`, `REGISTRY_USERNAME`, `REGISTRY_PASSWORD`, and `REGISTRY_INSECURE` environment variables so that k3s/containerd can authenticate when pulling component images at runtime. 
When no explicit credentials are provided (the default for public GHCR repos), the auth block is omitted and images are pulled anonymously. ### Manifest injection @@ -392,8 +392,8 @@ Variables set on the container by `ensure_container()` in `docker.rs`: | `REGISTRY_INSECURE` | `"true"` or `"false"` | Always | | `IMAGE_REPO_BASE` | `{registry_host}/{namespace}` (or `IMAGE_REPO_BASE`/`OPENSHELL_IMAGE_REPO_BASE` override) | Always | | `REGISTRY_ENDPOINT` | Custom endpoint URL | When `OPENSHELL_REGISTRY_ENDPOINT` is set | -| `REGISTRY_USERNAME` | Registry auth username | When credentials available | -| `REGISTRY_PASSWORD` | Registry auth password | When credentials available | +| `REGISTRY_USERNAME` | Registry auth username | When explicit credentials provided via `--registry-username`/`--registry-token` or env vars | +| `REGISTRY_PASSWORD` | Registry auth password | When explicit credentials provided via `--registry-username`/`--registry-token` or env vars | | `EXTRA_SANS` | Comma-separated extra TLS SANs | When extra SANs computed | | `SSH_GATEWAY_HOST` | Resolved remote hostname/IP | Remote deploys only | | `SSH_GATEWAY_PORT` | Configured host port (default `8080`) | Remote deploys only | diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index 3dc832aa..6f90185e 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -3,9 +3,7 @@ use crate::RemoteOptions; use crate::constants::{container_name, network_name, volume_name}; -use crate::image::{ - self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, DEFAULT_REGISTRY_USERNAME, parse_image_ref, -}; +use crate::image::{self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, parse_image_ref}; use bollard::API_DEFAULT_VERSION; use bollard::Docker; use bollard::errors::Error as BollardError; @@ -403,6 +401,7 @@ pub async fn ensure_volume(docker: &Docker, name: &str) -> Result<()> { pub async fn ensure_image( docker: &Docker, image_ref: &str, + 
registry_username: Option<&str>, registry_token: Option<&str>, ) -> Result<()> { match docker.inspect_image(image_ref).await { @@ -423,9 +422,10 @@ pub async fn ensure_image( let (repo, tag) = parse_image_ref(image_ref); - // Use GHCR credentials (explicit or built-in default) for ghcr.io images. + // Use explicit GHCR credentials when provided for ghcr.io images. + // Public repos are pulled without authentication by default. let credentials = if repo.starts_with("ghcr.io/") { - image::ghcr_credentials(registry_token) + image::ghcr_credentials(registry_username, registry_token) } else { None }; @@ -452,6 +452,7 @@ pub async fn ensure_container( gateway_port: u16, disable_tls: bool, disable_gateway_auth: bool, + registry_username: Option<&str>, registry_token: Option<&str>, gpu: bool, ) -> Result<()> { @@ -586,15 +587,17 @@ pub async fn ensure_container( // Credential priority: // 1. OPENSHELL_REGISTRY_USERNAME/PASSWORD env vars (power-user override) - // 2. registry_token from --registry-token / OPENSHELL_REGISTRY_TOKEN - // 3. Built-in default XOR-decoded token - let registry_username = env_non_empty("OPENSHELL_REGISTRY_USERNAME") - .or_else(|| Some(DEFAULT_REGISTRY_USERNAME.to_string())); - let registry_password = env_non_empty("OPENSHELL_REGISTRY_PASSWORD").or_else(|| { + // 2. registry_username/registry_token from CLI flags / env vars + // No built-in default — GHCR repos are public and pull without auth. 
+ let effective_username = env_non_empty("OPENSHELL_REGISTRY_USERNAME").or_else(|| { + registry_username + .filter(|u| !u.is_empty()) + .map(ToString::to_string) + }); + let effective_password = env_non_empty("OPENSHELL_REGISTRY_PASSWORD").or_else(|| { registry_token .filter(|t| !t.is_empty()) .map(ToString::to_string) - .or_else(|| Some(image::default_registry_token())) }); let mut env_vars: Vec<String> = vec![ @@ -606,28 +609,13 @@ pub async fn ensure_container( if let Some(endpoint) = registry_endpoint { env_vars.push(format!("REGISTRY_ENDPOINT={endpoint}")); } - if let (Some(username), Some(password)) = (registry_username, registry_password) { + if let Some(password) = effective_password { + // Default to __token__ when only a password/token is provided. + let username = effective_username.unwrap_or_else(|| "__token__".to_string()); env_vars.push(format!("REGISTRY_USERNAME={username}")); env_vars.push(format!("REGISTRY_PASSWORD={password}")); } - // When the primary registry is NOT ghcr.io (e.g. a local registry in - // push-mode), we still need containerd credentials for the community - // registry so that community sandbox images - // (ghcr.io/nvidia/openshell-community/sandboxes/*) can be pulled at - // runtime. Pass community registry credentials as a separate set of - // env vars so the entrypoint can add a second block to registries.yaml. 
- if registry_host != DEFAULT_REGISTRY { - env_vars.push(format!("COMMUNITY_REGISTRY_HOST={DEFAULT_REGISTRY}")); - env_vars.push(format!( - "COMMUNITY_REGISTRY_USERNAME={DEFAULT_REGISTRY_USERNAME}" - )); - env_vars.push(format!( - "COMMUNITY_REGISTRY_PASSWORD={}", - image::default_registry_token() - )); - } - if !extra_sans.is_empty() { env_vars.push(format!("EXTRA_SANS={}", extra_sans.join(","))); } diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 14284a90..1f1c20fe 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -241,7 +241,9 @@ fn diagnose_image_pull_auth_failure(_gateway_name: &str) -> GatewayFailureDiagno GatewayFailureDiagnosis { summary: "Registry authentication failed".to_string(), explanation: "Could not authenticate with the container registry. The image may not \ - exist, or you may not have permission to access it." + exist, or you may not have permission to access it. Public GHCR repos \ + should not require authentication — if you see this error with the default \ + registry, it may indicate the image does not exist or a network issue." 
.to_string(), recovery_steps: vec![ RecoveryStep::with_command( @@ -249,7 +251,8 @@ fn diagnose_image_pull_auth_failure(_gateway_name: &str) -> GatewayFailureDiagno "docker pull ghcr.io/nvidia/openshell/cluster:latest", ), RecoveryStep::new( - "If using a private registry, ensure OPENSHELL_REGISTRY_TOKEN is set", + "If using a private registry, set OPENSHELL_REGISTRY_USERNAME and OPENSHELL_REGISTRY_TOKEN \ + (or use --registry-username and --registry-token)", ), RecoveryStep::with_command("Check your Docker login", "docker login ghcr.io"), ], diff --git a/crates/openshell-bootstrap/src/image.rs b/crates/openshell-bootstrap/src/image.rs index 6e2b0f94..bcb13f68 100644 --- a/crates/openshell-bootstrap/src/image.rs +++ b/crates/openshell-bootstrap/src/image.rs @@ -42,42 +42,7 @@ pub const DEFAULT_GATEWAY_IMAGE: &str = "ghcr.io/nvidia/openshell/cluster"; /// /// GHCR accepts any non-empty username when authenticating with a PAT; /// `__token__` is a common convention for token-based OCI registry auth. -pub const DEFAULT_REGISTRY_USERNAME: &str = "__token__"; - -// --------------------------------------------------------------------------- -// XOR-obfuscated default registry token -// --------------------------------------------------------------------------- -// A read-only GHCR PAT is XOR-encoded so it doesn't appear as plaintext in -// the compiled binary. This is a lightweight deterrent against casual -// inspection — it is NOT a security boundary. The `--registry-token` flag -// (or `OPENSHELL_REGISTRY_TOKEN` env var) overrides this default. - -/// XOR key used to decode the default registry token. -const XOR_KEY: [u8; 32] = [ - 0x9c, 0x87, 0xc1, 0x0c, 0x00, 0xe2, 0x59, 0x14, 0x98, 0xb8, 0xa5, 0x45, 0x48, 0x40, 0x3e, 0x92, - 0x62, 0x41, 0xfe, 0x5e, 0xd4, 0x09, 0x23, 0xe6, 0x85, 0xa7, 0x94, 0xab, 0xb8, 0x15, 0xcd, 0x45, -]; - -/// XOR-encoded default GHCR registry token. 
-const DEFAULT_REGISTRY_TOKEN_ENC: [u8; 40] = [ - 0xfb, 0xef, 0xb1, 0x53, 0x44, 0xb4, 0x6d, 0x71, 0xd0, 0xf0, 0xd1, 0x15, 0x09, 0x39, 0x72, 0xd7, - 0x29, 0x36, 0xb7, 0x69, 0xe5, 0x64, 0x55, 0xaf, 0xee, 0xd2, 0xc0, 0xd2, 0xd1, 0x5b, 0x81, 0x0e, - 0xd1, 0xf5, 0xf2, 0x5a, 0x6b, 0xa3, 0x14, 0x46, -]; - -/// Decode an XOR-encoded byte slice using [`XOR_KEY`]. -fn xor_decode(encoded: &[u8]) -> String { - encoded - .iter() - .enumerate() - .map(|(i, b)| (b ^ XOR_KEY[i % XOR_KEY.len()]) as char) - .collect() -} - -/// Default GHCR registry token, decoded at runtime. -pub(crate) fn default_registry_token() -> String { - xor_decode(&DEFAULT_REGISTRY_TOKEN_ENC) -} +const DEFAULT_REGISTRY_USERNAME: &str = "__token__"; /// Parse an image reference into (repository, tag). /// @@ -150,18 +115,22 @@ pub async fn pull_image( Ok(()) } -/// Build [`DockerCredentials`] for ghcr.io from a registry token. +/// Build [`DockerCredentials`] for ghcr.io from explicit credentials. /// -/// When `token` is `None` or empty, falls back to the built-in default -/// token (XOR-decoded at runtime). Always returns `Some`. -#[allow(clippy::unnecessary_wraps)] -pub(crate) fn ghcr_credentials(token: Option<&str>) -> Option { - let effective_token = token - .filter(|t| !t.is_empty()) - .map_or_else(default_registry_token, ToString::to_string); +/// Returns `None` when `token` is `None` or empty — the default GHCR repos +/// are public and do not require authentication. When a token is provided, +/// uses the given `username` (falling back to `__token__` if `None`/empty). 
+pub(crate) fn ghcr_credentials( + username: Option<&str>, + token: Option<&str>, +) -> Option<DockerCredentials> { + let token = token.filter(|t| !t.is_empty())?; + let username = username + .filter(|u| !u.is_empty()) + .unwrap_or(DEFAULT_REGISTRY_USERNAME); Some(DockerCredentials { - username: Some(DEFAULT_REGISTRY_USERNAME.to_string()), - password: Some(effective_token), + username: Some(username.to_string()), + password: Some(token.to_string()), serveraddress: Some(DEFAULT_REGISTRY.to_string()), ..Default::default() }) @@ -182,6 +151,7 @@ pub(crate) fn ghcr_credentials(token: Option<&str>) -> Option<DockerCredentials> pub async fn pull_remote_image( remote: &Docker, image_ref: &str, + registry_username: Option<&str>, registry_token: Option<&str>, mut on_progress: impl FnMut(String) + Send + 'static, ) -> Result<()> { @@ -213,7 +183,7 @@ pub async fn pull_remote_image( ); on_progress(format!("[progress] Pulling {platform_str} image")); - let credentials = ghcr_credentials(registry_token); + let credentials = ghcr_credentials(registry_username, registry_token); let options = CreateImageOptions { from_image: Some(registry_image_base), @@ -351,8 +321,8 @@ mod tests { } #[test] - fn ghcr_credentials_with_token() { - let creds = ghcr_credentials(Some("ghp_test123")); + fn ghcr_credentials_with_token_default_username() { + let creds = ghcr_credentials(None, Some("ghp_test123")); assert!(creds.is_some()); let creds = creds.unwrap(); assert_eq!(creds.username.as_deref(), Some("__token__")); @@ -361,31 +331,21 @@ mod tests { } #[test] - fn ghcr_credentials_without_token_uses_default() { - // When no explicit token is provided, the built-in default is used. 
- let creds = ghcr_credentials(None).unwrap(); - assert_eq!(creds.username.as_deref(), Some("__token__")); + fn ghcr_credentials_with_custom_username() { + let creds = ghcr_credentials(Some("myuser"), Some("ghp_test123")); + assert!(creds.is_some()); + let creds = creds.unwrap(); + assert_eq!(creds.username.as_deref(), Some("myuser")); + assert_eq!(creds.password.as_deref(), Some("ghp_test123")); assert_eq!(creds.serveraddress.as_deref(), Some("ghcr.io")); - // The password should be the decoded default token (non-empty). - assert!(creds.password.is_some()); - assert!(!creds.password.as_ref().unwrap().is_empty()); - - // Same for empty string. - let creds2 = ghcr_credentials(Some("")).unwrap(); - assert_eq!(creds2.password, creds.password); } #[test] - fn xor_decode_default_token() { - let token = default_registry_token(); - assert!( - !token.is_empty(), - "default token should decode to non-empty" - ); - assert!( - token.chars().all(|c| c.is_ascii_graphic()), - "default token should be printable ASCII" - ); + fn ghcr_credentials_without_token_returns_none() { + // No token means unauthenticated (public repos). + assert!(ghcr_credentials(None, None).is_none()); + assert!(ghcr_credentials(None, Some("")).is_none()); + assert!(ghcr_credentials(Some("myuser"), None).is_none()); } #[test] diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index e5f81dc5..8bcb60fd 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -102,9 +102,14 @@ pub struct DeployOptions { /// Disable gateway authentication (mTLS client certificate requirement). /// Ignored when `disable_tls` is true. pub disable_gateway_auth: bool, + /// Registry authentication username. Defaults to `__token__` when a + /// `registry_token` is provided but no username is set. Only needed + /// for private registries — public GHCR repos pull without auth. + pub registry_username: Option<String>, /// Registry authentication token (e.g. 
a GitHub PAT with `read:packages` - /// scope) used to pull images from ghcr.io both during the initial - /// bootstrap pull and inside the k3s cluster at runtime. + /// scope) used to pull images from the registry both during the initial + /// bootstrap pull and inside the k3s cluster at runtime. Only needed + /// for private registries. pub registry_token: Option<String>, /// Enable NVIDIA GPU passthrough. When true, the Docker container is /// created with GPU device requests (`--gpus all`) and the NVIDIA @@ -126,6 +131,7 @@ impl DeployOptions { gateway_host: None, disable_tls: false, disable_gateway_auth: false, + registry_username: None, registry_token: None, gpu: false, recreate: false, @@ -167,7 +173,14 @@ impl DeployOptions { self } - /// Set the registry authentication token for pulling images from ghcr.io. + /// Set the registry authentication username. + #[must_use] + pub fn with_registry_username(mut self, username: impl Into<String>) -> Self { + self.registry_username = Some(username.into()); + self + } + + /// Set the registry authentication token for pulling images. 
#[must_use] pub fn with_registry_token(mut self, token: impl Into<String>) -> Self { self.registry_token = Some(token.into()); @@ -247,6 +260,7 @@ where let gateway_host = options.gateway_host; let disable_tls = options.disable_tls; let disable_gateway_auth = options.disable_gateway_auth; + let registry_username = options.registry_username; let registry_token = options.registry_token; let gpu = options.gpu; let recreate = options.recreate; @@ -302,6 +316,7 @@ where image::pull_remote_image( &target_docker, &image_ref, + registry_username.as_deref(), registry_token.as_deref(), progress_cb, ) @@ -309,7 +324,13 @@ where } else { // Local deployment: ensure image exists (pull if needed) log("[status] Downloading gateway".to_string()); - ensure_image(&target_docker, &image_ref, registry_token.as_deref()).await?; + ensure_image( + &target_docker, + &image_ref, + registry_username.as_deref(), + registry_token.as_deref(), + ) + .await?; } // All subsequent operations use the target Docker (remote or local) @@ -388,6 +409,7 @@ where port, disable_tls, disable_gateway_auth, + registry_username.as_deref(), registry_token.as_deref(), gpu, ) @@ -537,10 +559,14 @@ pub async fn extract_and_store_pki( Ok(()) } -pub async fn ensure_gateway_image(version: &str, registry_token: Option<&str>) -> Result<String> { +pub async fn ensure_gateway_image( + version: &str, + registry_username: Option<&str>, + registry_token: Option<&str>, +) -> Result<String> { let docker = Docker::connect_with_local_defaults().into_diagnostic()?; let image_ref = format!("{}:{version}", image::DEFAULT_GATEWAY_IMAGE); - ensure_image(&docker, &image_ref, registry_token).await?; + ensure_image(&docker, &image_ref, registry_username, registry_token).await?; Ok(image_ref) } diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index 294995f1..eb8f93a3 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -154,9 +154,15 @@ pub async fn run_bootstrap( } options 
= options.with_remote(remote_opts); } - // Read registry token from environment for the auto-bootstrap path. - // The explicit `--registry-token` flag is only on `gateway start`; - // when bootstrapping via `sandbox create`, the env var is the mechanism. + // Read registry credentials from environment for the auto-bootstrap path. + // The explicit `--registry-username` / `--registry-token` flags are only + // on `gateway start`; when bootstrapping via `sandbox create`, the env + // vars are the mechanism. + if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME") + && !username.trim().is_empty() + { + options = options.with_registry_username(username); + } if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN") && !token.trim().is_empty() { diff --git a/crates/openshell-cli/src/doctor_llm_prompt.md b/crates/openshell-cli/src/doctor_llm_prompt.md index 3319d12f..4d4a6b64 100644 --- a/crates/openshell-cli/src/doctor_llm_prompt.md +++ b/crates/openshell-cli/src/doctor_llm_prompt.md @@ -147,7 +147,7 @@ openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-op Common issues: - **Replicas 0/0**: The StatefulSet has been scaled to zero — no pods are running. This can happen after a failed deploy, manual scale-down, or Helm values misconfiguration. Fix: `openshell doctor exec -- kubectl -n openshell scale statefulset openshell --replicas=1` -- **ImagePullBackOff**: The component image failed to pull. In `internal` mode, verify internal registry readiness and pushed image tags (Step 5). In `external` mode, check `/etc/rancher/k3s/registries.yaml` credentials/endpoints and DNS (Step 8). Default external registry is `ghcr.io/nvidia/openshell/`. Ensure a valid `--registry-token` (or `OPENSHELL_REGISTRY_TOKEN`) was provided during deploy. +- **ImagePullBackOff**: The component image failed to pull. In `internal` mode, verify internal registry readiness and pushed image tags (Step 5). 
In `external` mode, check `/etc/rancher/k3s/registries.yaml` credentials/endpoints and DNS (Step 8). Default external registry is `ghcr.io/nvidia/openshell/` (public, no auth required). If using a private registry, ensure `--registry-username` and `--registry-token` (or `OPENSHELL_REGISTRY_USERNAME`/`OPENSHELL_REGISTRY_TOKEN`) were provided during deploy. - **CrashLoopBackOff**: The server is crashing. Check pod logs for the actual error. - **Pending**: Insufficient resources or scheduling constraints. diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 8995c3df..84a323b5 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -763,12 +763,21 @@ enum GatewayCommands { #[arg(long)] disable_gateway_auth: bool, - /// Authentication token for pulling container images from ghcr.io. + /// Username for authenticating with the container image registry. /// - /// A GitHub personal access token (PAT) with `read:packages` scope. - /// Used to pull the cluster bootstrap image and passed into the k3s - /// cluster so it can pull server, sandbox, and community images at - /// runtime. + /// Defaults to `__token__` when `--registry-token` is set (the + /// standard convention for GHCR PAT-based auth). Only needed for + /// private registries — public GHCR repos pull without auth. + #[arg(long, env = "OPENSHELL_REGISTRY_USERNAME")] + registry_username: Option<String>, + + /// Authentication token for pulling container images from the registry. + /// + /// For GHCR, this is a GitHub personal access token (PAT) with + /// `read:packages` scope. Only needed for private registries — + /// public GHCR repos pull without auth. Used to pull the cluster + /// bootstrap image and passed into the k3s cluster so it can pull + /// server, sandbox, and community images at runtime. 
#[arg(long, env = "OPENSHELL_REGISTRY_TOKEN")] registry_token: Option<String>, @@ -1438,6 +1447,7 @@ async fn main() -> Result<()> { recreate, plaintext, disable_gateway_auth, + registry_username, registry_token, gpu, } => { @@ -1450,6 +1460,7 @@ async fn main() -> Result<()> { recreate, plaintext, disable_gateway_auth, + registry_username.as_deref(), registry_token.as_deref(), gpu, ) diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 22123c1d..052a7de2 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1317,6 +1317,7 @@ pub async fn gateway_admin_deploy( recreate: bool, disable_tls: bool, disable_gateway_auth: bool, + registry_username: Option<&str>, registry_token: Option<&str>, gpu: bool, ) -> Result<()> { @@ -1390,6 +1391,9 @@ pub async fn gateway_admin_deploy( if let Some(host) = gateway_host { options = options.with_gateway_host(host); } + if let Some(username) = registry_username { + options = options.with_registry_username(username); + } if let Some(token) = registry_token { options = options.with_registry_token(token); } diff --git a/docs/sandboxes/manage-gateways.md b/docs/sandboxes/manage-gateways.md index 07449d64..2f3dba7a 100644 --- a/docs/sandboxes/manage-gateways.md +++ b/docs/sandboxes/manage-gateways.md @@ -164,6 +164,16 @@ $ openshell gateway info $ openshell gateway info --name my-remote-cluster ``` +## Advanced Start Options + +| Flag | Purpose | +|---|---| +| `--gpu` | Enable NVIDIA GPU passthrough. Requires NVIDIA drivers and the Container Toolkit on the host. | +| `--plaintext` | Listen on HTTP instead of mTLS. Use behind a TLS-terminating reverse proxy. | +| `--disable-gateway-auth` | Skip mTLS client certificate checks. Use when a reverse proxy cannot forward client certs. | +| `--registry-username` | Username for registry authentication. Defaults to `__token__` when `--registry-token` is set. Only needed for private registries. 
Also configurable with `OPENSHELL_REGISTRY_USERNAME`. | +| `--registry-token` | Authentication token for pulling container images. For GHCR, a GitHub PAT with `read:packages` scope. Only needed for private registries. Also configurable with `OPENSHELL_REGISTRY_TOKEN`. | + ## Stop and Destroy Stop a gateway while preserving its state for later restart: From 66e1f58560b96ba9e3c29697d02bfd894047f762 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 15 Mar 2026 16:53:50 -0700 Subject: [PATCH 2/2] install script --- README.md | 19 +-- install.sh | 427 ++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 378 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index b30b8134..5e531987 100644 --- a/README.md +++ b/README.md @@ -21,25 +21,10 @@ Want to run on cloud compute? [Launch on Brev](https://brev.nvidia.com/launchabl ### Install -**Binary (recommended — requires [GitHub CLI](https://cli.github.com)):** +**Binary (recommended):** ```bash -sh -c 'ARCH=$(uname -m); OS=$(uname -s); \ - case "${OS}-${ARCH}" in \ - Linux-x86_64) ASSET="openshell-x86_64-unknown-linux-musl.tar.gz" ;; \ - Linux-aarch64) ASSET="openshell-aarch64-unknown-linux-musl.tar.gz" ;; \ - Darwin-arm64) ASSET="openshell-aarch64-apple-darwin.tar.gz" ;; \ - *) echo "Unsupported platform: ${OS}-${ARCH}" >&2; exit 1 ;; \ - esac; \ - gh release download devel --repo NVIDIA/OpenShell --pattern "${ASSET}" -O - \ - | tar xz \ - && sudo install -m 755 openshell /usr/local/bin/openshell' -``` - -Or use the install script from the repository: - -```bash -./install.sh +curl -LsSf https://raw.githubusercontent.com/NVIDIA/OpenShell/main/install.sh | sh ``` **From PyPI (requires [uv](https://docs.astral.sh/uv/)):** diff --git a/install.sh b/install.sh index 4a778a29..d945cde7 100755 --- a/install.sh +++ b/install.sh @@ -4,31 +4,122 @@ # # Install the OpenShell CLI binary. 
# -# Requires the GitHub CLI (gh) to be installed and authenticated, since this -# repository is internal and public HTTP download links are not available. -# # Usage: +# curl -LsSf https://raw.githubusercontent.com/NVIDIA/OpenShell/main/install.sh | sh +# +# Or run directly: # ./install.sh # # Environment variables: -# OPENSHELL_VERSION - Release tag to install (default: "devel") -# OPENSHELL_INSTALL_DIR - Directory to install into (default: /usr/local/bin) +# OPENSHELL_VERSION - Release tag to install (default: latest tagged release) +# OPENSHELL_INSTALL_DIR - Directory to install into (default: ~/.local/bin) +# +# CLI flags: +# --help - Print usage information +# --no-modify-path - Skip PATH modification in shell profiles # set -eu +APP_NAME="openshell" REPO="NVIDIA/OpenShell" -VERSION="${OPENSHELL_VERSION:-devel}" -INSTALL_DIR="${OPENSHELL_INSTALL_DIR:-/usr/local/bin}" +GITHUB_URL="https://github.com/${REPO}" +NO_MODIFY_PATH=0 + +# --------------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------------- info() { - echo "openshell: $*" >&2 + printf '%s: %s\n' "$APP_NAME" "$*" >&2 +} + +warn() { + printf '%s: warning: %s\n' "$APP_NAME" "$*" >&2 } error() { - echo "openshell: error: $*" >&2 + printf '%s: error: %s\n' "$APP_NAME" "$*" >&2 exit 1 } +# --------------------------------------------------------------------------- +# Usage +# --------------------------------------------------------------------------- + +usage() { + cat </dev/null 2>&1 +} + +check_downloader() { + if has_cmd curl; then + return 0 + elif has_cmd wget; then + return 0 + else + error "either 'curl' or 'wget' is required to download files" + fi +} + +# Download a URL to a file. Outputs nothing on success. 
+download() { + _url="$1" + _output="$2" + + if has_cmd curl; then + curl -fLsS --retry 3 -o "$_output" "$_url" + elif has_cmd wget; then + wget -q --tries=3 -O "$_output" "$_url" + fi +} + +# Follow a URL and print the final resolved URL (for detecting redirect targets). +resolve_redirect() { + _url="$1" + + if has_cmd curl; then + curl -fLsS -o /dev/null -w '%{url_effective}' "$_url" + elif has_cmd wget; then + # wget --spider follows redirects and prints the final URL + wget --spider -q --max-redirect=10 "$_url" 2>&1 | grep -oP 'Location: \K\S+' | tail -1 + fi +} + +# --------------------------------------------------------------------------- +# Platform detection +# --------------------------------------------------------------------------- + get_os() { case "$(uname -s)" in Darwin) echo "apple-darwin" ;; @@ -39,84 +130,318 @@ get_os() { get_arch() { case "$(uname -m)" in - x86_64|amd64) echo "x86_64" ;; + x86_64|amd64) echo "x86_64" ;; aarch64|arm64) echo "aarch64" ;; *) error "unsupported architecture: $(uname -m)" ;; esac } get_target() { - arch="$(get_arch)" - os="$(get_os)" - target="${arch}-${os}" + _arch="$(get_arch)" + _os="$(get_os)" + _target="${_arch}-${_os}" # Only these targets have published binaries. - case "$target" in + case "$_target" in x86_64-unknown-linux-musl|aarch64-unknown-linux-musl|aarch64-apple-darwin) ;; x86_64-apple-darwin) error "macOS x86_64 is not supported; use Apple Silicon (aarch64) or Rosetta 2" ;; - *) error "no prebuilt binary for $target" ;; + *) error "no prebuilt binary for $_target" ;; esac - echo "$target" + echo "$_target" } +# --------------------------------------------------------------------------- +# Version resolution +# --------------------------------------------------------------------------- + +resolve_version() { + if [ -n "${OPENSHELL_VERSION:-}" ]; then + echo "$OPENSHELL_VERSION" + return 0 + fi + + # Resolve "latest" by following the GitHub releases/latest redirect. 
+ # GitHub redirects /releases/latest -> /releases/tag/<tag> + info "resolving latest version..." + _latest_url="${GITHUB_URL}/releases/latest" + _resolved="$(resolve_redirect "$_latest_url")" || error "failed to resolve latest release from ${_latest_url}" + + # Extract the tag from the resolved URL: .../releases/tag/v0.0.4 -> v0.0.4 + _version="${_resolved##*/}" + + if [ -z "$_version" ] || [ "$_version" = "latest" ]; then + error "could not determine latest release version (resolved URL: ${_resolved})" + fi + + echo "$_version" +} + +# --------------------------------------------------------------------------- +# Checksum verification +# --------------------------------------------------------------------------- + verify_checksum() { - archive="$1" checksums="$2" filename="$3" - expected="$(grep "$filename" "$checksums" | awk '{print $1}')" + _archive="$1" + _checksums="$2" + _filename="$3" - if [ -z "$expected" ]; then - info "warning: no checksum found for $filename, skipping verification" + _expected="$(grep "$_filename" "$_checksums" | awk '{print $1}')" + + if [ -z "$_expected" ]; then + warn "no checksum found for $_filename, skipping verification" return 0 fi - # Prefer shasum (ships with macOS and most Linux); the macOS /sbin/sha256sum - # does not support -c / stdin check mode. 
- if command -v shasum >/dev/null 2>&1; then - echo "$expected $archive" | shasum -a 256 -c --quiet 2>/dev/null - elif command -v sha256sum >/dev/null 2>&1; then - echo "$expected $archive" | sha256sum -c --quiet 2>/dev/null + if has_cmd shasum; then + echo "$_expected $_archive" | shasum -a 256 -c --quiet 2>/dev/null + elif has_cmd sha256sum; then + echo "$_expected $_archive" | sha256sum -c --quiet 2>/dev/null else - info "warning: sha256sum/shasum not found, skipping checksum verification" + warn "sha256sum/shasum not found, skipping checksum verification" return 0 fi } +# --------------------------------------------------------------------------- +# Install location and PATH management +# --------------------------------------------------------------------------- + +get_home() { + if [ -n "${HOME:-}" ]; then + echo "$HOME" + elif [ -n "${USER:-}" ]; then + getent passwd "$USER" | cut -d: -f6 + else + getent passwd "$(id -un)" | cut -d: -f6 + fi +} + +get_default_install_dir() { + if [ -n "${XDG_BIN_HOME:-}" ]; then + echo "$XDG_BIN_HOME" + else + _home="$(get_home)" + echo "${_home}/.local/bin" + fi +} + +# Check if a directory is already on PATH. +is_on_path() { + _dir="$1" + case ":${PATH}:" in + *":${_dir}:"*) return 0 ;; + *) return 1 ;; + esac +} + +# Write a small env script that conditionally prepends the install dir to PATH. +write_env_script_sh() { + _install_dir_expr="$1" + _env_script="$2" + + cat < "$_env_script" +#!/bin/sh +# Add OpenShell to PATH if not already present +case ":\${PATH}:" in + *:"${_install_dir_expr}":*) + ;; + *) + export PATH="${_install_dir_expr}:\$PATH" + ;; +esac +ENVEOF +} + +write_env_script_fish() { + _install_dir_expr="$1" + _env_script="$2" + + cat < "$_env_script" +# Add OpenShell to PATH if not already present +if not contains "${_install_dir_expr}" \$PATH + set -gx PATH "${_install_dir_expr}" \$PATH +end +ENVEOF +} + +# Add a `. /path/to/env` line to a shell rc file if not already present. 
+add_source_line() { + _env_script_path="$1" + _rcfile="$2" + _shell_type="$3" + + if [ "$_shell_type" = "fish" ]; then + _line="source \"${_env_script_path}\"" + else + _line=". \"${_env_script_path}\"" + fi + + # Check if line already exists + if [ -f "$_rcfile" ] && grep -qF "$_line" "$_rcfile" 2>/dev/null; then + return 0 + fi + + # Append with a leading newline in case the file doesn't end with one + printf '\n%s\n' "$_line" >> "$_rcfile" + return 1 +} + +# Set up PATH modification in common shell rc files. +setup_path() { + _install_dir="$1" + _home="$(get_home)" + _env_script="${_install_dir}/env" + _fish_env_script="${_install_dir}/env.fish" + _needs_source=0 + + # Replace $HOME in the expression for late-bound references in rc files + if [ -n "${HOME:-}" ]; then + # shellcheck disable=SC2016 + _install_dir_expr='$HOME'"${_install_dir#"$_home"}" + else + _install_dir_expr="$_install_dir" + fi + + # Write the env scripts + write_env_script_sh "$_install_dir_expr" "$_env_script" + write_env_script_fish "$_install_dir_expr" "$_fish_env_script" + + # POSIX shells: .profile, .bashrc, .bash_profile, .zshrc, .zshenv + for _rcfile_rel in .profile .bashrc .bash_profile .zshrc .zshenv; do + _rcdir="$_home" + # zsh respects ZDOTDIR + case "$_rcfile_rel" in + .zsh*) _rcdir="${ZDOTDIR:-$_home}" ;; + esac + _rcfile="${_rcdir}/${_rcfile_rel}" + if [ -f "$_rcfile" ]; then + if ! add_source_line "$_env_script" "$_rcfile" "sh"; then + _needs_source=1 + fi + fi + done + + # If none of the above existed, create .profile + if [ "$_needs_source" = "0" ]; then + _found_any=0 + for _rcfile_rel in .profile .bashrc .bash_profile .zshrc .zshenv; do + if [ -f "${_home}/${_rcfile_rel}" ]; then + _found_any=1 + break + fi + done + if [ "$_found_any" = "0" ]; then + if ! 
add_source_line "$_env_script" "${_home}/.profile" "sh"; then + _needs_source=1 + fi + fi + fi + + # Fish shell + _fish_conf_dir="${_home}/.config/fish/conf.d" + if [ -d "${_home}/.config/fish" ]; then + mkdir -p "$_fish_conf_dir" + add_source_line "$_fish_env_script" "${_fish_conf_dir}/${APP_NAME}.env.fish" "fish" || true + fi + + # GitHub Actions: write to GITHUB_PATH for CI environments + if [ -n "${GITHUB_PATH:-}" ]; then + echo "$_install_dir" >> "$GITHUB_PATH" + fi + + if [ "$_needs_source" = "1" ] || ! is_on_path "$_install_dir"; then + echo "" + info "to add ${APP_NAME} to your PATH, restart your shell or run:" + info "" + info " source \"${_env_script}\" (sh, bash, zsh)" + if [ -d "${_home}/.config/fish" ]; then + info " source \"${_fish_env_script}\" (fish)" + fi + fi +} + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + main() { - command -v gh >/dev/null 2>&1 || error "the GitHub CLI (gh) is required; install it from https://cli.github.com" + # Parse CLI flags + for arg in "$@"; do + case "$arg" in + --help) + usage + exit 0 + ;; + --no-modify-path) + NO_MODIFY_PATH=1 + ;; + *) + error "unknown option: $arg" + ;; + esac + done + + check_downloader + + _version="$(resolve_version)" + _target="$(get_target)" + _filename="${APP_NAME}-${_target}.tar.gz" + _download_url="${GITHUB_URL}/releases/download/${_version}/${_filename}" + _checksums_url="${GITHUB_URL}/releases/download/${_version}/${APP_NAME}-checksums-sha256.txt" + + # Determine install directory + _using_default_dir=0 + if [ -n "${OPENSHELL_INSTALL_DIR:-}" ]; then + _install_dir="$OPENSHELL_INSTALL_DIR" + else + _install_dir="$(get_default_install_dir)" + _using_default_dir=1 + fi - target="$(get_target)" - filename="openshell-${target}.tar.gz" + info "downloading ${APP_NAME} ${_version} (${_target})..." 
- tmpdir="$(mktemp -d)" - trap 'rm -rf "$tmpdir"' EXIT + _tmpdir="$(mktemp -d)" + trap 'rm -rf "$_tmpdir"' EXIT - info "downloading ${filename} (${VERSION})..." - gh release download "${VERSION}" \ - --repo "${REPO}" \ - --pattern "${filename}" \ - --output "${tmpdir}/${filename}" + if ! download "$_download_url" "${_tmpdir}/${_filename}"; then + error "failed to download ${_download_url}" + fi + # Verify checksum info "verifying checksum..." - gh release download "${VERSION}" \ - --repo "${REPO}" \ - --pattern "openshell-checksums-sha256.txt" \ - --output "${tmpdir}/checksums.txt" - if ! verify_checksum "${tmpdir}/${filename}" "${tmpdir}/checksums.txt" "$filename"; then - error "checksum verification failed" + if download "$_checksums_url" "${_tmpdir}/checksums.txt"; then + if ! verify_checksum "${_tmpdir}/${_filename}" "${_tmpdir}/checksums.txt" "$_filename"; then + error "checksum verification failed for ${_filename}" + fi + else + warn "could not download checksums file, skipping verification" fi + # Extract info "extracting..." - tar -xzf "${tmpdir}/${filename}" -C "${tmpdir}" + tar -xzf "${_tmpdir}/${_filename}" -C "${_tmpdir}" - info "installing to ${INSTALL_DIR}/openshell..." 
- if [ -w "$INSTALL_DIR" ]; then - install -m 755 "${tmpdir}/openshell" "${INSTALL_DIR}/openshell" + # Install + mkdir -p "$_install_dir" 2>/dev/null || true + + if [ -w "$_install_dir" ] || mkdir -p "$_install_dir" 2>/dev/null; then + install -m 755 "${_tmpdir}/${APP_NAME}" "${_install_dir}/${APP_NAME}" else - info "sudo access is required to install to ${INSTALL_DIR}" - sudo install -m 755 "${tmpdir}/openshell" "${INSTALL_DIR}/openshell" + info "elevated permissions required to install to ${_install_dir}" + sudo mkdir -p "$_install_dir" + sudo install -m 755 "${_tmpdir}/${APP_NAME}" "${_install_dir}/${APP_NAME}" fi - info "installed openshell $(${INSTALL_DIR}/openshell --version 2>/dev/null || echo "${VERSION}") to ${INSTALL_DIR}/openshell" + _installed_version="$("${_install_dir}/${APP_NAME}" --version 2>/dev/null || echo "${_version}")" + info "installed ${APP_NAME} ${_installed_version} to ${_install_dir}/${APP_NAME}" + + # Set up PATH for default install location + if [ "$_using_default_dir" = "1" ] && [ "$NO_MODIFY_PATH" = "0" ]; then + if ! is_on_path "$_install_dir"; then + setup_path "$_install_dir" + fi + fi } -main +main "$@"