From ebdc1f6380318be5aa10f55667c1fcd7c95e2170 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 18 Mar 2026 21:49:08 -0700 Subject: [PATCH 1/3] fix(bootstrap): surface diagnostics for K8s namespace not ready failures The 'K8s namespace not ready' error had three gaps preventing diagnostic information from reaching users: 1. The non-interactive (CI/piped) code path used bare error propagation with no diagnosis at all. 2. The interactive path's pattern matcher returned None for the common timeout case, and the generic_failure_diagnosis fallback existed but was never called. 3. Container logs were never passed to the diagnosis engine, so patterns only visible in logs (node pressure, corrupted state, etc.) could not match. Fix all three by fetching container logs at the CLI error-handling site, passing them to diagnose_failure, and falling back to generic_failure_diagnosis when no specific pattern matches. Also add container logs to the two wait_for_namespace error paths that were missing them (timeout and exec-error-on-final-attempt), and update the generic diagnosis to suggest 'openshell doctor' commands. --- crates/openshell-bootstrap/src/errors.rs | 12 ++++-- crates/openshell-bootstrap/src/lib.rs | 24 ++++++++++- crates/openshell-cli/src/run.rs | 52 ++++++++++++++++++++---- 3 files changed, 73 insertions(+), 15 deletions(-) diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 1511b362..6d9c229c 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -449,16 +449,20 @@ pub fn generic_failure_diagnosis(gateway_name: &str) -> GatewayFailureDiagnosis summary: "Gateway failed to start".to_string(), explanation: "The gateway encountered an unexpected error during startup.".to_string(), recovery_steps: vec![ + RecoveryStep::with_command( + "Check container logs for details", + format!("openshell doctor logs --name {gateway_name}"), + ), + RecoveryStep::with_command( + "Run diagnostics", + format!("openshell doctor check --name {gateway_name}"), + ), RecoveryStep::with_command( "Try destroying and recreating the gateway", format!( "openshell gateway destroy --name {gateway_name} && openshell gateway start" ), ), - RecoveryStep::with_command( - "Check container logs for details", - format!("docker logs openshell-cluster-{gateway_name}"), - ), RecoveryStep::new( "If the issue persists, report it at https://github.com/nvidia/openshell/issues", ), diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index b6423aae..48f8d74d 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -638,6 +638,21 @@ pub async fn gateway_container_logs( Ok(()) } +/// Fetch the last `n` lines of container logs for a local gateway as a +/// `String`. This is a convenience wrapper for diagnostic call sites (e.g. +/// failure diagnosis in the CLI) that do not hold a Docker client handle. +/// +/// Returns an empty string on any Docker/connection error so callers don't +/// need to worry about error handling. +pub async fn fetch_gateway_logs(name: &str, n: usize) -> String { + let docker = match Docker::connect_with_local_defaults() { + Ok(d) => d, + Err(_) => return String::new(), + }; + let container = container_name(name); + fetch_recent_logs(&docker, &container, n).await +} + fn default_gateway_image_ref() -> String { if let Ok(image) = std::env::var("OPENSHELL_CLUSTER_IMAGE") && !image.trim().is_empty() @@ -984,7 +999,11 @@ async fn wait_for_namespace( } if attempt + 1 == attempts { - return Err(err).wrap_err("K8s namespace not ready"); + let logs = fetch_recent_logs(docker, container_name, 40).await; + return Err(miette::miette!( + "exec failed on final attempt while waiting for namespace '{namespace}': {err}\n{logs}" + )) + .wrap_err("K8s namespace not ready"); } tokio::time::sleep(backoff).await; backoff = std::cmp::min(backoff.saturating_mul(2), max_backoff); @@ -997,8 +1016,9 @@ async fn wait_for_namespace( } if attempt + 1 == attempts { + let logs = fetch_recent_logs(docker, container_name, 40).await; return Err(miette::miette!( - "timed out waiting for namespace '{namespace}' to exist: {output}" + "timed out waiting for namespace '{namespace}' to exist: {output}\n{logs}" )) .wrap_err("K8s namespace not ready"); } diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 37f11fcb..2f9dd2f7 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1248,19 +1248,27 @@ pub(crate) async fn deploy_gateway_with_panel( "x".red().bold(), "Gateway failed:".red().bold(), ); + // Fetch container logs for pattern-based diagnosis + let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await; + let logs_opt = if container_logs.is_empty() { + None + } else { + Some(container_logs.as_str()) + }; // Try to diagnose the failure and provide guidance let err_str = format!("{err:?}"); - if let Some(diagnosis) = - openshell_bootstrap::errors::diagnose_failure(name, &err_str, None) - { - print_failure_diagnosis(&diagnosis); - } + let diagnosis = + openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt) + .unwrap_or_else(|| { + openshell_bootstrap::errors::generic_failure_diagnosis(name) + }); + print_failure_diagnosis(&diagnosis); Err(err) } } } else { eprintln!("Deploying {location} gateway {name}..."); - let handle = openshell_bootstrap::deploy_gateway_with_logs(options, |line| { + let result = openshell_bootstrap::deploy_gateway_with_logs(options, |line| { if let Some(status) = line.strip_prefix("[status] ") { eprintln!(" {status}"); } else if line.strip_prefix("[progress] ").is_some() { @@ -1269,9 +1277,35 @@ pub(crate) async fn deploy_gateway_with_panel( eprintln!(" {line}"); } }) - .await?; - eprintln!("Gateway {name} ready."); - Ok(handle) + .await; + match result { + Ok(handle) => { + eprintln!("Gateway {name} ready."); + Ok(handle) + } + Err(err) => { + eprintln!( + "{} {} {name}", + "x".red().bold(), + "Gateway failed:".red().bold(), + ); + // Fetch container logs for pattern-based diagnosis + let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await; + let logs_opt = if container_logs.is_empty() { + None + } else { + Some(container_logs.as_str()) + }; + let err_str = format!("{err:?}"); + let diagnosis = + openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt) + .unwrap_or_else(|| { + openshell_bootstrap::errors::generic_failure_diagnosis(name) + }); + print_failure_diagnosis(&diagnosis); + Err(err) + } + } } } From b74da9d5d9a8be3bd4bd3967e09d428ed784c2d2 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 18 Mar 2026 21:53:47 -0700 Subject: [PATCH 2/3] fix(cli): correct 'openshell gateway status' to 'openshell status' in bootstrap message The auto-bootstrap banner told users to run 'openshell gateway status', but 'status' is a top-level command, not a gateway subcommand. Running the suggested command produced 'unrecognized subcommand' error. --- crates/openshell-cli/src/bootstrap.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index eb8f93a3..e976061f 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -139,7 +139,7 @@ pub async fn run_bootstrap( eprintln!(); eprintln!( " Manage it later with: {} or {}", - "openshell gateway status".bold(), + "openshell status".bold(), "openshell gateway stop".bold(), ); eprintln!(); From db7a2fb44401f5641d6f345545f7f580efe62628 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 18 Mar 2026 22:03:48 -0700 Subject: [PATCH 3/3] test(bootstrap): add tests for namespace-not-ready diagnostic fallback Cover the three key behaviors introduced by the diagnostic fix: - generic_failure_diagnosis suggests doctor logs/check commands - Plain namespace timeout returns None from diagnose_failure (confirming the generic fallback is necessary) - Container logs enable pattern matching for namespace errors that would otherwise go undiagnosed (node pressure, corrupted state, no route, network connectivity) - End-to-end fallback pattern mirrors the actual CLI unwrap_or_else chain --- crates/openshell-bootstrap/src/errors.rs | 192 +++++++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 6d9c229c..f316b953 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -654,4 +654,196 @@ mod tests { ); assert!(d.retryable); } + + // -- generic_failure_diagnosis tests -- + + #[test] + fn generic_diagnosis_suggests_doctor_logs() { + let d = generic_failure_diagnosis("my-gw"); + let commands: Vec = d + .recovery_steps + .iter() + .filter_map(|s| s.command.clone()) + .collect(); + assert!( + commands.iter().any(|c| c.contains("openshell doctor logs")), + "expected 'openshell doctor logs' in recovery commands, got: {commands:?}" + ); + } + + #[test] + fn generic_diagnosis_suggests_doctor_check() { + let d = generic_failure_diagnosis("my-gw"); + let commands: Vec = d + .recovery_steps + .iter() + .filter_map(|s| s.command.clone()) + .collect(); + assert!( + commands + .iter() + .any(|c| c.contains("openshell doctor check")), + "expected 'openshell doctor check' in recovery commands, got: {commands:?}" + ); + } + + #[test] + fn generic_diagnosis_includes_gateway_name() { + let d = generic_failure_diagnosis("custom-name"); + let all_text: String = d + .recovery_steps + .iter() + .filter_map(|s| s.command.clone()) + .collect::>() + .join(" "); + assert!( + all_text.contains("custom-name"), + "expected gateway name in recovery commands, got: {all_text}" + ); + } + + // -- fallback behavior tests -- + + #[test] + fn namespace_timeout_without_logs_returns_none() { + // This is the most common user-facing error: a plain timeout with only + // kubectl output. It must NOT match any specific pattern so the caller + // can fall back to generic_failure_diagnosis. + let diagnosis = diagnose_failure( + "test", + "K8s namespace not ready\n\nCaused by:\n \ + timed out waiting for namespace 'openshell' to exist: \ + error: the server doesn't have a resource type \"namespace\"", + None, + ); + assert!( + diagnosis.is_none(), + "plain namespace timeout should not match any specific pattern, got: {:?}", + diagnosis.map(|d| d.summary) + ); + } + + #[test] + fn namespace_timeout_with_pressure_logs_matches() { + // When container logs reveal node pressure, the diagnosis engine + // should detect it even though the error message itself is generic. + let diagnosis = diagnose_failure( + "test", + "K8s namespace not ready\n\nCaused by:\n \ + timed out waiting for namespace 'openshell' to exist: ", + Some("HEALTHCHECK_NODE_PRESSURE: DiskPressure"), + ); + assert!(diagnosis.is_some(), "expected node pressure diagnosis"); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("pressure"), + "expected pressure in summary, got: {}", + d.summary + ); + } + + #[test] + fn namespace_timeout_with_corrupted_state_logs_matches() { + // Container logs revealing RBAC corruption should be caught. + let diagnosis = diagnose_failure( + "test", + "K8s namespace not ready\n\nCaused by:\n \ + timed out waiting for namespace 'openshell' to exist: ", + Some( + "configmaps \"extension-apiserver-authentication\" is forbidden: \ + User cannot get resource", + ), + ); + assert!(diagnosis.is_some(), "expected corrupted state diagnosis"); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("Corrupted"), + "expected Corrupted in summary, got: {}", + d.summary + ); + } + + #[test] + fn namespace_timeout_with_no_route_logs_matches() { + let diagnosis = diagnose_failure( + "test", + "K8s namespace not ready", + Some("Error: no default route present before starting k3s"), + ); + assert!(diagnosis.is_some(), "expected networking diagnosis"); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("networking"), + "expected networking in summary, got: {}", + d.summary + ); + } + + #[test] + fn diagnose_failure_with_logs_uses_combined_text() { + // Verify that diagnose_failure combines error_message + container_logs + // for pattern matching. The pattern "connection refused" is in logs, + // not in the error message. + let diagnosis = diagnose_failure( + "test", + "K8s namespace not ready", + Some("dial tcp 127.0.0.1:6443: connect: connection refused"), + ); + assert!( + diagnosis.is_some(), + "expected diagnosis from container logs pattern" + ); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("Network") || d.summary.contains("connectivity"), + "expected network diagnosis, got: {}", + d.summary + ); + } + + // -- end-to-end fallback pattern (mirrors CLI code) -- + + #[test] + fn fallback_to_generic_produces_actionable_diagnosis() { + // This mirrors the actual CLI pattern: + // diagnose_failure(...).unwrap_or_else(|| generic_failure_diagnosis(name)) + // For a plain namespace timeout with no useful container logs, the + // specific matcher returns None and we must fall back to the generic + // diagnosis that suggests doctor commands. + let err_str = "K8s namespace not ready\n\nCaused by:\n \ + timed out waiting for namespace 'openshell' to exist: \ + error: the server doesn't have a resource type \"namespace\""; + let container_logs = Some("k3s is starting\nwaiting for kube-apiserver"); + + let diagnosis = diagnose_failure("my-gw", err_str, container_logs) + .unwrap_or_else(|| generic_failure_diagnosis("my-gw")); + + // Should have gotten the generic diagnosis (no specific pattern matched) + assert_eq!(diagnosis.summary, "Gateway failed to start"); + // Must contain actionable recovery steps + assert!( + !diagnosis.recovery_steps.is_empty(), + "generic diagnosis should have recovery steps" + ); + // Must mention doctor commands + let all_commands: String = diagnosis + .recovery_steps + .iter() + .filter_map(|s| s.command.as_ref()) + .cloned() + .collect::>() + .join("\n"); + assert!( + all_commands.contains("doctor logs"), + "should suggest 'doctor logs', got: {all_commands}" + ); + assert!( + all_commands.contains("doctor check"), + "should suggest 'doctor check', got: {all_commands}" + ); + assert!( + all_commands.contains("my-gw"), + "commands should include gateway name, got: {all_commands}" + ); + } }