diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index 3812f72b..1cb62b7b 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -91,6 +91,153 @@ pub fn normalize_arch(arch: &str) -> String { } } +/// Result of a successful Docker preflight check. +/// +/// Contains the validated Docker client and metadata about the daemon so +/// callers can reuse the connection without re-checking. +#[derive(Debug)] +pub struct DockerPreflight { + /// A Docker client that has been verified as connected and responsive. + pub docker: Docker, + /// Docker daemon version string (e.g., "28.1.1"). + pub version: Option, +} + +/// Well-known Docker socket paths to probe when the default fails. +/// +/// These cover common container runtimes on macOS and Linux: +/// - `/var/run/docker.sock` — default for Docker Desktop, `OrbStack`, Colima +/// - `$HOME/.colima/docker.sock` — Colima (older installs) +/// - `$HOME/.orbstack/run/docker.sock` — `OrbStack` (if symlink is missing) +const WELL_KNOWN_SOCKET_PATHS: &[&str] = &[ + "/var/run/docker.sock", + // Expanded at runtime via home_dir(): + // ~/.colima/docker.sock + // ~/.orbstack/run/docker.sock +]; + +/// Check that a Docker-compatible runtime is installed, running, and reachable. +/// +/// This is the primary preflight gate. It must be called before any gateway +/// deploy work begins. On failure it produces a user-friendly error with +/// actionable recovery steps instead of a raw bollard connection error. +pub async fn check_docker_available() -> Result { + // Step 1: Try to connect using bollard's default resolution + // (respects DOCKER_HOST, then falls back to /var/run/docker.sock). + let docker = match Docker::connect_with_local_defaults() { + Ok(d) => d, + Err(err) => { + return Err(docker_not_reachable_error( + &format!("{err}"), + "Failed to create Docker client", + )); + } + }; + + // Step 2: Ping the daemon to confirm it's responsive. + if let Err(err) = docker.ping().await { + return Err(docker_not_reachable_error( + &format!("{err}"), + "Docker socket exists but the daemon is not responding", + )); + } + + // Step 3: Query version info (best-effort — don't fail on this). + let version = match docker.version().await { + Ok(v) => v.version, + Err(_) => None, + }; + + Ok(DockerPreflight { docker, version }) +} + +/// Build a rich, user-friendly error when Docker is not reachable. +fn docker_not_reachable_error(raw_err: &str, summary: &str) -> miette::Report { + let docker_host = std::env::var("DOCKER_HOST").ok(); + let socket_exists = std::path::Path::new("/var/run/docker.sock").exists(); + + let mut hints: Vec = Vec::new(); + + if !socket_exists && docker_host.is_none() { + // No socket and no DOCKER_HOST — likely nothing is installed or started + hints.push( + "No Docker socket found at /var/run/docker.sock and DOCKER_HOST is not set." + .to_string(), + ); + hints.push( + "Install and start a Docker-compatible runtime. See the support matrix \ + in the OpenShell docs for tested configurations." + .to_string(), + ); + + // Check for alternative sockets that might exist + let alt_sockets = find_alternative_sockets(); + if !alt_sockets.is_empty() { + hints.push(format!( + "Found Docker-compatible socket(s) at alternative path(s):\n {}\n\n \ + Set DOCKER_HOST to use one, e.g.:\n\n \ + export DOCKER_HOST=unix://{}", + alt_sockets.join("\n "), + alt_sockets[0], + )); + } + } else if docker_host.is_some() { + // DOCKER_HOST is set but daemon didn't respond + let host_val = docker_host.unwrap(); + hints.push(format!( + "DOCKER_HOST is set to '{host_val}' but the Docker daemon is not responding." + )); + hints.push( + "Verify your Docker runtime is started and the DOCKER_HOST value is correct." + .to_string(), + ); + } else { + // Socket exists but daemon isn't responding + hints.push( + "Docker socket found at /var/run/docker.sock but the daemon is not responding." + .to_string(), + ); + hints.push("Start your Docker runtime and try again.".to_string()); + } + + hints.push("Verify Docker is working with: docker info".to_string()); + + let help_text = hints.join("\n\n"); + + miette::miette!(help = help_text, "{summary}.\n\n {raw_err}") +} + +/// Probe for Docker-compatible sockets at non-default locations. +fn find_alternative_sockets() -> Vec { + let mut found = Vec::new(); + + // Check well-known static paths + for path in WELL_KNOWN_SOCKET_PATHS { + if std::path::Path::new(path).exists() { + found.push(path.to_string()); + } + } + + // Check home-relative paths + if let Some(home) = home_dir() { + let home_sockets = [ + format!("{home}/.colima/docker.sock"), + format!("{home}/.orbstack/run/docker.sock"), + ]; + for path in &home_sockets { + if std::path::Path::new(path).exists() && !found.contains(path) { + found.push(path.clone()); + } + } + } + + found +} + +fn home_dir() -> Option { + std::env::var("HOME").ok() +} + /// Create an SSH Docker client from remote options. pub async fn create_ssh_docker_client(remote: &RemoteOptions) -> Result { // Ensure destination has ssh:// prefix @@ -981,4 +1128,74 @@ mod tests { }; assert_eq!(platform.platform_string(), "linux/arm64"); } + + #[test] + fn docker_not_reachable_error_no_socket_no_docker_host() { + // Simulate: no socket at default path, no DOCKER_HOST set. + // We can't guarantee /var/run/docker.sock state in CI, but we can + // verify the error message is well-formed and contains guidance. + let err = + docker_not_reachable_error("connection refused", "Failed to create Docker client"); + let msg = format!("{err:?}"); + assert!( + msg.contains("Failed to create Docker client"), + "should include the summary" + ); + assert!( + msg.contains("connection refused"), + "should include the raw error" + ); + // The message should always include the verification step + assert!( + msg.contains("docker info"), + "should suggest 'docker info' verification" + ); + } + + #[test] + fn docker_not_reachable_error_with_docker_host() { + // Simulate: DOCKER_HOST is set but daemon unresponsive. + // We set the env var temporarily (this is test-only). + let prev_docker_host = std::env::var("DOCKER_HOST").ok(); + // SAFETY: test-only, single-threaded test runner for this test + unsafe { + std::env::set_var("DOCKER_HOST", "unix:///tmp/fake-docker.sock"); + } + + let err = docker_not_reachable_error( + "daemon not responding", + "Docker socket exists but the daemon is not responding", + ); + let msg = format!("{err:?}"); + + // Restore env + // SAFETY: test-only, restoring previous state + unsafe { + match prev_docker_host { + Some(val) => std::env::set_var("DOCKER_HOST", val), + None => std::env::remove_var("DOCKER_HOST"), + } + } + + assert!( + msg.contains("DOCKER_HOST"), + "should mention DOCKER_HOST when it is set" + ); + assert!( + msg.contains("unix:///tmp/fake-docker.sock"), + "should show the current DOCKER_HOST value" + ); + } + + #[test] + fn find_alternative_sockets_returns_vec() { + // Verify the function runs without panic and returns a vec. + // Exact contents depend on the host system, so we just check the type. + let sockets = find_alternative_sockets(); + // On any system, /var/run/docker.sock may or may not exist + assert!( + sockets.len() <= 10, + "should return a reasonable number of sockets" + ); + } } diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 39088ffb..14284a90 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -155,12 +155,16 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[ match_mode: MatchMode::Any, diagnose: diagnose_certificate_issue, }, - // Docker daemon not running + // Docker daemon not running or socket not found FailurePattern { matchers: &[ "Cannot connect to the Docker daemon", "docker daemon is not running", "Is the docker daemon running", + "Socket not found", + "No such file or directory", + "Failed to create Docker client", + "Docker socket exists but the daemon is not responding", ], match_mode: MatchMode::Any, diagnose: diagnose_docker_not_running, @@ -203,7 +207,7 @@ fn diagnose_no_default_route(_gateway_name: &str) -> GatewayFailureDiagnosis { "Stop any container holding the gateway port (default 8080), then retry", ), RecoveryStep::with_command("Prune unused Docker networks", "docker network prune -f"), - RecoveryStep::new("Restart Docker Desktop (if on Mac/Windows)"), + RecoveryStep::new("Restart your Docker runtime"), RecoveryStep::new("Then retry: openshell gateway start"), ], retryable: true, @@ -309,10 +313,7 @@ fn diagnose_oom_killed(_gateway_name: &str) -> GatewayFailureDiagnosis { The gateway requires at least 4GB of memory." .to_string(), recovery_steps: vec![ - RecoveryStep::new( - "Increase Docker memory allocation to at least 4GB \ - (Docker Desktop → Settings → Resources)", - ), + RecoveryStep::new("Increase Docker memory allocation to at least 4GB"), RecoveryStep::new("Close other memory-intensive applications"), RecoveryStep::new("Then retry: openshell gateway start"), ], @@ -335,10 +336,7 @@ fn diagnose_node_pressure(gateway_name: &str) -> GatewayFailureDiagnosis { "docker system prune -a --volumes", ), RecoveryStep::with_command("Check available memory on the host", "free -h"), - RecoveryStep::new( - "Increase Docker resource allocation \ - (Docker Desktop → Settings → Resources), or free resources on the host", - ), + RecoveryStep::new("Increase Docker resource allocation or free resources on the host"), RecoveryStep::with_command( "Destroy and recreate the gateway after freeing resources", format!("openshell gateway destroy {gateway_name} && openshell gateway start"), @@ -392,10 +390,16 @@ fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis { fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis { GatewayFailureDiagnosis { summary: "Docker is not running".to_string(), - explanation: "The Docker daemon is not running or not accessible.".to_string(), + explanation: "The Docker daemon is not running or not accessible. OpenShell requires \ + a Docker-compatible container runtime to manage gateway clusters." + .to_string(), recovery_steps: vec![ - RecoveryStep::new("Start Docker Desktop (Mac/Windows) or the Docker service (Linux)"), - RecoveryStep::with_command("Verify Docker is running", "docker info"), + RecoveryStep::new("Start your Docker runtime"), + RecoveryStep::with_command("Verify Docker is accessible", "docker info"), + RecoveryStep::new( + "If using a non-default Docker socket, set DOCKER_HOST:\n \ + export DOCKER_HOST=unix:///var/run/docker.sock", + ), RecoveryStep::new("Then retry: openshell gateway start"), ], retryable: true, @@ -558,6 +562,61 @@ mod tests { ); } + #[test] + fn test_diagnose_docker_not_running() { + let diagnosis = diagnose_failure("test", "Cannot connect to the Docker daemon", None); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!(d.summary.contains("Docker")); + assert!(d.retryable); + } + + #[test] + fn test_diagnose_docker_socket_not_found() { + let diagnosis = diagnose_failure("test", "Socket not found: /var/run/docker.sock", None); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!(d.summary.contains("Docker")); + assert!(d.retryable); + } + + #[test] + fn test_diagnose_docker_no_such_file() { + let diagnosis = diagnose_failure("test", "No such file or directory (os error 2)", None); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!(d.summary.contains("Docker")); + } + + #[test] + fn test_diagnose_docker_preflight_error() { + let diagnosis = diagnose_failure( + "test", + "Failed to create Docker client.\n\n connection error", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!(d.summary.contains("Docker")); + assert!(d.retryable); + } + + #[test] + fn test_diagnose_docker_recovery_mentions_docker_host() { + let diagnosis = diagnose_failure("test", "Cannot connect to the Docker daemon", None); + let d = diagnosis.unwrap(); + let steps_text: String = d + .recovery_steps + .iter() + .map(|s| s.description.clone()) + .collect::>() + .join(" "); + assert!( + steps_text.contains("DOCKER_HOST"), + "recovery steps should mention DOCKER_HOST" + ); + } + #[test] fn test_diagnose_dns_failure_from_namespace_timeout() { // When wait_for_namespace detects DNS failure, the error message itself diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index bf6599b4..b196ab72 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -44,7 +44,9 @@ use crate::runtime::{ }; pub use crate::constants::container_name; -pub use crate::docker::{ExistingGatewayInfo, create_ssh_docker_client}; +pub use crate::docker::{ + DockerPreflight, ExistingGatewayInfo, check_docker_available, create_ssh_docker_client, +}; pub use crate::metadata::{ GatewayMetadata, clear_active_gateway, extract_host_from_ssh_destination, get_gateway_metadata, list_gateways, load_active_gateway, load_gateway_metadata, load_last_sandbox, @@ -222,9 +224,11 @@ pub async fn check_existing_deployment( name: &str, remote: Option<&RemoteOptions>, ) -> Result> { - let docker = match remote { - Some(remote_opts) => create_ssh_docker_client(remote_opts).await?, - None => Docker::connect_with_local_defaults().into_diagnostic()?, + let docker = if let Some(remote_opts) = remote { + create_ssh_docker_client(remote_opts).await? + } else { + let preflight = check_docker_available().await?; + preflight.docker }; check_existing_gateway(&docker, name).await } @@ -258,16 +262,16 @@ where } }; - // Create Docker client based on deployment mode - let (target_docker, remote_opts) = match &options.remote { - Some(remote_opts) => { - let remote = create_ssh_docker_client(remote_opts).await?; - (remote, Some(remote_opts.clone())) - } - None => ( - Docker::connect_with_local_defaults().into_diagnostic()?, - None, - ), + // Create Docker client based on deployment mode. + // For local deploys, run a preflight check to fail fast with actionable + // guidance when Docker is not installed, not running, or unreachable. + let (target_docker, remote_opts) = if let Some(remote_opts) = &options.remote { + let remote = create_ssh_docker_client(remote_opts).await?; + (remote, Some(remote_opts.clone())) + } else { + log("[status] Checking Docker".to_string()); + let preflight = check_docker_available().await?; + (preflight.docker, None) }; // If an existing gateway is found, either tear it down (when recreate is diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 151bd166..c18e196c 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -281,6 +281,7 @@ const DOCTOR_HELP: &str = "\x1b[1mALIAS\x1b[0m dr \x1b[1mEXAMPLES\x1b[0m + $ openshell doctor check $ openshell doctor logs --lines 100 $ openshell doctor exec -- kubectl get pods -A $ openshell doctor llm.txt @@ -1014,6 +1015,17 @@ enum DoctorCommands { /// openshell doctor llm.txt | pbcopy #[command(name = "llm.txt", help_template = LEAF_HELP_TEMPLATE)] LlmTxt, + + /// Validate system prerequisites for running a gateway. + /// + /// Checks that a Docker-compatible runtime is installed, running, and + /// reachable. Reports version info and socket path. Use this to verify + /// your environment before running `openshell gateway start`. + /// + /// Examples: + /// openshell doctor check + #[command(help_template = LEAF_HELP_TEMPLATE)] + Check, } #[derive(Subcommand, Debug)] @@ -1539,6 +1551,9 @@ async fn main() -> Result<()> { DoctorCommands::LlmTxt => { run::doctor_llm()?; } + DoctorCommands::Check => { + run::doctor_check().await?; + } }, Some(Commands::Doctor { command: None }) => { Cli::command() diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index db033e45..13006393 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1695,6 +1695,43 @@ pub fn doctor_llm() -> Result<()> { Ok(()) } +/// Validate system prerequisites for running a gateway. +/// +/// Checks Docker connectivity and reports the result. Returns exit code 0 +/// if all checks pass, 1 otherwise. +pub async fn doctor_check() -> Result<()> { + use std::io::Write; + let mut stdout = std::io::stdout().lock(); + + writeln!(stdout, "Checking system prerequisites...\n").into_diagnostic()?; + + // --- Docker connectivity --- + write!(stdout, " Docker ............. ").into_diagnostic()?; + stdout.flush().into_diagnostic()?; + + match openshell_bootstrap::check_docker_available().await { + Ok(preflight) => { + let version_str = preflight.version.as_deref().unwrap_or("unknown"); + writeln!(stdout, "ok (version {version_str})").into_diagnostic()?; + + // --- DOCKER_HOST --- + write!(stdout, " DOCKER_HOST ........ ").into_diagnostic()?; + match std::env::var("DOCKER_HOST") { + Ok(val) => writeln!(stdout, "{val}").into_diagnostic()?, + Err(_) => writeln!(stdout, "(not set, using default socket)").into_diagnostic()?, + }; + + writeln!(stdout, "\nAll checks passed.").into_diagnostic()?; + Ok(()) + } + Err(err) => { + writeln!(stdout, "FAILED").into_diagnostic()?; + writeln!(stdout).into_diagnostic()?; + Err(err) + } + } +} + /// Shell-escape a single argument for safe inclusion in a `sh -c` string. fn shell_escape(s: &str) -> String { if s.is_empty() { diff --git a/docs/get-started/quickstart.md b/docs/get-started/quickstart.md index d3b07014..d6f2c84a 100644 --- a/docs/get-started/quickstart.md +++ b/docs/get-started/quickstart.md @@ -32,9 +32,11 @@ This page gets you from zero to a running, policy-enforced sandbox in two comman Before you begin, make sure you have: -- Python 3.12 or later -- [uv](https://docs.astral.sh/uv/) installed -- Docker Desktop running on your machine +- Python 3.12 or later. +- [uv](https://docs.astral.sh/uv/) installed. +- Docker Desktop running on your machine. + +For a complete list of requirements, refer to {doc}`../reference/support-matrix`. ## Install the OpenShell CLI @@ -42,29 +44,45 @@ Install the `openshell` package into a virtual environment. Activate your virtual environment: -```bash -uv venv && source .venv/bin/activate +```console +$ uv venv && source .venv/bin/activate ``` Install the CLI: -```bash -uv pip install openshell +```console +$ uv pip install openshell ``` -## Connect to a Remote Gateway (Optional) +:::{tip} +To find the CLI reference, run: +- `openshell --help` to see all available commands. +- `openshell --help` for detailed usage of any subcommand. + +For example: + +```console +$ openshell --help +$ openshell gateway --help +$ openshell sandbox create --help +``` +::: + +## Deploy a Gateway (Optional) -If you're running locally, skip this step. The OpenShell CLI creates a gateway automatically when you create your first sandbox. +Running `openshell sandbox create` without a gateway auto-bootstraps a local one. +To start the gateway explicitly or deploy to a remote host, choose the tab that matches your setup. :::::{tab-set} ::::{tab-item} Brev :::{note} -Deploy an OpenShell gateway on Brev by hitting **Deploy** on the [OpenShell Launchable](https://brev.nvidia.com/launchable/deploy/now?launchableID=env-3AaK9NmCzWp3pVyUDNNFBt805FT). +Deploy an OpenShell gateway on Brev by clicking **Deploy** on the [OpenShell Launchable](https://brev.nvidia.com/launchable/deploy/now?launchableID=env-3AaK9NmCzWp3pVyUDNNFBt805FT). ::: -After the instance is running, find the gateway URL in the Brev console under **Using Secure Links**. Copy the shareable URL for **port 8080** — this is the gateway endpoint. +After the instance starts running, find the gateway URL in the Brev console under **Using Secure Links**. +Copy the shareable URL for **port 8080**, which is the gateway endpoint. ```console $ openshell gateway add https://.brevlab.com @@ -94,6 +112,7 @@ After `openshell status` shows the gateway as healthy, all subsequent commands r ## Create Your First OpenShell Sandbox +Create a sandbox and launch an agent inside it. Choose the tab that matches your agent: ::::{tab-set} @@ -106,7 +125,10 @@ Run the following command to create a sandbox with Claude Code: $ openshell sandbox create -- claude ``` -The CLI prompts you to create a provider from local credentials — type `yes` to continue. If `ANTHROPIC_API_KEY` is set in your environment, it is picked up automatically. If not, you can configure it from inside the sandbox after it launches. +The CLI prompts you to create a provider from local credentials. +Type `yes` to continue. +If `ANTHROPIC_API_KEY` is set in your environment, the CLI picks it up automatically. +If not, you can configure it from inside the sandbox after it launches. ::: :::{tab-item} OpenCode @@ -117,7 +139,10 @@ Run the following command to create a sandbox with OpenCode: $ openshell sandbox create -- opencode ``` -The CLI prompts you to create a provider from local credentials. Type `yes` to continue. If `OPENAI_API_KEY` or `OPENROUTER_API_KEY` is set in your environment, it is picked up automatically. If not, you can configure it from inside the sandbox after it launches. +The CLI prompts you to create a provider from local credentials. +Type `yes` to continue. +If `OPENAI_API_KEY` or `OPENROUTER_API_KEY` is set in your environment, the CLI picks it up automatically. +If not, you can configure it from inside the sandbox after it launches. ::: :::{tab-item} Codex @@ -128,7 +153,10 @@ Run the following command to create a sandbox with Codex: $ openshell sandbox create -- codex ``` -The CLI prompts you to create a provider from local credentials. Type `yes` to continue. If `OPENAI_API_KEY` is set in your environment, it is picked up automatically. If not, you can configure it from inside the sandbox after it launches. +The CLI prompts you to create a provider from local credentials. +Type `yes` to continue. +If `OPENAI_API_KEY` is set in your environment, the CLI picks it up automatically. +If not, you can configure it from inside the sandbox after it launches. ::: :::{tab-item} OpenClaw @@ -139,12 +167,14 @@ Run the following command to create a sandbox with OpenClaw: $ openshell sandbox create --from openclaw ``` -The `--from` flag pulls a pre-built sandbox definition from the [OpenShell Community](https://github.com/NVIDIA/OpenShell-Community) catalog. Each definition bundles a container image, a tailored policy, and optional skills into a single package. +The `--from` flag pulls a pre-built sandbox definition from the [OpenShell Community](https://github.com/NVIDIA/OpenShell-Community) catalog. +Each definition bundles a container image, a tailored policy, and optional skills into a single package. ::: :::{tab-item} Community Sandbox -You can use the `--from` flag to pull other OpenShell sandbox images from the [NVIDIA Container Registry](https://registry.nvidia.com/). For example, to pull the `base` image, run the following command: +Use the `--from` flag to pull other OpenShell sandbox images from the [NVIDIA Container Registry](https://registry.nvidia.com/). +For example, to pull the `base` image, run the following command: ```console $ openshell sandbox create --from base diff --git a/e2e/rust/tests/docker_preflight.rs b/e2e/rust/tests/docker_preflight.rs new file mode 100644 index 00000000..0b51666e --- /dev/null +++ b/e2e/rust/tests/docker_preflight.rs @@ -0,0 +1,274 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Docker preflight e2e tests. +//! +//! These tests verify that the CLI fails fast with actionable guidance when +//! Docker is not available, instead of starting a multi-minute deploy that +//! eventually times out with a cryptic error. +//! +//! The tests do NOT require a running gateway or Docker — they intentionally +//! point `DOCKER_HOST` at a non-existent socket to simulate Docker being +//! unavailable. + +use std::process::Stdio; +use std::time::Instant; + +use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::output::strip_ansi; + +/// Run `openshell ` in an isolated environment where Docker is +/// guaranteed to be unreachable. +/// +/// Sets `DOCKER_HOST` to a non-existent socket so the preflight check +/// fails immediately regardless of the host's Docker configuration. +async fn run_without_docker(args: &[&str]) -> (String, i32, std::time::Duration) { + let tmpdir = tempfile::tempdir().expect("create isolated config dir"); + let start = Instant::now(); + + let mut cmd = openshell_cmd(); + cmd.args(args) + .env("XDG_CONFIG_HOME", tmpdir.path()) + .env("HOME", tmpdir.path()) + .env("DOCKER_HOST", "unix:///tmp/openshell-e2e-nonexistent.sock") + .env_remove("OPENSHELL_GATEWAY") + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + let output = cmd.output().await.expect("spawn openshell"); + let elapsed = start.elapsed(); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let combined = format!("{stdout}{stderr}"); + let code = output.status.code().unwrap_or(-1); + (combined, code, elapsed) +} + +// ------------------------------------------------------------------- +// gateway start: fails fast when Docker is unavailable +// ------------------------------------------------------------------- + +/// `openshell gateway start` with no Docker should fail within seconds +/// (not minutes) and produce a non-zero exit code. +#[tokio::test] +async fn gateway_start_fails_fast_without_docker() { + let (output, code, elapsed) = run_without_docker(&["gateway", "start"]).await; + + assert_ne!( + code, 0, + "gateway start should fail when Docker is unavailable, output:\n{output}" + ); + + // The preflight check should cause failure in under 30 seconds. + // Before the preflight was added, this would time out after several minutes + // waiting for k3s namespace readiness. + assert!( + elapsed.as_secs() < 30, + "gateway start should fail fast (took {}s), output:\n{output}", + elapsed.as_secs() + ); +} + +/// When Docker is unavailable, the error output should mention Docker +/// so the user knows what to fix. +#[tokio::test] +async fn gateway_start_error_mentions_docker() { + let (output, code, _) = run_without_docker(&["gateway", "start"]).await; + + assert_ne!(code, 0); + let clean = strip_ansi(&output); + let lower = clean.to_lowercase(); + + assert!( + lower.contains("docker"), + "error output should mention 'Docker' so the user knows what to fix:\n{clean}" + ); +} + +/// When Docker is unavailable, the error output should include guidance +/// about DOCKER_HOST since that's the likely fix for non-default runtimes. +#[tokio::test] +async fn gateway_start_error_mentions_docker_host() { + let (output, code, _) = run_without_docker(&["gateway", "start"]).await; + + assert_ne!(code, 0); + let clean = strip_ansi(&output); + + assert!( + clean.contains("DOCKER_HOST"), + "error output should mention DOCKER_HOST for users with non-default socket paths:\n{clean}" + ); +} + +/// When Docker is unavailable, the error output should suggest a +/// verification command like `docker info`. +#[tokio::test] +async fn gateway_start_error_suggests_verification() { + let (output, code, _) = run_without_docker(&["gateway", "start"]).await; + + assert_ne!(code, 0); + let clean = strip_ansi(&output); + + assert!( + clean.contains("docker info"), + "error output should suggest 'docker info' as a verification step:\n{clean}" + ); +} + +// ------------------------------------------------------------------- +// gateway start --recreate: same preflight behavior +// ------------------------------------------------------------------- + +/// `openshell gateway start --recreate` should also fail fast when +/// Docker is unavailable (the recreate flag should not bypass the check). +#[tokio::test] +async fn gateway_start_recreate_fails_fast_without_docker() { + let (output, code, elapsed) = run_without_docker(&["gateway", "start", "--recreate"]).await; + + assert_ne!( + code, 0, + "gateway start --recreate should fail when Docker is unavailable, output:\n{output}" + ); + + assert!( + elapsed.as_secs() < 30, + "gateway start --recreate should fail fast (took {}s)", + elapsed.as_secs() + ); +} + +// ------------------------------------------------------------------- +// sandbox create with auto-bootstrap: same preflight behavior +// ------------------------------------------------------------------- + +/// `openshell sandbox create` triggers auto-bootstrap when no gateway +/// exists. With Docker unavailable, it should fail fast with Docker +/// guidance rather than timing out. +#[tokio::test] +async fn sandbox_create_auto_bootstrap_fails_fast_without_docker() { + let (output, code, elapsed) = + run_without_docker(&["sandbox", "create", "--from", "openclaw"]).await; + + assert_ne!( + code, 0, + "sandbox create should fail when Docker is unavailable, output:\n{output}" + ); + + // Auto-bootstrap path should also hit the preflight check quickly. + assert!( + elapsed.as_secs() < 30, + "sandbox create should fail fast via auto-bootstrap preflight (took {}s), output:\n{output}", + elapsed.as_secs() + ); + + let clean = strip_ansi(&output); + let lower = clean.to_lowercase(); + assert!( + lower.contains("docker"), + "sandbox create error should mention Docker:\n{clean}" + ); +} + +// ------------------------------------------------------------------- +// doctor check: validates system prerequisites +// ------------------------------------------------------------------- + +/// `openshell doctor check` with Docker unavailable should fail fast +/// and report the Docker check as FAILED. +#[tokio::test] +async fn doctor_check_fails_without_docker() { + let (output, code, elapsed) = run_without_docker(&["doctor", "check"]).await; + + assert_ne!( + code, 0, + "doctor check should fail when Docker is unavailable, output:\n{output}" + ); + + assert!( + elapsed.as_secs() < 10, + "doctor check should complete quickly (took {}s)", + elapsed.as_secs() + ); + + let clean = strip_ansi(&output); + assert!( + clean.contains("FAILED"), + "doctor check should report Docker as FAILED:\n{clean}" + ); +} + +/// `openshell doctor check` output should include the check label +/// so the user knows what was tested. +#[tokio::test] +async fn doctor_check_output_shows_docker_label() { + let (output, _, _) = run_without_docker(&["doctor", "check"]).await; + let clean = strip_ansi(&output); + + assert!( + clean.contains("Docker"), + "doctor check output should include 'Docker' label:\n{clean}" + ); +} + +/// `openshell doctor check` with Docker unavailable should include +/// actionable guidance in the error output. +#[tokio::test] +async fn doctor_check_error_includes_guidance() { + let (output, code, _) = run_without_docker(&["doctor", "check"]).await; + + assert_ne!(code, 0); + let clean = strip_ansi(&output); + + assert!( + clean.contains("DOCKER_HOST"), + "doctor check error should mention DOCKER_HOST:\n{clean}" + ); + assert!( + clean.contains("docker info"), + "doctor check error should suggest 'docker info':\n{clean}" + ); +} + +/// When Docker IS available, `openshell doctor check` should pass and +/// report the version. +/// +/// This test only runs when Docker is actually reachable on the host +/// (i.e., it will pass in CI with Docker but be skipped locally if +/// Docker is not running). We detect this by checking if the default +/// socket exists. +#[tokio::test] +async fn doctor_check_passes_with_docker() { + if !std::path::Path::new("/var/run/docker.sock").exists() { + eprintln!("skipping: /var/run/docker.sock not found"); + return; + } + + let tmpdir = tempfile::tempdir().expect("create isolated config dir"); + let mut cmd = openshell_cmd(); + cmd.args(["doctor", "check"]) + .env("XDG_CONFIG_HOME", tmpdir.path()) + .env("HOME", tmpdir.path()) + .env_remove("OPENSHELL_GATEWAY") + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + let output = cmd.output().await.expect("spawn openshell"); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let combined = format!("{stdout}{stderr}"); + let code = output.status.code().unwrap_or(-1); + let clean = strip_ansi(&combined); + + assert_eq!( + code, 0, + "doctor check should pass when Docker is available, output:\n{clean}" + ); + assert!( + clean.contains("All checks passed"), + "doctor check should report success:\n{clean}" + ); + assert!( + clean.contains("ok"), + "doctor check should show 'ok' for Docker:\n{clean}" + ); +}