Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 217 additions & 0 deletions crates/openshell-bootstrap/src/docker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,153 @@ pub fn normalize_arch(arch: &str) -> String {
}
}

/// Result of a successful Docker preflight check.
///
/// Returned by `check_docker_available`. Bundles the client that was used for
/// the check with metadata about the daemon, so callers can reuse the
/// connection without re-checking.
#[derive(Debug)]
pub struct DockerPreflight {
    /// The Docker client whose `ping` succeeded during the preflight check.
    pub docker: Docker,
    /// Docker daemon version string (e.g., "28.1.1").
    ///
    /// `None` when the best-effort version query failed or the daemon's
    /// response carried no version field.
    pub version: Option<String>,
}

/// Absolute Docker socket paths probed by `find_alternative_sockets`.
///
/// Only paths that need no runtime expansion are stored in this constant; the
/// home-relative candidates are built inside `find_alternative_sockets` from
/// `$HOME`. Together the probed locations cover common container runtimes on
/// macOS and Linux:
/// - `/var/run/docker.sock` — default for Docker Desktop, `OrbStack`, Colima
/// - `$HOME/.colima/docker.sock` — Colima (older installs)
/// - `$HOME/.orbstack/run/docker.sock` — `OrbStack` (if symlink is missing)
const WELL_KNOWN_SOCKET_PATHS: &[&str] = &[
    "/var/run/docker.sock",
    // Not listed here because they depend on the user's home directory and
    // are expanded at runtime via home_dir():
    //   ~/.colima/docker.sock
    //   ~/.orbstack/run/docker.sock
];

/// Verify that a Docker-compatible runtime can be reached and is answering.
///
/// This is the preflight gate intended to run before any gateway deploy work.
/// The check has three stages:
///
/// 1. Build a client through bollard's default resolution.
/// 2. Ping the daemon.
/// 3. Ask for the daemon version (best-effort).
///
/// # Errors
///
/// Returns the report built by `docker_not_reachable_error` when the client
/// cannot be created or the ping fails. A failed version query is not an
/// error; it only leaves `DockerPreflight::version` as `None`.
pub async fn check_docker_available() -> Result<DockerPreflight> {
    // NOTE: both summary strings below also appear as matchers in the
    // FAILURE_PATTERNS table in errors.rs — keep the two in sync.

    // Stage 1: client construction (per the original author's note this
    // respects DOCKER_HOST, then falls back to /var/run/docker.sock).
    let docker = Docker::connect_with_local_defaults().map_err(|err| {
        docker_not_reachable_error(&err.to_string(), "Failed to create Docker client")
    })?;

    // Stage 2: a successful ping is what proves the daemon is responsive.
    docker.ping().await.map_err(|err| {
        docker_not_reachable_error(
            &err.to_string(),
            "Docker socket exists but the daemon is not responding",
        )
    })?;

    // Stage 3: version lookup is informational only, so errors are dropped.
    let version = docker.version().await.ok().and_then(|info| info.version);

    Ok(DockerPreflight { docker, version })
}

/// Build a rich, user-friendly error when Docker is not reachable.
///
/// The report's message is `summary` followed by the raw error text; its help
/// section is a list of hints chosen from the current environment:
///
/// - `DOCKER_HOST` is set: report its value and ask the user to verify it.
/// - `DOCKER_HOST` is unset and `/var/run/docker.sock` exists: the daemon
///   behind the default socket is not answering.
/// - `DOCKER_HOST` is unset and the default socket is missing: suggest
///   installing/starting a runtime and list any sockets found by
///   `find_alternative_sockets`.
///
/// Every variant ends with the `docker info` verification hint.
///
/// * `raw_err` — the underlying error text, shown verbatim.
/// * `summary` — one-line description of what failed (no trailing period;
///   one is appended here).
fn docker_not_reachable_error(raw_err: &str, summary: &str) -> miette::Report {
    let docker_host = std::env::var("DOCKER_HOST").ok();
    let socket_exists = std::path::Path::new("/var/run/docker.sock").exists();

    let mut hints: Vec<String> = Vec::new();

    // Matching on the Option directly removes the `is_some()` + `unwrap()`
    // pair; branch selection is unchanged.
    match docker_host {
        Some(host_val) => {
            // DOCKER_HOST is set but daemon didn't respond
            hints.push(format!(
                "DOCKER_HOST is set to '{host_val}' but the Docker daemon is not responding."
            ));
            hints.push(
                "Verify your Docker runtime is started and the DOCKER_HOST value is correct."
                    .to_string(),
            );
        }
        None if socket_exists => {
            // Socket exists but daemon isn't responding
            hints.push(
                "Docker socket found at /var/run/docker.sock but the daemon is not responding."
                    .to_string(),
            );
            hints.push("Start your Docker runtime and try again.".to_string());
        }
        None => {
            // No socket and no DOCKER_HOST — likely nothing is installed or started
            hints.push(
                "No Docker socket found at /var/run/docker.sock and DOCKER_HOST is not set."
                    .to_string(),
            );
            hints.push(
                "Install and start a Docker-compatible runtime. See the support matrix \
                 in the OpenShell docs for tested configurations."
                    .to_string(),
            );

            // Check for alternative sockets that might exist. `first()` gives
            // the example path without a panicking index.
            let alt_sockets = find_alternative_sockets();
            if let Some(first_socket) = alt_sockets.first() {
                hints.push(format!(
                    "Found Docker-compatible socket(s) at alternative path(s):\n {}\n\n \
                     Set DOCKER_HOST to use one, e.g.:\n\n \
                     export DOCKER_HOST=unix://{}",
                    alt_sockets.join("\n "),
                    first_socket,
                ));
            }
        }
    }

    hints.push("Verify Docker is working with: docker info".to_string());

    let help_text = hints.join("\n\n");

    miette::miette!(help = help_text, "{summary}.\n\n {raw_err}")
}

/// Probe for Docker-compatible sockets at non-default locations.
fn find_alternative_sockets() -> Vec<String> {
let mut found = Vec::new();

// Check well-known static paths
for path in WELL_KNOWN_SOCKET_PATHS {
if std::path::Path::new(path).exists() {
found.push(path.to_string());
}
}

// Check home-relative paths
if let Some(home) = home_dir() {
let home_sockets = [
format!("{home}/.colima/docker.sock"),
format!("{home}/.orbstack/run/docker.sock"),
];
for path in &home_sockets {
if std::path::Path::new(path).exists() && !found.contains(path) {
found.push(path.clone());
}
}
}

found
}

/// Return the user's home directory as given by the `HOME` environment variable.
///
/// Returns `None` when `HOME` is unset, is not valid Unicode, or is empty.
/// An empty value is treated as absent because callers splice the result into
/// `{home}/...` paths, and an empty prefix would silently turn those into
/// filesystem-root paths such as `/.colima/docker.sock`.
fn home_dir() -> Option<String> {
    std::env::var("HOME").ok().filter(|home| !home.is_empty())
}

/// Create an SSH Docker client from remote options.
pub async fn create_ssh_docker_client(remote: &RemoteOptions) -> Result<Docker> {
// Ensure destination has ssh:// prefix
Expand Down Expand Up @@ -981,4 +1128,74 @@ mod tests {
};
assert_eq!(platform.platform_string(), "linux/arm64");
}

/// Smoke-checks the parts of the error that every hint branch produces.
#[test]
fn docker_not_reachable_error_no_socket_no_docker_host() {
    // The real state of /var/run/docker.sock and DOCKER_HOST is not controlled
    // here (it varies between CI hosts), so only branch-independent content
    // is asserted: the summary, the raw error, and the final `docker info` hint.
    let report =
        docker_not_reachable_error("connection refused", "Failed to create Docker client");
    let rendered = format!("{report:?}");

    let expectations = [
        ("Failed to create Docker client", "should include the summary"),
        ("connection refused", "should include the raw error"),
        ("docker info", "should suggest 'docker info' verification"),
    ];
    for (needle, reason) in expectations {
        assert!(rendered.contains(needle), "{reason}");
    }
}

/// With `DOCKER_HOST` set, the error must name the variable and echo its value.
#[test]
fn docker_not_reachable_error_with_docker_host() {
    /// Puts `DOCKER_HOST` back to the captured value when dropped, so the
    /// override is undone even if the code under test panics.
    struct RestoreDockerHost(Option<String>);

    impl Drop for RestoreDockerHost {
        fn drop(&mut self) {
            // SAFETY: test-only, restoring previous state.
            // NOTE(review): soundness relies on no other thread touching the
            // process environment at the same time — confirm this holds under
            // the parallel test runner.
            unsafe {
                match self.0.take() {
                    Some(val) => std::env::set_var("DOCKER_HOST", val),
                    None => std::env::remove_var("DOCKER_HOST"),
                }
            }
        }
    }

    // Simulate: DOCKER_HOST is set but daemon unresponsive. The override is
    // scoped to this block; the guard restores the environment before the
    // assertions run.
    let msg = {
        let _restore = RestoreDockerHost(std::env::var("DOCKER_HOST").ok());

        // SAFETY: test-only temporary override.
        // NOTE(review): same concurrency assumption as the guard above.
        unsafe {
            std::env::set_var("DOCKER_HOST", "unix:///tmp/fake-docker.sock");
        }

        let err = docker_not_reachable_error(
            "daemon not responding",
            "Docker socket exists but the daemon is not responding",
        );
        format!("{err:?}")
    };

    assert!(
        msg.contains("DOCKER_HOST"),
        "should mention DOCKER_HOST when it is set"
    );
    assert!(
        msg.contains("unix:///tmp/fake-docker.sock"),
        "should show the current DOCKER_HOST value"
    );
}

/// Checks the host-independent guarantees of `find_alternative_sockets`.
#[test]
fn find_alternative_sockets_returns_vec() {
    // Exact contents depend on the host system, so only properties that hold
    // everywhere are asserted.
    let sockets = find_alternative_sockets();

    assert!(
        sockets.len() <= 10,
        "should return a reasonable number of sockets"
    );

    for (idx, path) in sockets.iter().enumerate() {
        // The function only reports paths it has just seen on disk.
        assert!(
            std::path::Path::new(path).exists(),
            "reported socket path should exist: {path}"
        );
        // No path may appear twice in the result.
        assert!(
            !sockets[..idx].contains(path),
            "socket path should be reported only once: {path}"
        );
    }
}
}
85 changes: 72 additions & 13 deletions crates/openshell-bootstrap/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,12 +155,16 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[
match_mode: MatchMode::Any,
diagnose: diagnose_certificate_issue,
},
// Docker daemon not running
// Docker daemon not running or socket not found
FailurePattern {
matchers: &[
"Cannot connect to the Docker daemon",
"docker daemon is not running",
"Is the docker daemon running",
"Socket not found",
"No such file or directory",
"Failed to create Docker client",
"Docker socket exists but the daemon is not responding",
],
match_mode: MatchMode::Any,
diagnose: diagnose_docker_not_running,
Expand Down Expand Up @@ -203,7 +207,7 @@ fn diagnose_no_default_route(_gateway_name: &str) -> GatewayFailureDiagnosis {
"Stop any container holding the gateway port (default 8080), then retry",
),
RecoveryStep::with_command("Prune unused Docker networks", "docker network prune -f"),
RecoveryStep::new("Restart Docker Desktop (if on Mac/Windows)"),
RecoveryStep::new("Restart your Docker runtime"),
RecoveryStep::new("Then retry: openshell gateway start"),
],
retryable: true,
Expand Down Expand Up @@ -309,10 +313,7 @@ fn diagnose_oom_killed(_gateway_name: &str) -> GatewayFailureDiagnosis {
The gateway requires at least 4GB of memory."
.to_string(),
recovery_steps: vec![
RecoveryStep::new(
"Increase Docker memory allocation to at least 4GB \
(Docker Desktop → Settings → Resources)",
),
RecoveryStep::new("Increase Docker memory allocation to at least 4GB"),
RecoveryStep::new("Close other memory-intensive applications"),
RecoveryStep::new("Then retry: openshell gateway start"),
],
Expand All @@ -335,10 +336,7 @@ fn diagnose_node_pressure(gateway_name: &str) -> GatewayFailureDiagnosis {
"docker system prune -a --volumes",
),
RecoveryStep::with_command("Check available memory on the host", "free -h"),
RecoveryStep::new(
"Increase Docker resource allocation \
(Docker Desktop → Settings → Resources), or free resources on the host",
),
RecoveryStep::new("Increase Docker resource allocation or free resources on the host"),
RecoveryStep::with_command(
"Destroy and recreate the gateway after freeing resources",
format!("openshell gateway destroy {gateway_name} && openshell gateway start"),
Expand Down Expand Up @@ -392,10 +390,16 @@ fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis {
fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis {
GatewayFailureDiagnosis {
summary: "Docker is not running".to_string(),
explanation: "The Docker daemon is not running or not accessible.".to_string(),
explanation: "The Docker daemon is not running or not accessible. OpenShell requires \
a Docker-compatible container runtime to manage gateway clusters."
.to_string(),
recovery_steps: vec![
RecoveryStep::new("Start Docker Desktop (Mac/Windows) or the Docker service (Linux)"),
RecoveryStep::with_command("Verify Docker is running", "docker info"),
RecoveryStep::new("Start your Docker runtime"),
RecoveryStep::with_command("Verify Docker is accessible", "docker info"),
RecoveryStep::new(
"If using a non-default Docker socket, set DOCKER_HOST:\n \
export DOCKER_HOST=unix:///var/run/docker.sock",
),
RecoveryStep::new("Then retry: openshell gateway start"),
],
retryable: true,
Expand Down Expand Up @@ -558,6 +562,61 @@ mod tests {
);
}

/// "Cannot connect to the Docker daemon" must produce a retryable Docker diagnosis.
#[test]
fn test_diagnose_docker_not_running() {
    let failure_text = "Cannot connect to the Docker daemon";
    let Some(found) = diagnose_failure("test", failure_text, None) else {
        panic!("expected a diagnosis for {failure_text:?}");
    };

    assert!(found.summary.contains("Docker"));
    assert!(found.retryable);
}

/// A "Socket not found" error must produce a retryable Docker diagnosis.
#[test]
fn test_diagnose_docker_socket_not_found() {
    let failure_text = "Socket not found: /var/run/docker.sock";
    let Some(found) = diagnose_failure("test", failure_text, None) else {
        panic!("expected a diagnosis for {failure_text:?}");
    };

    assert!(found.summary.contains("Docker"));
    assert!(found.retryable);
}

/// An OS-level "No such file or directory" error must produce the Docker diagnosis.
#[test]
fn test_diagnose_docker_no_such_file() {
    let diagnosis = diagnose_failure("test", "No such file or directory (os error 2)", None);
    assert!(diagnosis.is_some());
    let d = diagnosis.unwrap();
    assert!(d.summary.contains("Docker"));
    // Same diagnosis as the other Docker-unreachable inputs, so it must be
    // retryable as well; the sibling tests already assert this.
    assert!(d.retryable);
}

/// The preflight summary "Failed to create Docker client" must produce a
/// retryable Docker diagnosis.
#[test]
fn test_diagnose_docker_preflight_error() {
    // Same shape as the preflight message: summary, blank line, raw error.
    let failure_text = "Failed to create Docker client.\n\n connection error";
    let Some(found) = diagnose_failure("test", failure_text, None) else {
        panic!("expected a diagnosis for {failure_text:?}");
    };

    assert!(found.summary.contains("Docker"));
    assert!(found.retryable);
}

/// The Docker-unreachable diagnosis must tell the user about `DOCKER_HOST`.
#[test]
fn test_diagnose_docker_recovery_mentions_docker_host() {
    let d = diagnose_failure("test", "Cannot connect to the Docker daemon", None)
        .expect("a Docker connection error should produce a diagnosis");

    // Checking each step directly is equivalent to searching the joined text
    // (the needle contains no separator) and avoids cloning every description.
    assert!(
        d.recovery_steps
            .iter()
            .any(|step| step.description.contains("DOCKER_HOST")),
        "recovery steps should mention DOCKER_HOST"
    );
}

#[test]
fn test_diagnose_dns_failure_from_namespace_timeout() {
// When wait_for_namespace detects DNS failure, the error message itself
Expand Down
Loading
Loading