From cb2e243276adebc3254fd2a018048c00effc052b Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Fri, 13 Mar 2026 14:46:04 -0700 Subject: [PATCH 01/13] feat(inference): validate endpoints before saving routes Closes #273 Verify inference endpoints synchronously on the server during set/update, expose a --no-verify escape hatch in the CLI and Python helper, and return actionable failures when validation does not pass. --- Cargo.lock | 2 + architecture/inference-routing.md | 15 +- crates/openshell-cli/src/main.rs | 69 +++- crates/openshell-cli/src/run.rs | 4 + crates/openshell-server/Cargo.toml | 2 + crates/openshell-server/src/inference.rs | 398 ++++++++++++++++++++++- docs/inference/configure.md | 4 +- proto/inference.proto | 2 + python/openshell/sandbox.py | 2 + python/openshell/sandbox_test.py | 34 ++ 10 files changed, 511 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f4bc2e4d..051d9e25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2989,6 +2989,7 @@ dependencies = [ "prost-types", "rand 0.9.2", "rcgen", + "reqwest", "russh", "rustls", "rustls-pemfile", @@ -3008,6 +3009,7 @@ dependencies = [ "tracing", "tracing-subscriber", "uuid", + "wiremock", ] [[package]] diff --git a/architecture/inference-routing.md b/architecture/inference-routing.md index b8ce8a80..851823f3 100644 --- a/architecture/inference-routing.md +++ b/architecture/inference-routing.md @@ -66,8 +66,9 @@ The gateway implements the `Inference` gRPC service defined in `proto/inference. 1. Validates that both fields are non-empty. 2. Fetches the named provider record from the store. 3. Validates the provider by resolving its route (checking that the provider type is supported and has a usable API key). -4. Builds a managed route spec that stores only `provider_name` and `model_id`. The spec intentionally leaves `base_url`, `api_key`, and `protocols` empty -- these are resolved dynamically at bundle time from the provider record. -5. 
Upserts the route with name `inference.local`. Version starts at 1 and increments monotonically on each update. +4. Unless `skip_validation` is set, performs a lightweight provider-shaped probe against the resolved upstream endpoint (for example, a tiny chat/messages request with `max_tokens: 1`) to confirm the endpoint is reachable and accepts the expected auth/request shape. +5. Builds a managed route spec that stores only `provider_name` and `model_id`. The spec intentionally leaves `base_url`, `api_key`, and `protocols` empty -- these are resolved dynamically at bundle time from the provider record. +6. Upserts the route with name `inference.local`. Version starts at 1 and increments monotonically on each update. `GetClusterInference` returns `provider_name`, `model_id`, and `version` for the managed route. Returns `NOT_FOUND` if cluster inference is not configured. @@ -91,7 +92,7 @@ File: `proto/inference.proto` Key messages: -- `SetClusterInferenceRequest` -- `provider_name` + `model_id` +- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + optional `skip_validation` - `SetClusterInferenceResponse` -- `provider_name` + `model_id` + `version` - `GetInferenceBundleResponse` -- `repeated ResolvedRoute routes` + `revision` + `generated_at_ms` - `ResolvedRoute` -- `name`, `base_url`, `protocols`, `api_key`, `model_id`, `provider_type` @@ -296,10 +297,10 @@ The system route is stored as a separate `InferenceRoute` record in the gateway Cluster inference commands: -- `openshell cluster inference set --provider --model ` -- configures user-facing cluster inference -- `openshell cluster inference set --system --provider --model ` -- configures system inference -- `openshell cluster inference get` -- displays both user and system inference configuration -- `openshell cluster inference get --system` -- displays only the system inference configuration +- `openshell inference set --provider --model ` -- configures user-facing cluster inference +- `openshell 
inference set --system --provider --model ` -- configures system inference +- `openshell inference get` -- displays both user and system inference configuration +- `openshell inference get --system` -- displays only the system inference configuration The `--provider` flag references a provider record name (not a provider type). The provider must already exist in the cluster and have a supported inference type (`openai`, `anthropic`, or `nvidia`). diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index dcca4703..ca82095a 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -906,9 +906,6 @@ enum InferenceCommands { system: bool, /// Skip endpoint verification before saving the route. - /// - /// Accepted now so scripts can opt out explicitly ahead of a future - /// default switch to verification. #[arg(long)] no_verify: bool, }, @@ -929,9 +926,6 @@ enum InferenceCommands { system: bool, /// Skip endpoint verification before saving the route. - /// - /// Accepted now so scripts can opt out explicitly ahead of a future - /// default switch to verification. 
#[arg(long)] no_verify: bool, }, @@ -1810,17 +1804,27 @@ async fn main() -> Result<()> { provider, model, system, +<<<<<<< HEAD no_verify: _, +======= + no_verify, +>>>>>>> 7f0504d8 (feat(inference): validate endpoints before saving routes) } => { let route_name = if system { "sandbox-system" } else { "" }; - run::gateway_inference_set(endpoint, &provider, &model, route_name, &tls) - .await?; + run::gateway_inference_set( + endpoint, &provider, &model, route_name, no_verify, &tls, + ) + .await?; } InferenceCommands::Update { provider, model, system, +<<<<<<< HEAD no_verify: _, +======= + no_verify, +>>>>>>> 7f0504d8 (feat(inference): validate endpoints before saving routes) } => { let route_name = if system { "sandbox-system" } else { "" }; run::gateway_inference_update( @@ -1828,6 +1832,7 @@ async fn main() -> Result<()> { provider.as_deref(), model.as_deref(), route_name, + no_verify, &tls, ) .await?; @@ -2559,6 +2564,54 @@ mod tests { )); } + #[test] + fn inference_set_accepts_no_verify_flag() { + let cli = Cli::try_parse_from([ + "openshell", + "inference", + "set", + "--provider", + "openai-dev", + "--model", + "gpt-4.1", + "--no-verify", + ]) + .expect("inference set should parse --no-verify"); + + assert!(matches!( + cli.command, + Some(Commands::Inference { + command: Some(InferenceCommands::Set { + no_verify: true, + .. + }) + }) + )); + } + + #[test] + fn inference_update_accepts_no_verify_flag() { + let cli = Cli::try_parse_from([ + "openshell", + "inference", + "update", + "--provider", + "openai-dev", + "--no-verify", + ]) + .expect("inference update should parse --no-verify"); + + assert!(matches!( + cli.command, + Some(Commands::Inference { + command: Some(InferenceCommands::Update { + no_verify: true, + .. 
+ }) + }) + )); + } + #[test] fn completion_script_uses_openshell_command_name() { let script = normalize_completion_script( diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 42ecbbb1..878c8c2c 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -3390,6 +3390,7 @@ pub async fn gateway_inference_set( provider_name: &str, model_id: &str, route_name: &str, + skip_validation: bool, tls: &TlsOptions, ) -> Result<()> { let mut client = grpc_inference_client(server, tls).await?; @@ -3398,6 +3399,7 @@ pub async fn gateway_inference_set( provider_name: provider_name.to_string(), model_id: model_id.to_string(), route_name: route_name.to_string(), + skip_validation, }) .await .into_diagnostic()?; @@ -3422,6 +3424,7 @@ pub async fn gateway_inference_update( provider_name: Option<&str>, model_id: Option<&str>, route_name: &str, + skip_validation: bool, tls: &TlsOptions, ) -> Result<()> { if provider_name.is_none() && model_id.is_none() { @@ -3449,6 +3452,7 @@ pub async fn gateway_inference_update( provider_name: provider.to_string(), model_id: model.to_string(), route_name: route_name.to_string(), + skip_validation, }) .await .into_diagnostic()?; diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 7d53abe6..c4935024 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -61,6 +61,7 @@ serde = { workspace = true } serde_json = { workspace = true } tokio-stream = { workspace = true } sqlx = { workspace = true } +reqwest = { workspace = true } kube = { workspace = true } kube-runtime = { workspace = true } k8s-openapi = { workspace = true } @@ -78,6 +79,7 @@ rcgen = { version = "0.13", features = ["crypto", "pem"] } tempfile = "3" tokio-tungstenite = { workspace = true } futures-util = "0.3" +wiremock = "0.6" [lints] workspace = true diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 
f7a8427e..35b704f1 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -1,12 +1,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use openshell_core::inference::AuthHeader; use openshell_core::proto::{ ClusterInferenceConfig, GetClusterInferenceRequest, GetClusterInferenceResponse, GetInferenceBundleRequest, GetInferenceBundleResponse, InferenceRoute, Provider, ResolvedRoute, SetClusterInferenceRequest, SetClusterInferenceResponse, inference_server::Inference, }; use std::sync::Arc; +use std::time::Duration; use tonic::{Request, Response, Status}; use crate::{ @@ -81,6 +83,7 @@ impl Inference for InferenceService { route_name, &req.provider_name, &req.model_id, + req.skip_validation, ) .await?; @@ -111,7 +114,7 @@ impl Inference for InferenceService { .map_err(|e| Status::internal(format!("fetch route failed: {e}")))? .ok_or_else(|| { Status::not_found(format!( - "inference route '{route_name}' is not configured; run 'openshell cluster inference set --provider --model '" + "inference route '{route_name}' is not configured; run 'openshell inference set --provider --model '" )) })?; @@ -140,6 +143,7 @@ async fn upsert_cluster_inference_route( route_name: &str, provider_name: &str, model_id: &str, + skip_validation: bool, ) -> Result { if provider_name.trim().is_empty() { return Err(Status::invalid_argument("provider_name is required")); @@ -156,9 +160,10 @@ async fn upsert_cluster_inference_route( Status::failed_precondition(format!("provider '{provider_name}' not found")) })?; - // Validate provider shape at set time; endpoint/auth are resolved from the - // provider record when generating sandbox bundles. 
- let _ = resolve_provider_route(&provider)?; + let resolved = resolve_provider_route(&provider)?; + if !skip_validation { + verify_provider_endpoint(&provider.name, model_id, &resolved).await?; + } let config = build_cluster_inference_config(&provider, model_id); @@ -203,6 +208,13 @@ struct ResolvedProviderRoute { base_url: String, protocols: Vec, api_key: String, + auth: AuthHeader, + default_headers: Vec<(String, String)>, +} + +struct ValidationProbe { + path: &'static str, + body: serde_json::Value, } fn resolve_provider_route(provider: &Provider) -> Result { @@ -241,9 +253,198 @@ fn resolve_provider_route(provider: &Provider) -> Result Result { + if route + .protocols + .iter() + .any(|protocol| protocol == "openai_chat_completions") + { + return Ok(ValidationProbe { + path: "/v1/chat/completions", + body: serde_json::json!({ + "model": model_id, + "messages": [{"role": "user", "content": "ping"}], + "max_tokens": 1, + }), + }); + } + + if route + .protocols + .iter() + .any(|protocol| protocol == "anthropic_messages") + { + return Ok(ValidationProbe { + path: "/v1/messages", + body: serde_json::json!({ + "model": model_id, + "messages": [{"role": "user", "content": "ping"}], + "max_tokens": 1, + }), + }); + } + + if route + .protocols + .iter() + .any(|protocol| protocol == "openai_responses") + { + return Ok(ValidationProbe { + path: "/v1/responses", + body: serde_json::json!({ + "model": model_id, + "input": "ping", + "max_output_tokens": 1, + }), + }); + } + + if route + .protocols + .iter() + .any(|protocol| protocol == "openai_completions") + { + return Ok(ValidationProbe { + path: "/v1/completions", + body: serde_json::json!({ + "model": model_id, + "prompt": "ping", + "max_tokens": 1, + }), + }); + } + + Err(Status::failed_precondition(format!( + "provider type '{}' does not expose a writable inference protocol for validation", + route.provider_type + ))) +} + +fn validation_url(base_url: &str, path: &str) -> String { + let base = 
base_url.trim_end_matches('/'); + if base.ends_with("/v1") && (path == "/v1" || path.starts_with("/v1/")) { + return format!("{base}{}", &path[3..]); + } + + format!("{base}{path}") +} + +fn validation_failure( + provider_name: &str, + model_id: &str, + base_url: &str, + details: &str, + next_steps: &str, +) -> Status { + Status::failed_precondition(format!( + "failed to verify inference endpoint for provider '{provider_name}' and model '{model_id}' at '{base_url}': {details}. Next steps: {next_steps}, or retry with '--no-verify' if the endpoint is not up yet" + )) +} + +async fn verify_provider_endpoint( + provider_name: &str, + model_id: &str, + route: &ResolvedProviderRoute, +) -> Result<(), Status> { + let probe = validation_probe(route, model_id)?; + let url = validation_url(&route.base_url, probe.path); + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(5)) + .build() + .map_err(|err| Status::internal(format!("build validation client failed: {err}")))?; + + let mut request = client + .post(&url) + .header(reqwest::header::CONTENT_TYPE, "application/json"); + request = match &route.auth { + AuthHeader::Bearer => request.bearer_auth(&route.api_key), + AuthHeader::Custom(name) => request.header(*name, &route.api_key), + }; + + for (name, value) in &route.default_headers { + request = request.header(name, value); + } + + let response = request.json(&probe.body).send().await.map_err(|err| { + let details = if err.is_timeout() { + format!("request to {url} timed out") + } else if err.is_connect() { + format!("failed to connect to {url}: {err}") + } else { + format!("request to {url} failed: {err}") + }; + + validation_failure( + provider_name, + model_id, + &route.base_url, + &details, + "check that the service is running, confirm the base URL and protocol, and verify credentials", + ) + })?; + + if response.status().is_success() { + return Ok(()); + } + + let status = response.status(); + let body = response.text().await.unwrap_or_default(); 
+ let body = body.trim(); + let body_suffix = if body.is_empty() { + String::new() + } else { + format!( + " Response body: {}", + body.chars().take(200).collect::() + ) + }; + + let (details, next_steps) = match status.as_u16() { + 400 | 404 | 405 | 422 => ( + format!("upstream rejected the validation request with HTTP {status}.{body_suffix}"), + "confirm the provider type, base URL, and model identifier", + ), + 401 | 403 => ( + format!("upstream rejected credentials with HTTP {status}.{body_suffix}"), + "verify the provider API key and any required auth headers", + ), + 429 => ( + format!( + "upstream rate-limited the validation request with HTTP {status}.{body_suffix}" + ), + "retry later or verify quota/limits on the upstream provider", + ), + 500..=599 => ( + format!("upstream returned HTTP {status}.{body_suffix}"), + "check whether the endpoint is healthy and serving requests", + ), + _ => ( + format!("upstream returned unexpected HTTP {status}.{body_suffix}"), + "confirm the endpoint URL, protocol, credentials, and model identifier", + ), + }; + + Err(validation_failure( + provider_name, + model_id, + &route.base_url, + &details, + next_steps, + )) +} + fn find_provider_api_key(provider: &Provider, preferred_key_names: &[&str]) -> Option { for key in preferred_key_names { if let Some(value) = provider.credentials.get(*key) @@ -369,6 +570,8 @@ async fn resolve_route_by_name( #[cfg(test)] mod tests { use super::*; + use wiremock::matchers::{body_partial_json, header, method, path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; fn make_route(name: &str, provider_name: &str, model_id: &str) -> InferenceRoute { InferenceRoute { @@ -392,6 +595,20 @@ mod tests { } } + fn make_provider_with_base_url( + name: &str, + provider_type: &str, + key_name: &str, + key_value: &str, + base_url_key: &str, + base_url: &str, + ) -> Provider { + Provider { + config: std::iter::once((base_url_key.to_string(), base_url.to_string())).collect(), + ..make_provider(name, 
provider_type, key_name, key_value) + } + } + #[tokio::test] async fn upsert_cluster_route_creates_and_increments_version() { let store = Store::connect("sqlite::memory:?cache=shared") @@ -409,6 +626,7 @@ mod tests { CLUSTER_INFERENCE_ROUTE_NAME, "openai-dev", "gpt-4o", + true, ) .await .expect("first set should succeed"); @@ -420,6 +638,7 @@ mod tests { CLUSTER_INFERENCE_ROUTE_NAME, "openai-dev", "gpt-4.1", + true, ) .await .expect("second set should succeed"); @@ -630,6 +849,7 @@ mod tests { SANDBOX_SYSTEM_ROUTE_NAME, "anthropic-dev", "claude-sonnet-4-20250514", + true, ) .await .expect("should succeed"); @@ -715,6 +935,7 @@ mod tests { SANDBOX_SYSTEM_ROUTE_NAME, "openai-dev", "gpt-4o-mini", + true, ) .await .expect("upsert should succeed"); @@ -730,6 +951,175 @@ mod tests { assert_eq!(config.model_id, "gpt-4o-mini"); } + #[test] + fn openai_validation_probe_uses_lightweight_chat_shape() { + let route = ResolvedProviderRoute { + provider_type: "openai".to_string(), + base_url: "https://api.openai.com/v1".to_string(), + protocols: vec!["openai_chat_completions".to_string()], + api_key: "sk-test".to_string(), + auth: AuthHeader::Bearer, + default_headers: Vec::new(), + }; + + let probe = validation_probe(&route, "gpt-4.1").expect("probe should build"); + + assert_eq!(probe.path, "/v1/chat/completions"); + assert_eq!(probe.body["model"], "gpt-4.1"); + assert_eq!(probe.body["max_tokens"], 1); + } + + #[test] + fn anthropic_validation_probe_uses_messages_shape() { + let route = ResolvedProviderRoute { + provider_type: "anthropic".to_string(), + base_url: "https://api.anthropic.com/v1".to_string(), + protocols: vec!["anthropic_messages".to_string()], + api_key: "sk-test".to_string(), + auth: AuthHeader::Custom("x-api-key"), + default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + }; + + let probe = + validation_probe(&route, "claude-sonnet-4-20250514").expect("probe should build"); + + assert_eq!(probe.path, "/v1/messages"); + 
assert_eq!(probe.body["model"], "claude-sonnet-4-20250514"); + assert_eq!(probe.body["max_tokens"], 1); + } + + #[tokio::test] + async fn upsert_cluster_route_verifies_endpoint_by_default() { + let store = Store::connect("sqlite::memory:?cache=shared") + .await + .expect("store"); + let mock_server = MockServer::start().await; + + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .and(header("authorization", "Bearer sk-test")) + .and(body_partial_json(serde_json::json!({ + "model": "gpt-4o-mini", + "max_tokens": 1, + }))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "id": "chatcmpl-123", + "object": "chat.completion", + "choices": [{"index": 0, "message": {"role": "assistant", "content": "ok"}, "finish_reason": "stop"}], + "model": "gpt-4o-mini" + }))) + .mount(&mock_server) + .await; + + let provider = make_provider_with_base_url( + "openai-dev", + "openai", + "OPENAI_API_KEY", + "sk-test", + "OPENAI_BASE_URL", + &mock_server.uri(), + ); + store + .put_message(&provider) + .await + .expect("persist provider"); + + let route = upsert_cluster_inference_route( + &store, + CLUSTER_INFERENCE_ROUTE_NAME, + "openai-dev", + "gpt-4o-mini", + false, + ) + .await + .expect("validation should succeed"); + + assert_eq!(route.version, 1); + } + + #[tokio::test] + async fn upsert_cluster_route_rejects_failed_validation() { + let store = Store::connect("sqlite::memory:?cache=shared") + .await + .expect("store"); + let mock_server = MockServer::start().await; + + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .respond_with(ResponseTemplate::new(401).set_body_string("bad key")) + .mount(&mock_server) + .await; + + let provider = make_provider_with_base_url( + "openai-dev", + "openai", + "OPENAI_API_KEY", + "sk-test", + "OPENAI_BASE_URL", + &mock_server.uri(), + ); + store + .put_message(&provider) + .await + .expect("persist provider"); + + let err = upsert_cluster_inference_route( + &store, + 
CLUSTER_INFERENCE_ROUTE_NAME, + "openai-dev", + "gpt-4o-mini", + false, + ) + .await + .expect_err("validation should fail"); + + assert_eq!(err.code(), tonic::Code::FailedPrecondition); + assert!( + err.message() + .contains("failed to verify inference endpoint") + ); + assert!(err.message().contains("verify the provider API key")); + assert!(err.message().contains("--no-verify")); + + let persisted = store + .get_message_by_name::(CLUSTER_INFERENCE_ROUTE_NAME) + .await + .expect("fetch route") + .is_none(); + assert!(persisted, "route should not persist on failed validation"); + } + + #[tokio::test] + async fn upsert_cluster_route_skips_validation_when_requested() { + let store = Store::connect("sqlite::memory:?cache=shared") + .await + .expect("store"); + let provider = make_provider_with_base_url( + "openai-dev", + "openai", + "OPENAI_API_KEY", + "sk-test", + "OPENAI_BASE_URL", + "http://127.0.0.1:9", + ); + store + .put_message(&provider) + .await + .expect("persist provider"); + + let route = upsert_cluster_inference_route( + &store, + CLUSTER_INFERENCE_ROUTE_NAME, + "openai-dev", + "gpt-4o-mini", + true, + ) + .await + .expect("skip validation should persist route"); + + assert_eq!(route.version, 1); + } + #[test] fn effective_route_name_defaults_empty_to_inference_local() { assert_eq!( diff --git a/docs/inference/configure.md b/docs/inference/configure.md index b4dcd781..9d64b622 100644 --- a/docs/inference/configure.md +++ b/docs/inference/configure.md @@ -135,9 +135,9 @@ Use this endpoint when inference should stay local to the host for privacy and s ### Verify the Endpoint from a Sandbox -`openshell inference get` confirms the configuration was saved, but does not verify the upstream endpoint is reachable. The CLI also accepts `--no-verify` on `openshell inference set` and `openshell inference update` so automation can opt out explicitly ahead of a future verify-by-default rollout. 
+`openshell inference set` and `openshell inference update` verify the upstream endpoint by default before saving the configuration. If the endpoint is not live yet, try again with `--no-verify` to persist the route without the probe. -To confirm end-to-end connectivity, connect to a sandbox and run: +`openshell inference get` confirms the current saved configuration. To confirm end-to-end connectivity from a sandbox, run: ```bash curl https://inference.local/v1/responses \ diff --git a/proto/inference.proto b/proto/inference.proto index 11670c4a..828927ec 100644 --- a/proto/inference.proto +++ b/proto/inference.proto @@ -56,6 +56,8 @@ message SetClusterInferenceRequest { // Route name to target. Empty string defaults to "inference.local" (user-facing). // Use "sandbox-system" for the sandbox system-level inference route. string route_name = 3; + // Skip synchronous endpoint validation before persistence. + bool skip_validation = 4; } message SetClusterInferenceResponse { diff --git a/python/openshell/sandbox.py b/python/openshell/sandbox.py index 7b48ab3b..3399c203 100644 --- a/python/openshell/sandbox.py +++ b/python/openshell/sandbox.py @@ -398,11 +398,13 @@ def set_cluster( *, provider_name: str, model_id: str, + no_verify: bool = False, ) -> ClusterInferenceConfig: response = self._stub.SetClusterInference( inference_pb2.SetClusterInferenceRequest( provider_name=provider_name, model_id=model_id, + skip_validation=no_verify, ), timeout=self._timeout, ) diff --git a/python/openshell/sandbox_test.py b/python/openshell/sandbox_test.py index 4c0eebcd..441e69f4 100644 --- a/python/openshell/sandbox_test.py +++ b/python/openshell/sandbox_test.py @@ -10,6 +10,7 @@ from openshell.sandbox import ( _PYTHON_CLOUDPICKLE_BOOTSTRAP, _SANDBOX_PYTHON_BIN, + InferenceRouteClient, SandboxClient, ) @@ -33,6 +34,22 @@ def ExecSandbox( ) +class _FakeInferenceStub: + def __init__(self) -> None: + self.request = None + + def SetClusterInference(self, request: Any, timeout: float | 
None = None) -> Any: + self.request = request + _ = timeout + + class _Response: + provider_name = request.provider_name + model_id = request.model_id + version = 1 + + return _Response() + + def _client_with_fake_stub(stub: _FakeStub) -> SandboxClient: client = cast("SandboxClient", object.__new__(SandboxClient)) client._timeout = 30.0 @@ -120,3 +137,20 @@ def test_from_active_cluster_prefers_openshell_gateway_env( assert client._cluster_name == gateway_name finally: client.close() + + +def test_inference_set_cluster_forwards_no_verify_flag() -> None: + stub = _FakeInferenceStub() + client = cast("InferenceRouteClient", object.__new__(InferenceRouteClient)) + client._timeout = 30.0 + client._stub = cast("Any", stub) + + result = client.set_cluster( + provider_name="openai-dev", + model_id="gpt-4.1", + no_verify=True, + ) + + assert result.provider_name == "openai-dev" + assert stub.request is not None + assert stub.request.skip_validation is True From d1b1ad081dbfe50f15a11b4319ca95824dbbd70c Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Fri, 13 Mar 2026 16:43:41 -0700 Subject: [PATCH 02/13] feat(inference): make endpoint verification explicit --- architecture/inference-routing.md | 4 +- crates/openshell-cli/src/main.rs | 72 +++++++++----- crates/openshell-cli/src/run.rs | 68 +++++++++++-- crates/openshell-server/src/inference.rs | 118 +++++++++++++++++------ proto/inference.proto | 13 ++- python/openshell/sandbox.py | 7 +- python/openshell/sandbox_test.py | 39 +++++++- 7 files changed, 249 insertions(+), 72 deletions(-) diff --git a/architecture/inference-routing.md b/architecture/inference-routing.md index 851823f3..25d8dee6 100644 --- a/architecture/inference-routing.md +++ b/architecture/inference-routing.md @@ -66,7 +66,7 @@ The gateway implements the `Inference` gRPC service defined in `proto/inference. 1. Validates that both fields are non-empty. 2. Fetches the named provider record from the store. 3. 
Validates the provider by resolving its route (checking that the provider type is supported and has a usable API key). -4. Unless `skip_validation` is set, performs a lightweight provider-shaped probe against the resolved upstream endpoint (for example, a tiny chat/messages request with `max_tokens: 1`) to confirm the endpoint is reachable and accepts the expected auth/request shape. +4. By default, performs a lightweight provider-shaped probe against the resolved upstream endpoint (for example, a tiny chat/messages request with `max_tokens: 1`) to confirm the endpoint is reachable and accepts the expected auth/request shape. `--no-verify` disables this probe when the endpoint is not up yet. 5. Builds a managed route spec that stores only `provider_name` and `model_id`. The spec intentionally leaves `base_url`, `api_key`, and `protocols` empty -- these are resolved dynamically at bundle time from the provider record. 6. Upserts the route with name `inference.local`. Version starts at 1 and increments monotonically on each update. @@ -92,7 +92,7 @@ File: `proto/inference.proto` Key messages: -- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + optional `skip_validation` +- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + optional `verify` / `no_verify` overrides, with verification enabled by default - `SetClusterInferenceResponse` -- `provider_name` + `model_id` + `version` - `GetInferenceBundleResponse` -- `repeated ResolvedRoute routes` + `revision` + `generated_at_ms` - `ResolvedRoute` -- `name`, `base_url`, `protocols`, `api_key`, `model_id`, `provider_type` diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index ca82095a..ee325dcf 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -905,8 +905,12 @@ enum InferenceCommands { #[arg(long)] system: bool, + /// Verify the resolved upstream endpoint before saving the route. 
+ #[arg(long, conflicts_with = "no_verify")] + verify: bool, + /// Skip endpoint verification before saving the route. - #[arg(long)] + #[arg(long, conflicts_with = "verify")] no_verify: bool, }, @@ -925,8 +929,12 @@ enum InferenceCommands { #[arg(long)] system: bool, + /// Verify the resolved upstream endpoint before saving the route. + #[arg(long, conflicts_with = "no_verify")] + verify: bool, + /// Skip endpoint verification before saving the route. - #[arg(long)] + #[arg(long, conflicts_with = "verify")] no_verify: bool, }, @@ -1804,15 +1812,18 @@ async fn main() -> Result<()> { provider, model, system, -<<<<<<< HEAD - no_verify: _, -======= + verify, no_verify, ->>>>>>> 7f0504d8 (feat(inference): validate endpoints before saving routes) } => { let route_name = if system { "sandbox-system" } else { "" }; run::gateway_inference_set( - endpoint, &provider, &model, route_name, no_verify, &tls, + endpoint, + &provider, + &model, + route_name, + verify, + no_verify, + &tls, ) .await?; } @@ -1820,11 +1831,8 @@ async fn main() -> Result<()> { provider, model, system, -<<<<<<< HEAD - no_verify: _, -======= + verify, no_verify, ->>>>>>> 7f0504d8 (feat(inference): validate endpoints before saving routes) } => { let route_name = if system { "sandbox-system" } else { "" }; run::gateway_inference_update( @@ -1832,6 +1840,7 @@ async fn main() -> Result<()> { provider.as_deref(), model.as_deref(), route_name, + verify, no_verify, &tls, ) @@ -2565,7 +2574,7 @@ mod tests { } #[test] - fn inference_set_accepts_no_verify_flag() { + fn inference_set_accepts_verify_flag() { let cli = Cli::try_parse_from([ "openshell", "inference", @@ -2574,40 +2583,34 @@ mod tests { "openai-dev", "--model", "gpt-4.1", - "--no-verify", + "--verify", ]) - .expect("inference set should parse --no-verify"); + .expect("inference set should parse --verify"); assert!(matches!( cli.command, Some(Commands::Inference { - command: Some(InferenceCommands::Set { - no_verify: true, - .. 
- }) + command: Some(InferenceCommands::Set { verify: true, .. }) }) )); } #[test] - fn inference_update_accepts_no_verify_flag() { + fn inference_update_accepts_verify_flag() { let cli = Cli::try_parse_from([ "openshell", "inference", "update", "--provider", "openai-dev", - "--no-verify", + "--verify", ]) - .expect("inference update should parse --no-verify"); + .expect("inference update should parse --verify"); assert!(matches!( cli.command, Some(Commands::Inference { - command: Some(InferenceCommands::Update { - no_verify: true, - .. - }) + command: Some(InferenceCommands::Update { verify: true, .. }) }) )); } @@ -2848,4 +2851,23 @@ mod tests { }) )); } + + #[test] + fn inference_set_rejects_verify_and_no_verify_together() { + let err = Cli::try_parse_from([ + "openshell", + "inference", + "set", + "--provider", + "openai-dev", + "--model", + "gpt-4.1", + "--verify", + "--no-verify", + ]) + .expect_err("verify and no-verify should conflict"); + + assert!(err.to_string().contains("--verify")); + assert!(err.to_string().contains("--no-verify")); + } } diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 878c8c2c..802ce3e5 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -3390,19 +3390,39 @@ pub async fn gateway_inference_set( provider_name: &str, model_id: &str, route_name: &str, - skip_validation: bool, + verify: bool, + no_verify: bool, tls: &TlsOptions, ) -> Result<()> { + let progress = if std::io::stdout().is_terminal() { + let spinner = ProgressBar::new_spinner(); + spinner.set_style( + ProgressStyle::with_template("{spinner:.cyan} {msg} ({elapsed})") + .unwrap_or_else(|_| ProgressStyle::default_spinner()), + ); + spinner.set_message("Configuring inference..."); + spinner.enable_steady_tick(Duration::from_millis(120)); + Some(spinner) + } else { + None + }; + let mut client = grpc_inference_client(server, tls).await?; let response = client .set_cluster_inference(SetClusterInferenceRequest { 
provider_name: provider_name.to_string(), model_id: model_id.to_string(), route_name: route_name.to_string(), - skip_validation, + verify, + no_verify, }) - .await - .into_diagnostic()?; + .await; + + if let Some(progress) = &progress { + progress.finish_and_clear(); + } + + let response = response.into_diagnostic()?; let configured = response.into_inner(); let label = if configured.route_name == "sandbox-system" { @@ -3416,6 +3436,12 @@ pub async fn gateway_inference_set( println!(" {} {}", "Provider:".dimmed(), configured.provider_name); println!(" {} {}", "Model:".dimmed(), configured.model_id); println!(" {} {}", "Version:".dimmed(), configured.version); + if configured.validation_performed { + println!(" {}", "Validated Endpoints:".dimmed()); + for endpoint in configured.validated_endpoints { + println!(" - {} ({})", endpoint.url, endpoint.protocol); + } + } Ok(()) } @@ -3424,7 +3450,8 @@ pub async fn gateway_inference_update( provider_name: Option<&str>, model_id: Option<&str>, route_name: &str, - skip_validation: bool, + verify: bool, + no_verify: bool, tls: &TlsOptions, ) -> Result<()> { if provider_name.is_none() && model_id.is_none() { @@ -3447,15 +3474,34 @@ pub async fn gateway_inference_update( let provider = provider_name.unwrap_or(¤t.provider_name); let model = model_id.unwrap_or(¤t.model_id); + let progress = if std::io::stdout().is_terminal() { + let spinner = ProgressBar::new_spinner(); + spinner.set_style( + ProgressStyle::with_template("{spinner:.cyan} {msg} ({elapsed})") + .unwrap_or_else(|_| ProgressStyle::default_spinner()), + ); + spinner.set_message("Configuring inference..."); + spinner.enable_steady_tick(Duration::from_millis(120)); + Some(spinner) + } else { + None + }; + let response = client .set_cluster_inference(SetClusterInferenceRequest { provider_name: provider.to_string(), model_id: model.to_string(), route_name: route_name.to_string(), - skip_validation, + verify, + no_verify, }) - .await - .into_diagnostic()?; + .await; + + if 
let Some(progress) = &progress { + progress.finish_and_clear(); + } + + let response = response.into_diagnostic()?; let configured = response.into_inner(); let label = if configured.route_name == "sandbox-system" { @@ -3469,6 +3515,12 @@ pub async fn gateway_inference_update( println!(" {} {}", "Provider:".dimmed(), configured.provider_name); println!(" {} {}", "Model:".dimmed(), configured.model_id); println!(" {} {}", "Version:".dimmed(), configured.version); + if configured.validation_performed { + println!(" {}", "Validated Endpoints:".dimmed()); + for endpoint in configured.validated_endpoints { + println!(" - {} ({})", endpoint.url, endpoint.protocol); + } + } Ok(()) } diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 35b704f1..895766e9 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -5,7 +5,8 @@ use openshell_core::inference::AuthHeader; use openshell_core::proto::{ ClusterInferenceConfig, GetClusterInferenceRequest, GetClusterInferenceResponse, GetInferenceBundleRequest, GetInferenceBundleResponse, InferenceRoute, Provider, ResolvedRoute, - SetClusterInferenceRequest, SetClusterInferenceResponse, inference_server::Inference, + SetClusterInferenceRequest, SetClusterInferenceResponse, ValidatedEndpoint, + inference_server::Inference, }; use std::sync::Arc; use std::time::Duration; @@ -78,16 +79,18 @@ impl Inference for InferenceService { ) -> Result, Status> { let req = request.into_inner(); let route_name = effective_route_name(&req.route_name)?; + let verify = req.verify || !req.no_verify; let route = upsert_cluster_inference_route( self.state.store.as_ref(), route_name, &req.provider_name, &req.model_id, - req.skip_validation, + verify, ) .await?; let config = route + .route .config .as_ref() .ok_or_else(|| Status::internal("managed route missing config"))?; @@ -95,8 +98,10 @@ impl Inference for InferenceService { 
Ok(Response::new(SetClusterInferenceResponse { provider_name: config.provider_name.clone(), model_id: config.model_id.clone(), - version: route.version, + version: route.route.version, route_name: route_name.to_string(), + validation_performed: !route.validation.is_empty(), + validated_endpoints: route.validation, })) } @@ -143,8 +148,8 @@ async fn upsert_cluster_inference_route( route_name: &str, provider_name: &str, model_id: &str, - skip_validation: bool, -) -> Result { + verify: bool, +) -> Result { if provider_name.trim().is_empty() { return Err(Status::invalid_argument("provider_name is required")); } @@ -161,9 +166,11 @@ async fn upsert_cluster_inference_route( })?; let resolved = resolve_provider_route(&provider)?; - if !skip_validation { - verify_provider_endpoint(&provider.name, model_id, &resolved).await?; - } + let validation = if verify { + vec![verify_provider_endpoint(&provider.name, model_id, &resolved).await?] + } else { + Vec::new() + }; let config = build_cluster_inference_config(&provider, model_id); @@ -193,7 +200,7 @@ async fn upsert_cluster_inference_route( .await .map_err(|e| Status::internal(format!("persist route failed: {e}")))?; - Ok(route) + Ok(UpsertedInferenceRoute { route, validation }) } fn build_cluster_inference_config(provider: &Provider, model_id: &str) -> ClusterInferenceConfig { @@ -217,6 +224,12 @@ struct ValidationProbe { body: serde_json::Value, } +#[derive(Debug)] +struct UpsertedInferenceRoute { + route: InferenceRoute, + validation: Vec, +} + fn resolve_provider_route(provider: &Provider) -> Result { let provider_type = provider.r#type.trim().to_ascii_lowercase(); @@ -332,6 +345,45 @@ fn validation_probe( ))) } +fn validated_protocol(route: &ResolvedProviderRoute) -> Result { + if route + .protocols + .iter() + .any(|protocol| protocol == "openai_chat_completions") + { + return Ok("openai_chat_completions".to_string()); + } + + if route + .protocols + .iter() + .any(|protocol| protocol == "anthropic_messages") + { + 
return Ok("anthropic_messages".to_string()); + } + + if route + .protocols + .iter() + .any(|protocol| protocol == "openai_responses") + { + return Ok("openai_responses".to_string()); + } + + if route + .protocols + .iter() + .any(|protocol| protocol == "openai_completions") + { + return Ok("openai_completions".to_string()); + } + + Err(Status::failed_precondition(format!( + "provider type '{}' does not expose a writable inference protocol for validation", + route.provider_type + ))) +} + fn validation_url(base_url: &str, path: &str) -> String { let base = base_url.trim_end_matches('/'); if base.ends_with("/v1") && (path == "/v1" || path.starts_with("/v1/")) { @@ -357,8 +409,9 @@ async fn verify_provider_endpoint( provider_name: &str, model_id: &str, route: &ResolvedProviderRoute, -) -> Result<(), Status> { +) -> Result { let probe = validation_probe(route, model_id)?; + let protocol = validated_protocol(route)?; let url = validation_url(&route.base_url, probe.path); let client = reqwest::Client::builder() .timeout(Duration::from_secs(5)) @@ -396,7 +449,7 @@ async fn verify_provider_endpoint( })?; if response.status().is_success() { - return Ok(()); + return Ok(ValidatedEndpoint { url, protocol }); } let status = response.status(); @@ -626,26 +679,26 @@ mod tests { CLUSTER_INFERENCE_ROUTE_NAME, "openai-dev", "gpt-4o", - true, + false, ) .await .expect("first set should succeed"); - assert_eq!(first.name, CLUSTER_INFERENCE_ROUTE_NAME); - assert_eq!(first.version, 1); + assert_eq!(first.route.name, CLUSTER_INFERENCE_ROUTE_NAME); + assert_eq!(first.route.version, 1); let second = upsert_cluster_inference_route( &store, CLUSTER_INFERENCE_ROUTE_NAME, "openai-dev", "gpt-4.1", - true, + false, ) .await .expect("second set should succeed"); - assert_eq!(second.version, 2); - assert_eq!(second.id, first.id); + assert_eq!(second.route.version, 2); + assert_eq!(second.route.id, first.route.id); - let config = second.config.as_ref().expect("config"); + let config = 
second.route.config.as_ref().expect("config"); assert_eq!(config.provider_name, "openai-dev"); assert_eq!(config.model_id, "gpt-4.1"); } @@ -849,14 +902,14 @@ mod tests { SANDBOX_SYSTEM_ROUTE_NAME, "anthropic-dev", "claude-sonnet-4-20250514", - true, + false, ) .await .expect("should succeed"); - assert_eq!(route.name, SANDBOX_SYSTEM_ROUTE_NAME); - assert_eq!(route.version, 1); - let config = route.config.as_ref().expect("config"); + assert_eq!(route.route.name, SANDBOX_SYSTEM_ROUTE_NAME); + assert_eq!(route.route.version, 1); + let config = route.route.config.as_ref().expect("config"); assert_eq!(config.provider_name, "anthropic-dev"); assert_eq!(config.model_id, "claude-sonnet-4-20250514"); } @@ -935,7 +988,7 @@ mod tests { SANDBOX_SYSTEM_ROUTE_NAME, "openai-dev", "gpt-4o-mini", - true, + false, ) .await .expect("upsert should succeed"); @@ -989,7 +1042,7 @@ mod tests { } #[tokio::test] - async fn upsert_cluster_route_verifies_endpoint_by_default() { + async fn upsert_cluster_route_verifies_endpoint_when_requested() { let store = Store::connect("sqlite::memory:?cache=shared") .await .expect("store"); @@ -1029,12 +1082,14 @@ mod tests { CLUSTER_INFERENCE_ROUTE_NAME, "openai-dev", "gpt-4o-mini", - false, + true, ) .await .expect("validation should succeed"); - assert_eq!(route.version, 1); + assert_eq!(route.route.version, 1); + assert_eq!(route.validation.len(), 1); + assert_eq!(route.validation[0].protocol, "openai_chat_completions"); } #[tokio::test] @@ -1068,7 +1123,7 @@ mod tests { CLUSTER_INFERENCE_ROUTE_NAME, "openai-dev", "gpt-4o-mini", - false, + true, ) .await .expect_err("validation should fail"); @@ -1090,7 +1145,7 @@ mod tests { } #[tokio::test] - async fn upsert_cluster_route_skips_validation_when_requested() { + async fn upsert_cluster_route_skips_validation_by_default() { let store = Store::connect("sqlite::memory:?cache=shared") .await .expect("store"); @@ -1112,12 +1167,13 @@ mod tests { CLUSTER_INFERENCE_ROUTE_NAME, "openai-dev", "gpt-4o-mini", - 
true,
+            false,
         )
         .await
-        .expect("skip validation should persist route");
+        .expect("non-verified route should persist");
 
-        assert_eq!(route.version, 1);
+        assert_eq!(route.route.version, 1);
+        assert!(route.validation.is_empty());
     }
 
     #[test]
diff --git a/proto/inference.proto b/proto/inference.proto
index 828927ec..782fa0bb 100644
--- a/proto/inference.proto
+++ b/proto/inference.proto
@@ -56,8 +56,15 @@ message SetClusterInferenceRequest {
   // Route name to target. Empty string defaults to "inference.local" (user-facing).
   // Use "sandbox-system" for the sandbox system-level inference route.
   string route_name = 3;
-  // Skip synchronous endpoint validation before persistence.
-  bool skip_validation = 4;
+  // Verify the resolved upstream endpoint synchronously before persistence.
+  // Mutually exclusive with no_verify.
+  bool verify = 4;
+  // Skip endpoint verification. Mutually exclusive with verify.
+  bool no_verify = 5;
+}
+
+message ValidatedEndpoint {
+  string url = 1;
+  string protocol = 2;
 }
 
 message SetClusterInferenceResponse {
@@ -66,6 +73,10 @@
   uint64 version = 3;
   // Route name that was configured.
   string route_name = 4;
+  // Whether endpoint verification ran as part of this request.
+  bool validation_performed = 5;
+  // The concrete endpoints that were probed during validation, when available.
+ repeated ValidatedEndpoint validated_endpoints = 6; } message GetClusterInferenceRequest { diff --git a/python/openshell/sandbox.py b/python/openshell/sandbox.py index 3399c203..f10f450c 100644 --- a/python/openshell/sandbox.py +++ b/python/openshell/sandbox.py @@ -398,13 +398,18 @@ def set_cluster( *, provider_name: str, model_id: str, + verify: bool = False, no_verify: bool = False, ) -> ClusterInferenceConfig: + if verify and no_verify: + raise ValueError("verify and no_verify are mutually exclusive") + response = self._stub.SetClusterInference( inference_pb2.SetClusterInferenceRequest( provider_name=provider_name, model_id=model_id, - skip_validation=no_verify, + verify=verify, + no_verify=no_verify, ), timeout=self._timeout, ) diff --git a/python/openshell/sandbox_test.py b/python/openshell/sandbox_test.py index 441e69f4..d7052c92 100644 --- a/python/openshell/sandbox_test.py +++ b/python/openshell/sandbox_test.py @@ -6,6 +6,8 @@ import json from typing import TYPE_CHECKING, Any, cast +import pytest + from openshell._proto import openshell_pb2 from openshell.sandbox import ( _PYTHON_CLOUDPICKLE_BOOTSTRAP, @@ -139,7 +141,7 @@ def test_from_active_cluster_prefers_openshell_gateway_env( client.close() -def test_inference_set_cluster_forwards_no_verify_flag() -> None: +def test_inference_set_cluster_forwards_verify_flag() -> None: stub = _FakeInferenceStub() client = cast("InferenceRouteClient", object.__new__(InferenceRouteClient)) client._timeout = 30.0 @@ -148,9 +150,40 @@ def test_inference_set_cluster_forwards_no_verify_flag() -> None: result = client.set_cluster( provider_name="openai-dev", model_id="gpt-4.1", - no_verify=True, + verify=True, ) assert result.provider_name == "openai-dev" assert stub.request is not None - assert stub.request.skip_validation is True + assert stub.request.verify is True + + +def test_inference_set_cluster_forwards_no_verify_flag() -> None: + stub = _FakeInferenceStub() + client = cast("InferenceRouteClient", 
object.__new__(InferenceRouteClient))
+    client._timeout = 30.0
+    client._stub = cast("Any", stub)
+
+    client.set_cluster(
+        provider_name="openai-dev",
+        model_id="gpt-4.1",
+        no_verify=True,
+    )
+
+    assert stub.request is not None
+    assert stub.request.no_verify is True
+
+
+def test_inference_set_cluster_rejects_conflicting_flags() -> None:
+    stub = _FakeInferenceStub()
+    client = cast("InferenceRouteClient", object.__new__(InferenceRouteClient))
+    client._timeout = 30.0
+    client._stub = cast("Any", stub)
+
+    with pytest.raises(ValueError, match="mutually exclusive"):
+        client.set_cluster(
+            provider_name="openai-dev",
+            model_id="gpt-4.1",
+            verify=True,
+            no_verify=True,
+        )

From 707bb87f3f17c41b3979a3d4cc375d40fd8dc866 Mon Sep 17 00:00:00 2001
From: Piotr Mlocek
Date: Fri, 13 Mar 2026 17:11:49 -0700
Subject: [PATCH 03/13] feat(inference): add explicit no-verify flag

---
 crates/openshell-cli/src/main.rs | 69 ++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs
index ee325dcf..51011426 100644
--- a/crates/openshell-cli/src/main.rs
+++ b/crates/openshell-cli/src/main.rs
@@ -2615,6 +2615,75 @@ mod tests {
         ));
     }
 
+    #[test]
+    fn inference_set_accepts_no_verify_flag() {
+        let cli = Cli::try_parse_from([
+            "openshell",
+            "inference",
+            "set",
+            "--provider",
+            "openai-dev",
+            "--model",
+            "gpt-4.1",
+            
"--no-verify", + ]) + .expect("inference set should parse --no-verify"); + + assert!(matches!( + cli.command, + Some(Commands::Inference { + command: Some(InferenceCommands::Set { + no_verify: true, + verify: false, + .. + }) + }) + )); + } + + #[test] + fn inference_update_accepts_no_verify_flag() { + let cli = Cli::try_parse_from([ + "openshell", + "inference", + "update", + "--provider", + "openai-dev", + "--no-verify", + ]) + .expect("inference update should parse --no-verify"); + + assert!(matches!( + cli.command, + Some(Commands::Inference { + command: Some(InferenceCommands::Update { + no_verify: true, + verify: false, + .. + }) + }) + )); + } + + #[test] + fn inference_set_rejects_verify_and_no_verify_together() { + let err = Cli::try_parse_from([ + "openshell", + "inference", + "set", + "--provider", + "openai-dev", + "--model", + "gpt-4.1", + "--verify", + "--no-verify", + ]) + .expect_err("verify and no-verify should conflict"); + + assert!(err.to_string().contains("--verify")); + assert!(err.to_string().contains("--no-verify")); + } + #[test] fn completion_script_uses_openshell_command_name() { let script = normalize_completion_script( From 49e87da29a87d6eac6cbfad9853175a05cfef9c7 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Fri, 13 Mar 2026 17:20:01 -0700 Subject: [PATCH 04/13] feat(inference): increase verification timeout --- crates/openshell-server/src/inference.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 895766e9..885e9748 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -414,7 +414,7 @@ async fn verify_provider_endpoint( let protocol = validated_protocol(route)?; let url = validation_url(&route.base_url, probe.path); let client = reqwest::Client::builder() - .timeout(Duration::from_secs(5)) + .timeout(Duration::from_secs(10)) .build() .map_err(|err| 
Status::internal(format!("build validation client failed: {err}")))?; From 7f1fd427ff2579a62045acbc800a2825141305ab Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Fri, 13 Mar 2026 17:45:53 -0700 Subject: [PATCH 05/13] refactor(inference): share endpoint verification logic --- Cargo.lock | 1 + crates/openshell-router/src/backend.rs | 166 +++++++++++- crates/openshell-router/src/lib.rs | 4 +- crates/openshell-server/Cargo.toml | 1 + crates/openshell-server/src/inference.rs | 315 ++++------------------- 5 files changed, 227 insertions(+), 260 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 051d9e25..3d01356a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2983,6 +2983,7 @@ dependencies = [ "miette", "openshell-core", "openshell-policy", + "openshell-router", "petname", "pin-project-lite", "prost", diff --git a/crates/openshell-router/src/backend.rs b/crates/openshell-router/src/backend.rs index a060d3f9..1bce1fa6 100644 --- a/crates/openshell-router/src/backend.rs +++ b/crates/openshell-router/src/backend.rs @@ -4,6 +4,18 @@ use crate::RouterError; use crate::config::{AuthHeader, ResolvedRoute}; +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ValidatedEndpoint { + pub url: String, + pub protocol: String, +} + +struct ValidationProbe { + path: &'static str, + protocol: &'static str, + body: bytes::Bytes, +} + /// Response from a proxied HTTP request to a backend (fully buffered). 
#[derive(Debug)] pub struct ProxyResponse { @@ -128,6 +140,112 @@ async fn send_backend_request( }) } +fn validation_probe(route: &ResolvedRoute) -> Result { + if route + .protocols + .iter() + .any(|protocol| protocol == "openai_chat_completions") + { + return Ok(ValidationProbe { + path: "/v1/chat/completions", + protocol: "openai_chat_completions", + body: bytes::Bytes::from_static( + br#"{"messages":[{"role":"user","content":"ping"}],"max_tokens":1}"#, + ), + }); + } + + if route + .protocols + .iter() + .any(|protocol| protocol == "anthropic_messages") + { + return Ok(ValidationProbe { + path: "/v1/messages", + protocol: "anthropic_messages", + body: bytes::Bytes::from_static( + br#"{"messages":[{"role":"user","content":"ping"}],"max_tokens":1}"#, + ), + }); + } + + if route + .protocols + .iter() + .any(|protocol| protocol == "openai_responses") + { + return Ok(ValidationProbe { + path: "/v1/responses", + protocol: "openai_responses", + body: bytes::Bytes::from_static(br#"{"input":"ping","max_output_tokens":1}"#), + }); + } + + if route + .protocols + .iter() + .any(|protocol| protocol == "openai_completions") + { + return Ok(ValidationProbe { + path: "/v1/completions", + protocol: "openai_completions", + body: bytes::Bytes::from_static(br#"{"prompt":"ping","max_tokens":1}"#), + }); + } + + Err(RouterError::Internal(format!( + "route '{}' does not expose a writable inference protocol for validation", + route.name + ))) +} + +pub async fn verify_backend_endpoint( + client: &reqwest::Client, + route: &ResolvedRoute, +) -> Result { + let probe = validation_probe(route)?; + let response = + send_backend_request(client, route, "POST", probe.path, Vec::new(), probe.body).await?; + let url = build_backend_url(&route.endpoint, probe.path); + + if response.status().is_success() { + return Ok(ValidatedEndpoint { + url, + protocol: probe.protocol.to_string(), + }); + } + + let status = response.status(); + let body = response.text().await.map_err(|e| { + 
RouterError::UpstreamProtocol(format!("failed to read validation response body: {e}")) + })?; + let body = body.trim(); + let body_suffix = if body.is_empty() { + String::new() + } else { + format!( + " Response body: {}", + body.chars().take(200).collect::() + ) + }; + + let details = match status.as_u16() { + 400 | 404 | 405 | 422 => { + format!("upstream rejected the validation request with HTTP {status}.{body_suffix}") + } + 401 | 403 => { + format!("upstream rejected credentials with HTTP {status}.{body_suffix}") + } + 429 => { + format!("upstream rate-limited the validation request with HTTP {status}.{body_suffix}") + } + 500..=599 => format!("upstream returned HTTP {status}.{body_suffix}"), + _ => format!("upstream returned unexpected HTTP {status}.{body_suffix}"), + }; + + Err(RouterError::UpstreamProtocol(details)) +} + /// Extract status and headers from a [`reqwest::Response`]. fn extract_response_metadata(response: &reqwest::Response) -> (u16, Vec<(String, String)>) { let status = response.status().as_u16(); @@ -201,7 +319,11 @@ fn build_backend_url(endpoint: &str, path: &str) -> String { #[cfg(test)] mod tests { - use super::build_backend_url; + use super::{build_backend_url, verify_backend_endpoint}; + use crate::config::ResolvedRoute; + use openshell_core::inference::AuthHeader; + use wiremock::matchers::{body_partial_json, header, method, path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; #[test] fn build_backend_url_dedupes_v1_prefix() { @@ -226,4 +348,46 @@ mod tests { "https://api.openai.com/v1" ); } + + fn test_route(endpoint: &str, protocols: &[&str], auth: AuthHeader) -> ResolvedRoute { + ResolvedRoute { + name: "inference.local".to_string(), + endpoint: endpoint.to_string(), + model: "test-model".to_string(), + api_key: "sk-test".to_string(), + protocols: protocols.iter().map(|p| (*p).to_string()).collect(), + auth, + default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + } + } + + #[tokio::test] + 
async fn verify_backend_endpoint_uses_route_auth_and_shape() { + let mock_server = MockServer::start().await; + let route = test_route( + &mock_server.uri(), + &["anthropic_messages"], + AuthHeader::Custom("x-api-key"), + ); + + Mock::given(method("POST")) + .and(path("/v1/messages")) + .and(header("x-api-key", "sk-test")) + .and(header("anthropic-version", "2023-06-01")) + .and(body_partial_json(serde_json::json!({ + "model": "test-model", + "max_tokens": 1, + }))) + .respond_with( + ResponseTemplate::new(200).set_body_json(serde_json::json!({"id": "msg_1"})), + ) + .mount(&mock_server) + .await; + + let client = reqwest::Client::builder().build().unwrap(); + let validated = verify_backend_endpoint(&client, &route).await.unwrap(); + + assert_eq!(validated.protocol, "anthropic_messages"); + assert_eq!(validated.url, format!("{}/v1/messages", mock_server.uri())); + } } diff --git a/crates/openshell-router/src/lib.rs b/crates/openshell-router/src/lib.rs index 4edd4f87..fe4865c2 100644 --- a/crates/openshell-router/src/lib.rs +++ b/crates/openshell-router/src/lib.rs @@ -7,7 +7,9 @@ mod mock; use std::time::Duration; -pub use backend::{ProxyResponse, StreamingProxyResponse}; +pub use backend::{ + ProxyResponse, StreamingProxyResponse, ValidatedEndpoint, verify_backend_endpoint, +}; use config::{ResolvedRoute, RouterConfig}; use tracing::info; diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index c4935024..7bd72113 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -17,6 +17,7 @@ path = "src/main.rs" [dependencies] openshell-core = { path = "../openshell-core" } openshell-policy = { path = "../openshell-policy" } +openshell-router = { path = "../openshell-router" } # Async runtime tokio = { workspace = true } diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 885e9748..51b5c512 100644 --- a/crates/openshell-server/src/inference.rs +++ 
b/crates/openshell-server/src/inference.rs @@ -1,13 +1,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -use openshell_core::inference::AuthHeader; use openshell_core::proto::{ ClusterInferenceConfig, GetClusterInferenceRequest, GetClusterInferenceResponse, GetInferenceBundleRequest, GetInferenceBundleResponse, InferenceRoute, Provider, ResolvedRoute, SetClusterInferenceRequest, SetClusterInferenceResponse, ValidatedEndpoint, inference_server::Inference, }; +use openshell_router::config::ResolvedRoute as RouterResolvedRoute; +use openshell_router::{RouterError, verify_backend_endpoint}; use std::sync::Arc; use std::time::Duration; use tonic::{Request, Response, Status}; @@ -212,16 +213,7 @@ fn build_cluster_inference_config(provider: &Provider, model_id: &str) -> Cluste struct ResolvedProviderRoute { provider_type: String, - base_url: String, - protocols: Vec, - api_key: String, - auth: AuthHeader, - default_headers: Vec<(String, String)>, -} - -struct ValidationProbe { - path: &'static str, - body: serde_json::Value, + route: RouterResolvedRoute, } #[derive(Debug)] @@ -263,136 +255,22 @@ fn resolve_provider_route(provider: &Provider) -> Result Result { - if route - .protocols - .iter() - .any(|protocol| protocol == "openai_chat_completions") - { - return Ok(ValidationProbe { - path: "/v1/chat/completions", - body: serde_json::json!({ - "model": model_id, - "messages": [{"role": "user", "content": "ping"}], - "max_tokens": 1, - }), - }); - } - - if route - .protocols - .iter() - .any(|protocol| protocol == "anthropic_messages") - { - return Ok(ValidationProbe { - path: "/v1/messages", - body: serde_json::json!({ - "model": model_id, - "messages": [{"role": "user", "content": "ping"}], - "max_tokens": 1, - }), - }); - } - - if route - .protocols - .iter() - .any(|protocol| protocol == "openai_responses") - { - return Ok(ValidationProbe { - path: "/v1/responses", - 
body: serde_json::json!({ - "model": model_id, - "input": "ping", - "max_output_tokens": 1, - }), - }); - } - - if route - .protocols - .iter() - .any(|protocol| protocol == "openai_completions") - { - return Ok(ValidationProbe { - path: "/v1/completions", - body: serde_json::json!({ - "model": model_id, - "prompt": "ping", - "max_tokens": 1, - }), - }); - } - - Err(Status::failed_precondition(format!( - "provider type '{}' does not expose a writable inference protocol for validation", - route.provider_type - ))) -} - -fn validated_protocol(route: &ResolvedProviderRoute) -> Result { - if route - .protocols - .iter() - .any(|protocol| protocol == "openai_chat_completions") - { - return Ok("openai_chat_completions".to_string()); - } - - if route - .protocols - .iter() - .any(|protocol| protocol == "anthropic_messages") - { - return Ok("anthropic_messages".to_string()); - } - - if route - .protocols - .iter() - .any(|protocol| protocol == "openai_responses") - { - return Ok("openai_responses".to_string()); - } - - if route - .protocols - .iter() - .any(|protocol| protocol == "openai_completions") - { - return Ok("openai_completions".to_string()); - } - - Err(Status::failed_precondition(format!( - "provider type '{}' does not expose a writable inference protocol for validation", - route.provider_type - ))) -} - -fn validation_url(base_url: &str, path: &str) -> String { - let base = base_url.trim_end_matches('/'); - if base.ends_with("/v1") && (path == "/v1" || path.starts_with("/v1/")) { - return format!("{base}{}", &path[3..]); - } - - format!("{base}{path}") -} - fn validation_failure( provider_name: &str, model_id: &str, @@ -405,97 +283,55 @@ fn validation_failure( )) } +fn validation_next_steps(details: &str) -> &'static str { + if details.contains("credentials") { + return "verify the provider API key and any required auth headers"; + } + if details.contains("rate-limited") { + return "retry later or verify quota/limits on the upstream provider"; + } + if 
details.contains("validation request") || details.contains("unexpected HTTP") { + return "confirm the provider type, base URL, and model identifier"; + } + if details.contains("failed to connect") || details.contains("timed out") { + return "check that the service is running, confirm the base URL and protocol, and verify credentials"; + } + if details.contains("upstream returned HTTP") { + return "check whether the endpoint is healthy and serving requests"; + } + "confirm the endpoint URL, protocol, credentials, and model identifier" +} + async fn verify_provider_endpoint( provider_name: &str, model_id: &str, route: &ResolvedProviderRoute, ) -> Result { - let probe = validation_probe(route, model_id)?; - let protocol = validated_protocol(route)?; - let url = validation_url(&route.base_url, probe.path); let client = reqwest::Client::builder() .timeout(Duration::from_secs(10)) .build() .map_err(|err| Status::internal(format!("build validation client failed: {err}")))?; + let mut route = route.route.clone(); + route.model = model_id.to_string(); - let mut request = client - .post(&url) - .header(reqwest::header::CONTENT_TYPE, "application/json"); - request = match &route.auth { - AuthHeader::Bearer => request.bearer_auth(&route.api_key), - AuthHeader::Custom(name) => request.header(*name, &route.api_key), - }; - - for (name, value) in &route.default_headers { - request = request.header(name, value); - } - - let response = request.json(&probe.body).send().await.map_err(|err| { - let details = if err.is_timeout() { - format!("request to {url} timed out") - } else if err.is_connect() { - format!("failed to connect to {url}: {err}") - } else { - format!("request to {url} failed: {err}") - }; - - validation_failure( - provider_name, - model_id, - &route.base_url, - &details, - "check that the service is running, confirm the base URL and protocol, and verify credentials", - ) - })?; - - if response.status().is_success() { - return Ok(ValidatedEndpoint { url, protocol }); - 
} - - let status = response.status(); - let body = response.text().await.unwrap_or_default(); - let body = body.trim(); - let body_suffix = if body.is_empty() { - String::new() - } else { - format!( - " Response body: {}", - body.chars().take(200).collect::() - ) - }; - - let (details, next_steps) = match status.as_u16() { - 400 | 404 | 405 | 422 => ( - format!("upstream rejected the validation request with HTTP {status}.{body_suffix}"), - "confirm the provider type, base URL, and model identifier", - ), - 401 | 403 => ( - format!("upstream rejected credentials with HTTP {status}.{body_suffix}"), - "verify the provider API key and any required auth headers", - ), - 429 => ( - format!( - "upstream rate-limited the validation request with HTTP {status}.{body_suffix}" + verify_backend_endpoint(&client, &route) + .await + .map(|validated| ValidatedEndpoint { + url: validated.url, + protocol: validated.protocol, + }) + .map_err(|err| match err { + RouterError::Internal(details) + | RouterError::UpstreamUnavailable(details) + | RouterError::UpstreamProtocol(details) => validation_failure( + provider_name, + model_id, + &route.endpoint, + &details, + validation_next_steps(&details), ), - "retry later or verify quota/limits on the upstream provider", - ), - 500..=599 => ( - format!("upstream returned HTTP {status}.{body_suffix}"), - "check whether the endpoint is healthy and serving requests", - ), - _ => ( - format!("upstream returned unexpected HTTP {status}.{body_suffix}"), - "confirm the endpoint URL, protocol, credentials, and model identifier", - ), - }; - - Err(validation_failure( - provider_name, - model_id, - &route.base_url, - &details, - next_steps, - )) + other => Status::internal(format!("unexpected validation router error: {other}")), + }) } fn find_provider_api_key(provider: &Provider, preferred_key_names: &[&str]) -> Option { @@ -612,10 +448,10 @@ async fn resolve_route_by_name( Ok(Some(ResolvedRoute { name: route_name.to_string(), - base_url: 
resolved.base_url, + base_url: resolved.route.endpoint, model_id: config.model_id.clone(), - api_key: resolved.api_key, - protocols: resolved.protocols, + api_key: resolved.route.api_key, + protocols: resolved.route.protocols, provider_type: resolved.provider_type, })) } @@ -1004,43 +840,6 @@ mod tests { assert_eq!(config.model_id, "gpt-4o-mini"); } - #[test] - fn openai_validation_probe_uses_lightweight_chat_shape() { - let route = ResolvedProviderRoute { - provider_type: "openai".to_string(), - base_url: "https://api.openai.com/v1".to_string(), - protocols: vec!["openai_chat_completions".to_string()], - api_key: "sk-test".to_string(), - auth: AuthHeader::Bearer, - default_headers: Vec::new(), - }; - - let probe = validation_probe(&route, "gpt-4.1").expect("probe should build"); - - assert_eq!(probe.path, "/v1/chat/completions"); - assert_eq!(probe.body["model"], "gpt-4.1"); - assert_eq!(probe.body["max_tokens"], 1); - } - - #[test] - fn anthropic_validation_probe_uses_messages_shape() { - let route = ResolvedProviderRoute { - provider_type: "anthropic".to_string(), - base_url: "https://api.anthropic.com/v1".to_string(), - protocols: vec!["anthropic_messages".to_string()], - api_key: "sk-test".to_string(), - auth: AuthHeader::Custom("x-api-key"), - default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], - }; - - let probe = - validation_probe(&route, "claude-sonnet-4-20250514").expect("probe should build"); - - assert_eq!(probe.path, "/v1/messages"); - assert_eq!(probe.body["model"], "claude-sonnet-4-20250514"); - assert_eq!(probe.body["max_tokens"], 1); - } - #[tokio::test] async fn upsert_cluster_route_verifies_endpoint_when_requested() { let store = Store::connect("sqlite::memory:?cache=shared") From 23940f93ff6e7bc75b0a0682b90efb53f7eeceac Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Fri, 13 Mar 2026 17:58:29 -0700 Subject: [PATCH 06/13] refactor(inference): classify validation failures --- 
crates/openshell-router/src/backend.rs | 69 ++++++++++++++++++++---- crates/openshell-router/src/lib.rs | 3 +- crates/openshell-server/src/inference.rs | 51 +++++++++--------- 3 files changed, 86 insertions(+), 37 deletions(-) diff --git a/crates/openshell-router/src/backend.rs b/crates/openshell-router/src/backend.rs index 1bce1fa6..bfaae625 100644 --- a/crates/openshell-router/src/backend.rs +++ b/crates/openshell-router/src/backend.rs @@ -10,6 +10,22 @@ pub struct ValidatedEndpoint { pub protocol: String, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ValidationFailureKind { + RequestShape, + Credentials, + RateLimited, + Connectivity, + UpstreamHealth, + Unexpected, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ValidationFailure { + pub kind: ValidationFailureKind, + pub details: String, +} + struct ValidationProbe { path: &'static str, protocol: &'static str, @@ -140,7 +156,7 @@ async fn send_backend_request( }) } -fn validation_probe(route: &ResolvedRoute) -> Result<ValidationProbe, RouterError> { +fn validation_probe(route: &ResolvedRoute) -> Result<ValidationProbe, ValidationFailure> { if route .protocols .iter() @@ -193,19 +209,40 @@ fn validation_probe(route: &ResolvedRoute) -> Result<ValidationProbe, RouterError> { pub async fn verify_backend_endpoint( client: &reqwest::Client, route: &ResolvedRoute, -) -> Result<ValidatedEndpoint, RouterError> { +) -> Result<ValidatedEndpoint, ValidationFailure> { let probe = validation_probe(route)?; let response = send_backend_request(client, route, "POST", probe.path, Vec::new(), probe.body) .await .map_err(|err| match err { RouterError::UpstreamUnavailable(details) => ValidationFailure { kind: ValidationFailureKind::Connectivity, details, }, RouterError::Internal(details) | RouterError::UpstreamProtocol(details) => { ValidationFailure { kind: ValidationFailureKind::Unexpected, details, } } RouterError::RouteNotFound(details) | RouterError::NoCompatibleRoute(details) | RouterError::Unauthorized(details) => ValidationFailure { kind: ValidationFailureKind::Unexpected, details, }, })?; let url = 
build_backend_url(&route.endpoint, probe.path); if response.status().is_success() { @@ -216,8 +253,9 @@ pub async fn verify_backend_endpoint( } let status = response.status(); - let body = response.text().await.map_err(|e| { - RouterError::UpstreamProtocol(format!("failed to read validation response body: {e}")) + let body = response.text().await.map_err(|e| ValidationFailure { + kind: ValidationFailureKind::Unexpected, + details: format!("failed to read validation response body: {e}"), })?; let body = body.trim(); let body_suffix = if body.is_empty() { @@ -243,7 +281,16 @@ pub async fn verify_backend_endpoint( _ => format!("upstream returned unexpected HTTP {status}.{body_suffix}"), }; - Err(RouterError::UpstreamProtocol(details)) + Err(ValidationFailure { + kind: match status.as_u16() { + 400 | 404 | 405 | 422 => ValidationFailureKind::RequestShape, + 401 | 403 => ValidationFailureKind::Credentials, + 429 => ValidationFailureKind::RateLimited, + 500..=599 => ValidationFailureKind::UpstreamHealth, + _ => ValidationFailureKind::Unexpected, + }, + details, + }) } /// Extract status and headers from a [`reqwest::Response`]. 
diff --git a/crates/openshell-router/src/lib.rs b/crates/openshell-router/src/lib.rs index fe4865c2..a5712d9a 100644 --- a/crates/openshell-router/src/lib.rs +++ b/crates/openshell-router/src/lib.rs @@ -8,7 +8,8 @@ mod mock; use std::time::Duration; pub use backend::{ - ProxyResponse, StreamingProxyResponse, ValidatedEndpoint, verify_backend_endpoint, + ProxyResponse, StreamingProxyResponse, ValidatedEndpoint, ValidationFailure, + ValidationFailureKind, verify_backend_endpoint, }; use config::{ResolvedRoute, RouterConfig}; use tracing::info; diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 51b5c512..07fa1753 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -8,7 +8,7 @@ use openshell_core::proto::{ inference_server::Inference, }; use openshell_router::config::ResolvedRoute as RouterResolvedRoute; -use openshell_router::{RouterError, verify_backend_endpoint}; +use openshell_router::{ValidationFailureKind, verify_backend_endpoint}; use std::sync::Arc; use std::time::Duration; use tonic::{Request, Response, Status}; @@ -283,23 +283,27 @@ fn validation_failure( )) } -fn validation_next_steps(details: &str) -> &'static str { - if details.contains("credentials") { - return "verify the provider API key and any required auth headers"; - } - if details.contains("rate-limited") { - return "retry later or verify quota/limits on the upstream provider"; - } - if details.contains("validation request") || details.contains("unexpected HTTP") { - return "confirm the provider type, base URL, and model identifier"; - } - if details.contains("failed to connect") || details.contains("timed out") { - return "check that the service is running, confirm the base URL and protocol, and verify credentials"; - } - if details.contains("upstream returned HTTP") { - return "check whether the endpoint is healthy and serving requests"; +fn validation_next_steps(kind: ValidationFailureKind) -> 
&'static str { + match kind { + ValidationFailureKind::Credentials => { + "verify the provider API key and any required auth headers" + } + ValidationFailureKind::RateLimited => { + "retry later or verify quota/limits on the upstream provider" + } + ValidationFailureKind::RequestShape => { + "confirm the provider type, base URL, and model identifier" + } + ValidationFailureKind::Connectivity => { + "check that the service is running, confirm the base URL and protocol, and verify credentials" + } + ValidationFailureKind::UpstreamHealth => { + "check whether the endpoint is healthy and serving requests" + } + ValidationFailureKind::Unexpected => { + "confirm the endpoint URL, protocol, credentials, and model identifier" + } } - "confirm the endpoint URL, protocol, credentials, and model identifier" } async fn verify_provider_endpoint( @@ -320,17 +324,14 @@ async fn verify_provider_endpoint( url: validated.url, protocol: validated.protocol, }) - .map_err(|err| match err { - RouterError::Internal(details) - | RouterError::UpstreamUnavailable(details) - | RouterError::UpstreamProtocol(details) => validation_failure( + .map_err(|err| { + validation_failure( provider_name, model_id, &route.endpoint, - &details, - validation_next_steps(&details), - ), - other => Status::internal(format!("unexpected validation router error: {other}")), + &err.details, + validation_next_steps(err.kind), + ) }) } From 1fa174f69bce1c4f367d429a3068a4dcf0dc2910 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Sun, 15 Mar 2026 16:50:32 -0700 Subject: [PATCH 07/13] feat(inference): verify routes by default --- architecture/inference-routing.md | 2 + crates/openshell-cli/src/main.rs | 83 +----------------------- crates/openshell-server/src/inference.rs | 2 +- docs/inference/configure.md | 2 +- proto/inference.proto | 2 + 5 files changed, 7 insertions(+), 84 deletions(-) diff --git a/architecture/inference-routing.md b/architecture/inference-routing.md index 25d8dee6..81cc3ff4 100644 --- 
a/architecture/inference-routing.md +++ b/architecture/inference-routing.md @@ -304,6 +304,8 @@ Cluster inference commands: The `--provider` flag references a provider record name (not a provider type). The provider must already exist in the cluster and have a supported inference type (`openai`, `anthropic`, or `nvidia`). +Inference writes verify by default. `--no-verify` is the explicit opt-out for endpoints that are not up yet. + ## Provider Discovery Files: diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 51011426..21dc4b60 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1813,21 +1813,11 @@ async fn main() -> Result<()> { model, system, verify, -<<<<<<< HEAD no_verify, -======= - no_verify: _, ->>>>>>> 3e3273e3 (feat(inference): add explicit no-verify flag) } => { let route_name = if system { "sandbox-system" } else { "" }; run::gateway_inference_set( - endpoint, - &provider, - &model, - route_name, - verify, - no_verify, - &tls, + endpoint, &provider, &model, route_name, verify, no_verify, &tls, ) .await?; } @@ -1836,11 +1826,7 @@ async fn main() -> Result<()> { model, system, verify, -<<<<<<< HEAD no_verify, -======= - no_verify: _, ->>>>>>> 3e3273e3 (feat(inference): add explicit no-verify flag) } => { let route_name = if system { "sandbox-system" } else { "" }; run::gateway_inference_update( @@ -2880,71 +2866,4 @@ mod tests { other => panic!("expected SshProxy, got: {other:?}"), } } - - #[test] - fn inference_set_accepts_no_verify_flag() { - let cli = Cli::try_parse_from([ - "openshell", - "inference", - "set", - "--provider", - "openai-dev", - "--model", - "gpt-4.1", - "--no-verify", - ]) - .expect("inference set should parse --no-verify"); - - assert!(matches!( - cli.command, - Some(Commands::Inference { - command: Some(InferenceCommands::Set { - no_verify: true, - .. 
- }) - }) - )); - } - - #[test] - fn inference_update_accepts_no_verify_flag() { - let cli = Cli::try_parse_from([ - "openshell", - "inference", - "update", - "--provider", - "openai-dev", - "--no-verify", - ]) - .expect("inference update should parse --no-verify"); - - assert!(matches!( - cli.command, - Some(Commands::Inference { - command: Some(InferenceCommands::Update { - no_verify: true, - .. - }) - }) - )); - } - - #[test] - fn inference_set_rejects_verify_and_no_verify_together() { - let err = Cli::try_parse_from([ - "openshell", - "inference", - "set", - "--provider", - "openai-dev", - "--model", - "gpt-4.1", - "--verify", - "--no-verify", - ]) - .expect_err("verify and no-verify should conflict"); - - assert!(err.to_string().contains("--verify")); - assert!(err.to_string().contains("--no-verify")); - } } diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 07fa1753..45f233dc 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -312,7 +312,7 @@ async fn verify_provider_endpoint( route: &ResolvedProviderRoute, ) -> Result { let client = reqwest::Client::builder() - .timeout(Duration::from_secs(10)) + .timeout(Duration::from_secs(15)) .build() .map_err(|err| Status::internal(format!("build validation client failed: {err}")))?; let mut route = route.route.clone(); diff --git a/docs/inference/configure.md b/docs/inference/configure.md index 9d64b622..bf0103a7 100644 --- a/docs/inference/configure.md +++ b/docs/inference/configure.md @@ -135,7 +135,7 @@ Use this endpoint when inference should stay local to the host for privacy and s ### Verify the Endpoint from a Sandbox -`openshell inference set` and `openshell inference update` verify the upstream endpoint by default before saving the configuration. If the endpoint is not live yet, try again with `--no-verify` to persist the route without the probe. 
+`openshell inference set` and `openshell inference update` verify the resolved upstream endpoint by default before saving the configuration. If the endpoint is not live yet, retry with `--no-verify` to persist the route without the probe. `openshell inference get` confirms the current saved configuration. To confirm end-to-end connectivity from a sandbox, run: diff --git a/proto/inference.proto b/proto/inference.proto index 782fa0bb..a15f4b84 100644 --- a/proto/inference.proto +++ b/proto/inference.proto @@ -58,6 +58,8 @@ message SetClusterInferenceRequest { string route_name = 3; // Verify the resolved upstream endpoint synchronously before persistence. bool verify = 4; + // Skip synchronous endpoint validation before persistence. + bool no_verify = 5; } message ValidatedEndpoint { From 557fa1a70b815e0345f31c25659a3f4cf98a1fe0 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Sun, 15 Mar 2026 17:30:24 -0700 Subject: [PATCH 08/13] refactor(inference): drop explicit verify flag --- crates/openshell-cli/src/main.rs | 80 +----------------------- crates/openshell-cli/src/run.rs | 6 +- crates/openshell-server/src/inference.rs | 2 +- python/openshell/sandbox.py | 5 -- python/openshell/sandbox_test.py | 34 ---------- 5 files changed, 6 insertions(+), 121 deletions(-) diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 21dc4b60..8995c3df 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -905,12 +905,8 @@ enum InferenceCommands { #[arg(long)] system: bool, - /// Verify the resolved upstream endpoint before saving the route. - #[arg(long, conflicts_with = "no_verify")] - verify: bool, - /// Skip endpoint verification before saving the route. - #[arg(long, conflicts_with = "verify")] + #[arg(long)] no_verify: bool, }, @@ -929,12 +925,8 @@ enum InferenceCommands { #[arg(long)] system: bool, - /// Verify the resolved upstream endpoint before saving the route. 
- #[arg(long, conflicts_with = "no_verify")] - verify: bool, - /// Skip endpoint verification before saving the route. - #[arg(long, conflicts_with = "verify")] + #[arg(long)] no_verify: bool, }, @@ -1812,12 +1804,11 @@ async fn main() -> Result<()> { provider, model, system, - verify, no_verify, } => { let route_name = if system { "sandbox-system" } else { "" }; run::gateway_inference_set( - endpoint, &provider, &model, route_name, verify, no_verify, &tls, + endpoint, &provider, &model, route_name, no_verify, &tls, ) .await?; } @@ -1825,7 +1816,6 @@ async fn main() -> Result<()> { provider, model, system, - verify, no_verify, } => { let route_name = if system { "sandbox-system" } else { "" }; @@ -1834,7 +1824,6 @@ async fn main() -> Result<()> { provider.as_deref(), model.as_deref(), route_name, - verify, no_verify, &tls, ) @@ -2567,48 +2556,6 @@ mod tests { )); } - #[test] - fn inference_set_accepts_verify_flag() { - let cli = Cli::try_parse_from([ - "openshell", - "inference", - "set", - "--provider", - "openai-dev", - "--model", - "gpt-4.1", - "--verify", - ]) - .expect("inference set should parse --verify"); - - assert!(matches!( - cli.command, - Some(Commands::Inference { - command: Some(InferenceCommands::Set { verify: true, .. }) - }) - )); - } - - #[test] - fn inference_update_accepts_verify_flag() { - let cli = Cli::try_parse_from([ - "openshell", - "inference", - "update", - "--provider", - "openai-dev", - "--verify", - ]) - .expect("inference update should parse --verify"); - - assert!(matches!( - cli.command, - Some(Commands::Inference { - command: Some(InferenceCommands::Update { verify: true, .. }) - }) - )); - } - #[test] fn inference_set_accepts_no_verify_flag() { let cli = Cli::try_parse_from([ @@ -2628,7 +2575,6 @@ mod tests { Some(Commands::Inference { command: Some(InferenceCommands::Set { no_verify: true, - verify: false, .. 
}) }) @@ -2652,32 +2598,12 @@ mod tests { Some(Commands::Inference { command: Some(InferenceCommands::Update { no_verify: true, - verify: false, .. }) }) )); } - #[test] - fn inference_set_rejects_verify_and_no_verify_together() { - let err = Cli::try_parse_from([ - "openshell", - "inference", - "set", - "--provider", - "openai-dev", - "--model", - "gpt-4.1", - "--verify", - "--no-verify", - ]) - .expect_err("verify and no-verify should conflict"); - - assert!(err.to_string().contains("--verify")); - assert!(err.to_string().contains("--no-verify")); - } - #[test] fn completion_script_uses_openshell_command_name() { let script = normalize_completion_script( diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 802ce3e5..d17f4c36 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -3390,7 +3390,6 @@ pub async fn gateway_inference_set( provider_name: &str, model_id: &str, route_name: &str, - verify: bool, no_verify: bool, tls: &TlsOptions, ) -> Result<()> { @@ -3413,7 +3412,7 @@ pub async fn gateway_inference_set( provider_name: provider_name.to_string(), model_id: model_id.to_string(), route_name: route_name.to_string(), - verify, + verify: false, no_verify, }) .await; @@ -3450,7 +3449,6 @@ pub async fn gateway_inference_update( provider_name: Option<&str>, model_id: Option<&str>, route_name: &str, - verify: bool, no_verify: bool, tls: &TlsOptions, ) -> Result<()> { @@ -3492,7 +3490,7 @@ pub async fn gateway_inference_update( provider_name: provider.to_string(), model_id: model.to_string(), route_name: route_name.to_string(), - verify, + verify: false, no_verify, }) .await; diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 45f233dc..55f43944 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -80,7 +80,7 @@ impl Inference for InferenceService { ) -> Result, Status> { let req = request.into_inner(); 
let route_name = effective_route_name(&req.route_name)?; - let verify = req.verify || !req.no_verify; + let verify = !req.no_verify; let route = upsert_cluster_inference_route( self.state.store.as_ref(), route_name, diff --git a/python/openshell/sandbox.py b/python/openshell/sandbox.py index f10f450c..19bdcdf6 100644 --- a/python/openshell/sandbox.py +++ b/python/openshell/sandbox.py @@ -398,17 +398,12 @@ def set_cluster( *, provider_name: str, model_id: str, - verify: bool = False, no_verify: bool = False, ) -> ClusterInferenceConfig: - if verify and no_verify: - raise ValueError("verify and no_verify are mutually exclusive") - response = self._stub.SetClusterInference( inference_pb2.SetClusterInferenceRequest( provider_name=provider_name, model_id=model_id, - verify=verify, no_verify=no_verify, ), timeout=self._timeout, diff --git a/python/openshell/sandbox_test.py b/python/openshell/sandbox_test.py index d7052c92..c0148dcc 100644 --- a/python/openshell/sandbox_test.py +++ b/python/openshell/sandbox_test.py @@ -6,8 +6,6 @@ import json from typing import TYPE_CHECKING, Any, cast -import pytest - from openshell._proto import openshell_pb2 from openshell.sandbox import ( _PYTHON_CLOUDPICKLE_BOOTSTRAP, @@ -141,23 +139,6 @@ def test_from_active_cluster_prefers_openshell_gateway_env( client.close() -def test_inference_set_cluster_forwards_verify_flag() -> None: - stub = _FakeInferenceStub() - client = cast("InferenceRouteClient", object.__new__(InferenceRouteClient)) - client._timeout = 30.0 - client._stub = cast("Any", stub) - - result = client.set_cluster( - provider_name="openai-dev", - model_id="gpt-4.1", - verify=True, - ) - - assert result.provider_name == "openai-dev" - assert stub.request is not None - assert stub.request.verify is True - - def test_inference_set_cluster_forwards_no_verify_flag() -> None: stub = _FakeInferenceStub() client = cast("InferenceRouteClient", object.__new__(InferenceRouteClient)) @@ -172,18 +153,3 @@ def 
test_inference_set_cluster_forwards_no_verify_flag() -> None: assert stub.request is not None assert stub.request.no_verify is True - - -def test_inference_set_cluster_rejects_conflicting_flags() -> None: - stub = _FakeInferenceStub() - client = cast("InferenceRouteClient", object.__new__(InferenceRouteClient)) - client._timeout = 30.0 - client._stub = cast("Any", stub) - - with pytest.raises(ValueError, match="mutually exclusive"): - client.set_cluster( - provider_name="openai-dev", - model_id="gpt-4.1", - verify=True, - no_verify=True, - ) From 8ced87ca0c098dfb49a1584d0307e720e5d6ab12 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Sun, 15 Mar 2026 17:36:01 -0700 Subject: [PATCH 09/13] test(e2e): cover inference endpoint verification --- e2e/rust/tests/host_gateway_alias.rs | 111 +++++++++++++++++++++++---- 1 file changed, 95 insertions(+), 16 deletions(-) diff --git a/e2e/rust/tests/host_gateway_alias.rs b/e2e/rust/tests/host_gateway_alias.rs index 76d8be57..547a9238 100644 --- a/e2e/rust/tests/host_gateway_alias.rs +++ b/e2e/rust/tests/host_gateway_alias.rs @@ -16,6 +16,7 @@ use tempfile::NamedTempFile; use tokio::time::{interval, timeout}; const INFERENCE_PROVIDER_NAME: &str = "e2e-host-inference"; +const INFERENCE_PROVIDER_UNREACHABLE_NAME: &str = "e2e-host-inference-unreachable"; const TEST_SERVER_IMAGE: &str = "python:3.13-alpine"; static INFERENCE_ROUTE_LOCK: Mutex<()> = Mutex::new(()); @@ -177,6 +178,22 @@ async fn delete_provider(name: &str) { let _ = cmd.status().await; } +async fn create_openai_provider(name: &str, base_url: &str) -> Result { + run_cli(&[ + "provider", + "create", + "--name", + name, + "--type", + "openai", + "--credential", + "OPENAI_API_KEY=dummy", + "--config", + &format!("OPENAI_BASE_URL={base_url}"), + ]) + .await +} + fn write_policy(port: u16) -> Result { let mut file = NamedTempFile::new().map_err(|e| format!("create temp policy file: {e}"))?; let policy = format!( @@ -282,36 +299,33 @@ async fn 
sandbox_inference_local_routes_to_host_openshell_internal() { delete_provider(INFERENCE_PROVIDER_NAME).await; } - run_cli(&[ - "provider", - "create", - "--name", + let create_output = create_openai_provider( INFERENCE_PROVIDER_NAME, - "--type", - "openai", - "--credential", - "OPENAI_API_KEY=dummy", - "--config", - &format!( - "OPENAI_BASE_URL=http://host.openshell.internal:{}/v1", - server.port - ), - ]) + &format!("http://host.openshell.internal:{}/v1", server.port), + ) .await .expect("create host-backed OpenAI provider"); - run_cli(&[ + let inference_output = run_cli(&[ "inference", "set", "--provider", INFERENCE_PROVIDER_NAME, "--model", "host-echo-model", - "--no-verify", ]) .await .expect("point inference.local at host-backed provider"); + assert!( + inference_output.contains("Validated Endpoints:"), + "expected verification details in output:\n{inference_output}" + ); + assert!( + inference_output.contains("/v1/chat/completions (openai_chat_completions)"), + "expected validated endpoint in output:\n{inference_output}" + ); + let guard = SandboxGuard::create(&[ "--", "curl", @@ -338,4 +352,69 @@ async fn sandbox_inference_local_routes_to_host_openshell_internal() { "expected sandbox to receive echoed inference content:\n{}", guard.create_output ); + + let _ = create_output; +} + +#[tokio::test] +async fn inference_set_supports_no_verify_for_unreachable_endpoint() { + let _inference_lock = INFERENCE_ROUTE_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + + let current_inference = run_cli(&["inference", "get"]) + .await + .expect("read current inference config"); + if !current_inference.contains("Not configured") { + eprintln!("Skipping test: existing inference config would make shared state unsafe"); + return; + } + + if provider_exists(INFERENCE_PROVIDER_UNREACHABLE_NAME).await { + delete_provider(INFERENCE_PROVIDER_UNREACHABLE_NAME).await; + } + + create_openai_provider( + INFERENCE_PROVIDER_UNREACHABLE_NAME, + 
"http://host.openshell.internal:9/v1", + ) + .await + .expect("create unreachable OpenAI provider"); + + let verify_err = run_cli(&[ + "inference", + "set", + "--provider", + INFERENCE_PROVIDER_UNREACHABLE_NAME, + "--model", + "host-echo-model", + ]) + .await + .expect_err("default verification should fail for unreachable endpoint"); + + assert!( + verify_err.contains("failed to verify inference endpoint"), + "expected verification failure output:\n{verify_err}" + ); + assert!( + verify_err.contains("--no-verify"), + "expected retry hint in failure output:\n{verify_err}" + ); + + let no_verify_output = run_cli(&[ + "inference", + "set", + "--provider", + INFERENCE_PROVIDER_UNREACHABLE_NAME, + "--model", + "host-echo-model", + "--no-verify", + ]) + .await + .expect("no-verify should bypass validation"); + + assert!( + !no_verify_output.contains("Validated Endpoints:"), + "did not expect validation output when bypassing verification:\n{no_verify_output}" + ); } From d24916c0f6cb5fb34c0a93d58f94076af75807b7 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Sun, 15 Mar 2026 17:51:03 -0700 Subject: [PATCH 10/13] fix(cli): improve inference verification feedback --- crates/openshell-cli/src/run.rs | 16 +++++++++++++--- crates/openshell-server/src/inference.rs | 4 ++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index d17f4c36..22123c1d 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -40,7 +40,7 @@ use std::io::{IsTerminal, Write}; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; -use tonic::Code; +use tonic::{Code, Status}; // Re-export SSH functions for backward compatibility pub use crate::ssh::{Editor, print_ssh_config}; @@ -3421,7 +3421,7 @@ pub async fn gateway_inference_set( progress.finish_and_clear(); } - let response = response.into_diagnostic()?; + let response = 
response.map_err(format_inference_status)?; let configured = response.into_inner(); let label = if configured.route_name == "sandbox-system" { @@ -3499,7 +3499,7 @@ pub async fn gateway_inference_update( progress.finish_and_clear(); } - let response = response.into_diagnostic()?; + let response = response.map_err(format_inference_status)?; let configured = response.into_inner(); let label = if configured.route_name == "sandbox-system" { @@ -3590,6 +3590,16 @@ async fn print_inference_route( } } +fn format_inference_status(status: Status) -> miette::Report { + let message = status.message().trim(); + + if message.is_empty() { + return miette::miette!("inference configuration failed ({})", status.code()); + } + + miette::miette!("{message}") +} + pub fn git_repo_root(local_path: &Path) -> Result { let git_dir = if local_path.is_dir() { local_path diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 55f43944..c1354e75 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -279,7 +279,7 @@ fn validation_failure( next_steps: &str, ) -> Status { Status::failed_precondition(format!( - "failed to verify inference endpoint for provider '{provider_name}' and model '{model_id}' at '{base_url}': {details}. Next steps: {next_steps}, or retry with '--no-verify' if the endpoint is not up yet" + "failed to verify inference endpoint for provider '{provider_name}' and model '{model_id}' at '{base_url}': {details}. 
Next steps: {next_steps}, or retry with '--no-verify' if you want to skip verification" )) } @@ -312,7 +312,7 @@ async fn verify_provider_endpoint( route: &ResolvedProviderRoute, ) -> Result { let client = reqwest::Client::builder() - .timeout(Duration::from_secs(15)) + .timeout(Duration::from_secs(30)) .build() .map_err(|err| Status::internal(format!("build validation client failed: {err}")))?; let mut route = route.route.clone(); From 936ae4976e7be6dee449239bcb9557ecdd097800 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Sun, 15 Mar 2026 18:15:08 -0700 Subject: [PATCH 11/13] fix(gateway): add host openshell alias --- crates/openshell-bootstrap/src/docker.rs | 10 +++++++--- deploy/helm/openshell/templates/statefulset.yaml | 7 +++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index 1cb62b7b..3dc832aa 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -526,9 +526,13 @@ pub async fn ensure_container( port_bindings: Some(port_bindings), binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]), network_mode: Some(network_name(name)), - // Add host.docker.internal mapping for DNS resolution - // This allows the entrypoint script to configure CoreDNS to use the host gateway - extra_hosts: Some(vec!["host.docker.internal:host-gateway".to_string()]), + // Add host gateway aliases for DNS resolution. + // This allows both the entrypoint script and the running gateway + // process to reach services on the Docker host. 
+ extra_hosts: Some(vec![ + "host.docker.internal:host-gateway".to_string(), + "host.openshell.internal:host-gateway".to_string(), + ]), ..Default::default() }; diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 175b2606..83ece499 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -31,6 +31,13 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} serviceAccountName: {{ include "openshell.serviceAccountName" . }} + {{- if .Values.server.hostGatewayIP }} + hostAliases: + - ip: {{ .Values.server.hostGatewayIP | quote }} + hostnames: + - host.docker.internal + - host.openshell.internal + {{- end }} securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} containers: From f4b971f79b217a87ea80337b32d61af5b3d4bd60 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Sun, 15 Mar 2026 18:57:31 -0700 Subject: [PATCH 12/13] fix(router): allow mock inference verification --- crates/openshell-router/src/backend.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/crates/openshell-router/src/backend.rs b/crates/openshell-router/src/backend.rs index bfaae625..f9d53800 100644 --- a/crates/openshell-router/src/backend.rs +++ b/crates/openshell-router/src/backend.rs @@ -3,6 +3,7 @@ use crate::RouterError; use crate::config::{AuthHeader, ResolvedRoute}; +use crate::mock; #[derive(Debug, Clone, PartialEq, Eq)] pub struct ValidatedEndpoint { @@ -223,6 +224,14 @@ pub async fn verify_backend_endpoint( route: &ResolvedRoute, ) -> Result { let probe = validation_probe(route)?; + + if mock::is_mock_route(route) { + return Ok(ValidatedEndpoint { + url: build_backend_url(&route.endpoint, probe.path), + protocol: probe.protocol.to_string(), + }); + } + let response = send_backend_request(client, route, "POST", probe.path, Vec::new(), probe.body) .await .map_err(|err| match err { @@ -437,4 +446,19 @@ mod tests { 
assert_eq!(validated.protocol, "anthropic_messages"); assert_eq!(validated.url, format!("{}/v1/messages", mock_server.uri())); } + + #[tokio::test] + async fn verify_backend_endpoint_accepts_mock_routes() { + let route = test_route( + "mock://test-backend", + &["openai_chat_completions"], + AuthHeader::Bearer, + ); + + let client = reqwest::Client::builder().build().unwrap(); + let validated = verify_backend_endpoint(&client, &route).await.unwrap(); + + assert_eq!(validated.protocol, "openai_chat_completions"); + assert_eq!(validated.url, "mock://test-backend/v1/chat/completions"); + } } From bde135c518d4ac13b76eb450396c50af9fdd5d38 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Sun, 15 Mar 2026 19:03:50 -0700 Subject: [PATCH 13/13] docs(inference): remove stale verify references --- architecture/inference-routing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/architecture/inference-routing.md b/architecture/inference-routing.md index 81cc3ff4..0d3a95af 100644 --- a/architecture/inference-routing.md +++ b/architecture/inference-routing.md @@ -92,7 +92,7 @@ File: `proto/inference.proto` Key messages: -- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + optional `verify` / `no_verify` overrides, with verification enabled by default +- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + optional `no_verify` override, with verification enabled by default - `SetClusterInferenceResponse` -- `provider_name` + `model_id` + `version` - `GetInferenceBundleResponse` -- `repeated ResolvedRoute routes` + `revision` + `generated_at_ms` - `ResolvedRoute` -- `name`, `base_url`, `protocols`, `api_key`, `model_id`, `provider_type`