From 0af91c5845a9426496990869a94e25f9121c1eaf Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Thu, 19 Mar 2026 18:47:24 -0700 Subject: [PATCH] fix(gateway): allow first live network policy update --- architecture/sandbox.md | 2 +- architecture/security-policy.md | 14 +-- crates/openshell-server/src/grpc.rs | 128 +++++++++++++++++----------- e2e/python/test_sandbox_policy.py | 81 +++++++++++++++++- 4 files changed, 160 insertions(+), 65 deletions(-) diff --git a/architecture/sandbox.md b/architecture/sandbox.md index a8e4d247..e5c831a8 100644 --- a/architecture/sandbox.md +++ b/architecture/sandbox.md @@ -311,7 +311,7 @@ In gRPC mode, the sandbox can receive policy updates at runtime without restarti | `landlock` | No | Landlock LSM in pre_exec | Configuration for the above; same restriction | | `process` | No | `setuid`/`setgid` in pre-exec | Privileges dropped irrevocably before exec | -The gateway's `UpdateSandboxPolicy` RPC enforces this boundary: it rejects any update where the static fields (`filesystem`, `landlock`, `process`) differ from the version 1 (creation-time) policy. It also rejects updates that would change the network mode (e.g., adding `network_policies` to a sandbox that started in `Block` mode), because the network namespace and proxy infrastructure are set up once at startup. +The gateway's `UpdateSandboxPolicy` RPC enforces this boundary: it rejects any update where the static fields (`filesystem`, `landlock`, `process`) differ from the version 1 (creation-time) policy. `network_policies` remain live-editable, including transitions between an empty rule set and a non-empty one, because proto-backed sandboxes already start with the proxy and network namespace infrastructure in place. ### Poll loop diff --git a/architecture/security-policy.md b/architecture/security-policy.md index cd4d697f..4be33589 100644 --- a/architecture/security-policy.md +++ b/architecture/security-policy.md @@ -90,12 +90,9 @@ Attempting to change a static field in an update request returns an `INVALID_ARG ### Network Mode Immutability -The network mode (Block vs. Proxy) cannot change after sandbox creation. This is because switching modes requires infrastructure changes that only happen at startup: +Proto-backed sandboxes always run with proxy networking. The proxy, network namespace, and OPA evaluation path are created at sandbox startup and stay in place for the lifetime of the sandbox. -- **Block to Proxy**: Requires creating a network namespace, veth pair, and starting the CONNECT proxy -- none of which exist if the sandbox started in Block mode. -- **Proxy to Block**: Requires removing the proxy, veth pair, and network namespace, and applying a stricter seccomp filter that blocks `AF_INET`/`AF_INET6` -- not possible on a running process. - -An update that adds `network_policies` to a sandbox created without them (or removes all `network_policies` from a sandbox created with them) is rejected. See `crates/openshell-server/src/grpc.rs` -- `validate_network_mode_unchanged()`. +That means `network_policies` can change freely at runtime, including transitions between an empty map (proxy-backed deny-all) and a non-empty map (proxy-backed allowlist). The immutable boundary is the proxy infrastructure itself, not whether the current policy has any rules. ### Update Flow @@ -110,7 +107,6 @@ sequenceDiagram CLI->>GW: UpdateSandboxPolicy(name, new_policy) GW->>GW: Validate static fields unchanged - GW->>GW: Validate network mode unchanged GW->>DB: put_policy_revision(version=N, status=pending) GW->>DB: supersede_pending_policies(before_version=N) GW-->>CLI: UpdateSandboxPolicyResponse(version=N, hash) @@ -377,7 +373,7 @@ process: ### `network_policies` -A map of named network policy rules. Each rule defines which binary/endpoint pairs are allowed to make outbound network connections. This is the core of the network access control system. **Dynamic field** -- can be updated on a running sandbox via live policy updates (see [Live Policy Updates](#live-policy-updates)). However, the overall network mode (Block vs. Proxy) is immutable. +A map of named network policy rules. Each rule defines which binary/endpoint pairs are allowed to make outbound network connections. This is the core of the network access control system. **Dynamic field** -- can be updated on a running sandbox via live policy updates (see [Live Policy Updates](#live-policy-updates)). **Behavioral trigger**: The sandbox always starts in **proxy mode** regardless of whether `network_policies` is present. The proxy is required so that all egress can be evaluated by OPA and the virtual hostname `inference.local` is always addressable for inference routing. When `network_policies` is empty, the OPA engine denies all connections. @@ -621,7 +617,7 @@ In proxy mode: When `network_policies` is empty, the OPA engine denies all outbound connections (except `inference.local` which is handled separately by the proxy before OPA evaluation). -**Gateway-side validation**: The `validate_network_mode_unchanged()` function on the server still rejects live policy updates that would add `network_policies` to a sandbox created without them or remove all `network_policies` from a sandbox created with them. This prevents unexpected behavioral changes in the OPA allow/deny logic. See `crates/openshell-server/src/grpc.rs` -- `validate_network_mode_unchanged()`. +The gateway validates that static fields stay unchanged across live policy updates, then persists a new policy revision for the supervisor to load. Empty and non-empty `network_policies` revisions follow the same live-update path. **Proxy sub-modes**: In proxy mode, the proxy handles two distinct request types: @@ -937,8 +933,6 @@ These errors are returned by the gateway's `UpdateSandboxPolicy` handler and rej | `filesystem_policy` differs from version 1 | `filesystem policy cannot be changed on a live sandbox (applied at startup)` | | `landlock` differs from version 1 | `landlock policy cannot be changed on a live sandbox (applied at startup)` | | `process` differs from version 1 | `process policy cannot be changed on a live sandbox (applied at startup)` | -| Adding `network_policies` when version 1 had none | `cannot add network policies to a sandbox created without them (Block -> Proxy mode change requires restart)` | -| Removing all `network_policies` when version 1 had some | `cannot remove all network policies from a sandbox created with them (Proxy -> Block mode change requires restart)` | ### Warnings (Log Only) diff --git a/crates/openshell-server/src/grpc.rs b/crates/openshell-server/src/grpc.rs index a2c6a58f..740cdb80 100644 --- a/crates/openshell-server/src/grpc.rs +++ b/crates/openshell-server/src/grpc.rs @@ -1017,8 +1017,10 @@ impl OpenShell for OpenShellService { // Validate static fields haven't changed. validate_static_fields_unchanged(baseline_policy, &new_policy)?; - // Validate network mode hasn't changed (Block ↔ Proxy). - validate_network_mode_unchanged(baseline_policy, &new_policy)?; + // Allow network policy additions/removals on live sandboxes. The + // cluster runtime always uses proxy mode for proto-backed sandbox + // policies, so an empty `network_policies` map is no longer a real + // mode boundary. // Validate policy safety (no root, no path traversal, etc.). validate_policy_safety(&new_policy)?; @@ -1590,7 +1592,8 @@ impl OpenShell for OpenShellService { ); // Merge the approved rule into the current policy (with optimistic retry). - let (version, hash) = merge_chunk_into_policy(&self.state, &sandbox_id, &chunk).await?; + let (version, hash) = + merge_chunk_into_policy(self.state.store.as_ref(), &sandbox_id, &chunk).await?; // Mark chunk as approved. let now_ms = @@ -1757,7 +1760,8 @@ impl OpenShell for OpenShellService { ); // Merge each chunk into the policy (with optimistic retry). - let (version, hash) = merge_chunk_into_policy(&self.state, &sandbox_id, chunk).await?; + let (version, hash) = + merge_chunk_into_policy(self.state.store.as_ref(), &sandbox_id, chunk).await?; last_version = version; last_hash = hash; @@ -2071,7 +2075,7 @@ fn draft_chunk_record_to_proto(record: &DraftChunkRecord) -> Result Result<(i64, String), Status> { @@ -2083,8 +2087,7 @@ async fn merge_chunk_into_policy( for attempt in 1..=MERGE_RETRY_LIMIT { // Get the current active policy (re-read on each attempt). - let latest = state - .store + let latest = store .get_latest_policy(sandbox_id) .await .map_err(|e| Status::internal(format!("fetch latest policy failed: {e}")))?; @@ -2129,14 +2132,12 @@ async fn merge_chunk_into_policy( let next_version = base_version + 1; let policy_id = uuid::Uuid::new_v4().to_string(); - match state - .store + match store .put_policy_revision(&policy_id, sandbox_id, next_version, &payload, &hash) .await { Ok(()) => { - let _ = state - .store + let _ = store .supersede_older_policies(sandbox_id, next_version) .await; @@ -2609,25 +2610,6 @@ fn validate_filesystem_additive( Ok(()) } -/// Validate that network mode hasn't changed (Block ↔ Proxy). -/// Adding `network_policies` when none existed (or removing all) changes the mode. -fn validate_network_mode_unchanged( - baseline: &ProtoSandboxPolicy, - new: &ProtoSandboxPolicy, -) -> Result<(), Status> { - let baseline_has_policies = !baseline.network_policies.is_empty(); - let new_has_policies = !new.network_policies.is_empty(); - if baseline_has_policies != new_has_policies { - let msg = if new_has_policies { - "cannot add network policies to a sandbox created without them (Block → Proxy mode change requires restart)" - } else { - "cannot remove all network policies from a sandbox created with them (Proxy → Block mode change requires restart)" - }; - return Err(Status::invalid_argument(msg)); - } - Ok(()) -} - /// Convert a `PolicyRecord` to a `SandboxPolicyRevision` proto message. fn policy_record_to_revision(record: &PolicyRecord, include_policy: bool) -> SandboxPolicyRevision { let status = match record.status.as_str() { @@ -3322,11 +3304,12 @@ mod tests { MAX_PROVIDER_CREDENTIALS_ENTRIES, MAX_PROVIDER_TYPE_LEN, MAX_PROVIDERS, MAX_TEMPLATE_MAP_ENTRIES, MAX_TEMPLATE_STRING_LEN, MAX_TEMPLATE_STRUCT_SIZE, clamp_limit, create_provider_record, delete_provider_record, get_provider_record, is_valid_env_key, - list_provider_records, resolve_provider_environment, update_provider_record, - validate_provider_fields, validate_sandbox_spec, + list_provider_records, merge_chunk_into_policy, resolve_provider_environment, + update_provider_record, validate_provider_fields, validate_sandbox_spec, }; - use crate::persistence::Store; + use crate::persistence::{DraftChunkRecord, Store}; use openshell_core::proto::{Provider, SandboxSpec, SandboxTemplate}; + use prost::Message; use std::collections::HashMap; use tonic::Code; @@ -4012,7 +3995,7 @@ mod tests { #[test] fn validate_static_fields_allows_unchanged() { - use super::{validate_network_mode_unchanged, validate_static_fields_unchanged}; + use super::validate_static_fields_unchanged; use openshell_core::proto::{ FilesystemPolicy, LandlockPolicy, ProcessPolicy, SandboxPolicy as ProtoSandboxPolicy, }; @@ -4034,7 +4017,6 @@ mod tests { ..Default::default() }; assert!(validate_static_fields_unchanged(&policy, &policy).is_ok()); - assert!(validate_network_mode_unchanged(&policy, &policy).is_ok()); } #[test] @@ -4152,23 +4134,6 @@ mod tests { assert!(result.unwrap_err().message().contains("include_workdir")); } - #[test] - fn validate_network_mode_rejects_block_to_proxy() { - use super::validate_network_mode_unchanged; - use openshell_core::proto::{NetworkPolicyRule, SandboxPolicy as ProtoSandboxPolicy}; - - let baseline = ProtoSandboxPolicy::default(); // no network policies = Block - let mut changed = ProtoSandboxPolicy::default(); - changed.network_policies.insert( - "test".into(), - NetworkPolicyRule { - name: "test".into(), - ..Default::default() - }, - ); - assert!(validate_network_mode_unchanged(&baseline, &changed).is_err()); - } - // ---- Sandbox creation without policy ---- #[tokio::test] @@ -4262,6 +4227,65 @@ mod tests { assert_eq!(policy.process.unwrap().run_as_user, "sandbox"); } + #[tokio::test] + async fn merge_chunk_into_policy_adds_first_network_rule_to_empty_policy() { + use openshell_core::proto::{NetworkBinary, NetworkEndpoint, NetworkPolicyRule}; + + let store = Store::connect("sqlite::memory:").await.unwrap(); + let rule = NetworkPolicyRule { + name: "google".to_string(), + endpoints: vec![NetworkEndpoint { + host: "google.com".to_string(), + port: 443, + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }; + let chunk = DraftChunkRecord { + id: "chunk-1".to_string(), + sandbox_id: "sb-empty".to_string(), + draft_version: 1, + status: "pending".to_string(), + rule_name: "google".to_string(), + proposed_rule: rule.encode_to_vec(), + rationale: String::new(), + security_notes: String::new(), + confidence: 1.0, + created_at_ms: 0, + decided_at_ms: None, + host: "google.com".to_string(), + port: 443, + binary: "/usr/bin/curl".to_string(), + hit_count: 1, + first_seen_ms: 0, + last_seen_ms: 0, + }; + + let (version, _) = merge_chunk_into_policy(&store, &chunk.sandbox_id, &chunk) + .await + .unwrap(); + + assert_eq!(version, 1); + + let latest = store + .get_latest_policy(&chunk.sandbox_id) + .await + .unwrap() + .expect("policy revision should be persisted"); + let policy = openshell_core::proto::SandboxPolicy::decode(latest.policy_payload.as_slice()) + .expect("policy payload should decode"); + let stored_rule = policy + .network_policies + .get("google") + .expect("merged rule should be present"); + assert_eq!(stored_rule.endpoints[0].host, "google.com"); + assert_eq!(stored_rule.endpoints[0].port, 443); + assert_eq!(stored_rule.binaries[0].path, "/usr/bin/curl"); + } + // ── petname default name generation ─────────────────────────────── /// Verify that `petname::petname(2, "-")` produces names matching the diff --git a/e2e/python/test_sandbox_policy.py b/e2e/python/test_sandbox_policy.py index fdb8a1d0..ab135d63 100644 --- a/e2e/python/test_sandbox_policy.py +++ b/e2e/python/test_sandbox_policy.py @@ -6,12 +6,14 @@ import json from typing import TYPE_CHECKING +import pytest + from openshell._proto import datamodel_pb2, sandbox_pb2 if TYPE_CHECKING: from collections.abc import Callable - from openshell import Sandbox + from openshell import Sandbox, SandboxClient # ============================================================================= @@ -1101,7 +1103,7 @@ def test_l7_tls_log_fields( def test_live_policy_update_and_logs( sandbox: Callable[..., Sandbox], - sandbox_client: "SandboxClient", + sandbox_client: SandboxClient, ) -> None: """End-to-end: live policy update lifecycle with log verification.""" from openshell._proto import openshell_pb2, sandbox_pb2 @@ -1269,6 +1271,81 @@ def test_live_policy_update_and_logs( assert has_fields, "CONNECT logs should have structured fields" +def test_live_policy_update_from_empty_network_policies( + sandbox: Callable[..., Sandbox], + sandbox_client: SandboxClient, +) -> None: + """End-to-end: add the first network rule to a running sandbox.""" + from openshell._proto import openshell_pb2, sandbox_pb2 + + initial_policy = _base_policy() + updated_policy = _base_policy( + network_policies={ + "example": sandbox_pb2.NetworkPolicyRule( + name="example", + endpoints=[ + sandbox_pb2.NetworkEndpoint(host="example.com", port=443), + ], + binaries=[sandbox_pb2.NetworkBinary(path="/**")], + ), + }, + ) + + spec = datamodel_pb2.SandboxSpec(policy=initial_policy) + stub = sandbox_client._stub + + with sandbox(spec=spec, delete_on_exit=True) as sb: + sandbox_name = sb.sandbox.name + + denied = sb.exec_python(_proxy_connect(), args=("example.com", 443)) + assert denied.exit_code == 0, denied.stderr + assert "403" in denied.stdout, denied.stdout + + initial_status = stub.GetSandboxPolicyStatus( + openshell_pb2.GetSandboxPolicyStatusRequest(name=sandbox_name, version=0) + ) + initial_version = initial_status.revision.version + + update_resp = stub.UpdateSandboxPolicy( + openshell_pb2.UpdateSandboxPolicyRequest( + name=sandbox_name, + policy=updated_policy, + ) + ) + new_version = update_resp.version + assert new_version > initial_version, ( + f"Adding the first network rule should create a new version > {initial_version}, " + f"got {new_version}" + ) + + import time + + deadline = time.time() + 90 + loaded = False + while time.time() < deadline: + status_resp = stub.GetSandboxPolicyStatus( + openshell_pb2.GetSandboxPolicyStatusRequest( + name=sandbox_name, version=new_version + ) + ) + status = status_resp.revision.status + if status == openshell_pb2.POLICY_STATUS_LOADED: + loaded = True + break + if status == openshell_pb2.POLICY_STATUS_FAILED: + pytest.fail( + f"Policy v{new_version} failed to load: " + f"{status_resp.revision.load_error}" + ) + time.sleep(2) + + assert loaded, f"Policy v{new_version} was not loaded within 90s" + + allowed = sb.exec_python(_proxy_connect(), args=("example.com", 443)) + assert allowed.exit_code == 0, allowed.stderr + assert "200" in allowed.stdout, allowed.stdout + + # ============================================================================= # Forward proxy tests (plain HTTP, non-CONNECT) # =============================================================================