Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ members = [
"crates/aionui-api-types",
"crates/aionui-realtime",
"crates/aionui-runtime",
"crates/aionui-process",
"crates/aionui-auth",
"crates/aionui-system",
"crates/aionui-file",
Expand Down Expand Up @@ -36,6 +37,7 @@ aionui-db = { path = "crates/aionui-db" }
aionui-api-types = { path = "crates/aionui-api-types" }
aionui-realtime = { path = "crates/aionui-realtime" }
aionui-runtime = { path = "crates/aionui-runtime" }
aionui-process = { path = "crates/aionui-process" }
aionui-auth = { path = "crates/aionui-auth" }
aionui-system = { path = "crates/aionui-system" }
aionui-file = { path = "crates/aionui-file" }
Expand Down
44 changes: 44 additions & 0 deletions crates/aionui-process/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
[package]
name = "aionui-process"
version.workspace = true
edition.workspace = true

# Self-contained subprocess mechanism (feature 001). Foundation layer:
# depends ONLY on aionui-common + aionui-runtime. Manages ONLY the processes
# it itself spawns — never touches the existing CliAgentProcess / registry.

[dependencies]
aionui-common.workspace = true
aionui-runtime.workspace = true
async-trait.workspace = true
serde = { workspace = true }
serde_json.workspace = true
thiserror.workspace = true
tracing.workspace = true
uuid = { workspace = true, features = ["serde", "v4"] }
fs2.workspace = true
tokio = { workspace = true, features = ["process", "io-util", "sync", "rt", "macros", "time"] }
tokio-util = { version = "0.7", features = ["compat"] }

[dev-dependencies]
tempfile.workspace = true

[target.'cfg(unix)'.dependencies]
libc.workspace = true

# Windows (feature 005 batch B): raw Win32 FFI (windows-sys, zero-overhead, NOT
# the heavy `windows` crate — Decision 2). Covers:
# - liveness/identity probe + cold-reap: OpenProcess / GetProcessTimes /
# WaitForSingleObject / TerminateProcess (Win32_System_Threading);
# - Job Object subtree containment (hot path): CreateJobObjectW /
# SetInformationJobObject(KILL_ON_JOB_CLOSE) / AssignProcessToJobObject /
# TerminateJobObject (Win32_System_JobObjects); the CREATE_SUSPENDED → assign
# → resume race-close needs toolhelp thread-walk (Win32_System_Diagnostics_ToolHelp).
# aionui-process owns the child's lifetime, so the Job handle lives here.
[target.'cfg(windows)'.dependencies]
windows-sys = { version = "0.61", features = [
"Win32_Foundation",
"Win32_System_Threading",
"Win32_System_JobObjects",
"Win32_System_Diagnostics_ToolHelp",
] }
136 changes: 136 additions & 0 deletions crates/aionui-process/src/capabilities.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
//! Per-platform capability descriptor (feature 005, WORKFLOW discipline 7).
//!
//! Turns "what this crate can actually do on this OS" from scattered, silent
//! `cfg` branches into a single TYPED, ASSERTABLE value. The matrix in the 005
//! design doc maps 1:1 to these fields; the `capabilities_matrix_per_platform`
//! test pins each platform's row, so a capability regression (e.g. someone
//! re-stubs macOS start-time to `None`) turns a test RED instead of silently
//! degrading reap safety.
//!
//! "Hot" vs "cold" kill is the load-bearing distinction (design I-9): while a
//! live `ManagedProcess` handle is held (normal exit / explicit kill / Drop)
//! the whole subtree is torn down on every platform; only the post-CRASH
//! cold-reap (reconstruct from a persisted pid, no live handle) degrades — and
//! only on Windows, where the Job handle does not survive the owner's death.

use serde::{Deserialize, Serialize};

/// What kind of OS primitive contains a spawned subtree.
///
/// Reported through [`Capabilities::subtree_containment`]. `Capabilities::current`
/// selects `ProcessGroup` on Linux and macOS, `JobObject` on Windows, and `None`
/// on every other target.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ContainmentKind {
/// No subtree containment (grandchildren are not corralled).
None,
/// POSIX process group (`setpgid` + `kill(-pgid)`); a `setsid` grandchild escapes.
ProcessGroup,
/// Windows Job Object (`KILL_ON_JOB_CLOSE` + `TerminateJobObject`); stronger than a group.
JobObject,
}

/// How well crash-recovery reap (from a persisted pid, no live handle) works.
///
/// Reported through [`Capabilities::cold_reap`]. `Capabilities::current` selects
/// `Full` on Linux and macOS, `SingleProcessGated` on Windows, and `None` on
/// every other target.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ReapSupport {
/// No cross-restart reap on this platform.
None,
/// Full subtree reap survives restart (Unix: persisted pgid → `kill(-pgid)`).
Full,
/// Single-process kill after identity gating, plus a best-effort `taskkill /T`
/// sweep (Windows: the Job handle does not persist across the owner's death,
/// so the subtree guarantee degrades — design I-9).
SingleProcessGated,
}

/// The subprocess-mechanism capabilities of the platform this binary was
/// compiled for. Obtained from [`Capabilities::current`], a `const fn` whose
/// value is selected per platform by `cfg` — no runtime probing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct Capabilities {
/// Can actively kill a process we spawned. `false` only in the
/// unknown-platform fallback row.
pub can_kill: bool,
/// What contains a spawned subtree.
pub subtree_containment: ContainmentKind,
/// `probe` can truthfully report liveness.
pub liveness_probe: bool,
/// `read_process_start_time` yields a real value (the reap-safety identity gate).
pub identity_gate: bool,
/// While a live handle is held (normal exit / kill / Drop), the WHOLE subtree
/// is torn down. True on every supported platform — no degradation here.
/// (The unknown-platform fallback row reports `false`.)
pub hot_kill_subtree: bool,
/// Crash-recovery reap quality (no live handle, from a persisted pid).
pub cold_reap: ReapSupport,
/// The kernel auto-kills our children when the parent dies (Linux
/// `PR_SET_PDEATHSIG` / Windows `KILL_ON_JOB_CLOSE`); shrinks crash orphans.
/// macOS has no equivalent.
pub parent_death_signal: bool,
/// Dropping a `ManagedProcess` reaps its subtree.
pub drop_reaps: bool,
}

impl Capabilities {
    /// Returns the capability row for the OS this binary was compiled for.
    ///
    /// The row is chosen entirely at compile time: exactly one `for_target`
    /// helper below survives `cfg` evaluation, so nothing is probed at runtime.
    pub const fn current() -> Self {
        Self::for_target()
    }

    /// Linux row: process-group containment, full cold reap, and a kernel
    /// parent-death signal.
    #[cfg(target_os = "linux")]
    const fn for_target() -> Self {
        Self {
            can_kill: true,
            subtree_containment: ContainmentKind::ProcessGroup,
            liveness_probe: true,
            // Identity comes from /proc/<pid>/stat field 22.
            identity_gate: true,
            hot_kill_subtree: true,
            // Persisted pgid → kill(-pgid).
            cold_reap: ReapSupport::Full,
            // PR_SET_PDEATHSIG (R9).
            parent_death_signal: true,
            drop_reaps: true,
        }
    }

    /// macOS row: same as Linux except there is no parent-death signal.
    #[cfg(target_os = "macos")]
    const fn for_target() -> Self {
        Self {
            can_kill: true,
            subtree_containment: ContainmentKind::ProcessGroup,
            liveness_probe: true,
            // Identity comes from proc_pidinfo PROC_PIDTBSDINFO (R1).
            identity_gate: true,
            hot_kill_subtree: true,
            // Persisted pgid → kill(-pgid).
            cold_reap: ReapSupport::Full,
            // No PDEATHSIG equivalent; the reaper is load-bearing here.
            parent_death_signal: false,
            drop_reaps: true,
        }
    }

    /// Windows row (feature 005, batch B).
    ///
    /// Backed by real Win32 primitives:
    /// - probe + identity gate: `OpenProcess` + `WaitForSingleObject` +
    ///   `GetProcessTimes` creation FILETIME (proc_control `windows_impl`);
    /// - hot-kill subtree: Job Object (`CREATE_SUSPENDED` → assign → resume),
    ///   then `TerminateJobObject` / `KILL_ON_JOB_CLOSE` on Drop;
    /// - parent death: `KILL_ON_JOB_CLOSE` (the job dies with its last handle).
    ///
    /// Cold reap remains `SingleProcessGated` (I-9): the Job handle does not
    /// outlive its owner, so a pid read back from disk is terminated as one
    /// process (`TerminateProcess`) rather than as a whole subtree.
    ///
    /// ⚠️ Only cross-compile verified (cargo-xwin); there is no x86 CI lane, so
    /// until this has run on a real Windows host / UTM VM treat the RUNTIME
    /// behavior as LocalVerifiedOnly in spirit.
    #[cfg(target_os = "windows")]
    const fn for_target() -> Self {
        Self {
            // TerminateJobObject / TerminateProcess.
            can_kill: true,
            // Job Object.
            subtree_containment: ContainmentKind::JobObject,
            // OpenProcess + WaitForSingleObject.
            liveness_probe: true,
            // GetProcessTimes creation FILETIME.
            identity_gate: true,
            // Job terminate while the handle is held.
            hot_kill_subtree: true,
            // The Job does not persist (I-9).
            cold_reap: ReapSupport::SingleProcessGated,
            // KILL_ON_JOB_CLOSE.
            parent_death_signal: true,
            // Drop terminates the Job.
            drop_reaps: true,
        }
    }

    /// Fallback row for any other target: claim nothing (safe defaults —
    /// never kill on doubt).
    #[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
    const fn for_target() -> Self {
        Self {
            can_kill: false,
            subtree_containment: ContainmentKind::None,
            liveness_probe: false,
            identity_gate: false,
            hot_kill_subtree: false,
            cold_reap: ReapSupport::None,
            parent_death_signal: false,
            drop_reaps: false,
        }
    }
}
69 changes: 69 additions & 0 deletions crates/aionui-process/src/containment.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
//! Per-platform lifecycle fence (Containment). Tears down a whole subprocess
//! subtree (agent CLI + grandchildren like MCP servers), not just the direct
//! child. Lifecycle-only — orthogonal to any security sandbox.
//!
//! Single tier ships: [`ProcessGroupContainment`] (best-effort Unix process
//! group). Job Object / cgroup tiers are intentionally not built (no CI lane;
//! they collapse to the process-group kill on testable platforms). The seam
//! lets them land later without touching callers.

use crate::ProcessError;

/// How strong a teardown promise a containment is able to make.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReapGuarantee {
    /// A process-group SIGKILL. Descendants that are still in the group get
    /// reaped; any that left it via `setsid` are missed (documented gap).
    BestEffort,
}

/// What [`Containment::kill_all`] was able to establish. Deliberately not a
/// bare `Ok(())`, which a caller could misread as "tree definitely gone".
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ContainmentKillOutcome {
    /// The liveness probe run after the kill confirmed the group is gone.
    ProbedGone,
    /// The kill was issued but the group was not confirmed gone (e.g. a
    /// member escaped the group).
    DegradedBestEffort,
}

/// A lifecycle fence around a spawned subprocess subtree.
///
/// Implementors must be `Send + Sync`.
pub trait Containment: Send + Sync {
/// Tears down the contained subtree and reports whether the teardown could
/// be confirmed ([`ContainmentKillOutcome`]).
///
/// # Errors
///
/// Returns a [`ProcessError`] when the teardown cannot be carried out; the
/// process-group implementation propagates the error from its kill call.
fn kill_all(&self) -> Result<ContainmentKillOutcome, ProcessError>;
/// Strength of the teardown guarantee this containment offers.
fn guarantee(&self) -> ReapGuarantee;
}

/// Best-effort containment built on the Unix process group captured at spawn.
pub struct ProcessGroupContainment {
    /// Pid handed to the kill call.
    pid: u32,
    /// Process group id, when one is known; handed to both the kill call and
    /// the post-kill liveness probe.
    process_group_id: Option<u32>,
}

impl ProcessGroupContainment {
    /// Wraps a pid and its optional process group id; performs no I/O.
    pub fn new(pid: u32, process_group_id: Option<u32>) -> Self {
        ProcessGroupContainment {
            pid,
            process_group_id,
        }
    }
}

impl Containment for ProcessGroupContainment {
    /// Force-kills the pid / process group, then polls until the group is
    /// confirmed gone or the settle budget runs out.
    ///
    /// Returns [`ContainmentKillOutcome::ProbedGone`] as soon as a probe reports
    /// the group dead, and [`ContainmentKillOutcome::DegradedBestEffort`] if it
    /// still reads alive once the whole budget has elapsed (e.g. an escaped
    /// grandchild) — an honest "not confirmed" rather than a false "gone".
    ///
    /// Blocks the calling thread for up to `ATTEMPTS * STEP` (about 500 ms).
    ///
    /// # Errors
    ///
    /// Propagates the error from `crate::force_kill` if the kill cannot be issued.
    fn kill_all(&self) -> Result<ContainmentKillOutcome, ProcessError> {
        const ATTEMPTS: u32 = 20;
        const STEP: std::time::Duration = std::time::Duration::from_millis(25);

        crate::force_kill(self.pid, self.process_group_id)?;

        // SIGKILL is async; give the kernel a brief bounded settle before the
        // confirmation probe, else a clean kill almost always reads alive and
        // ProbedGone would be unreachable.
        for _ in 0..ATTEMPTS {
            if !crate::process_group_alive(self.process_group_id) {
                return Ok(ContainmentKillOutcome::ProbedGone);
            }
            std::thread::sleep(STEP);
        }

        // The loop ends on a sleep, so probe once more: without this the last
        // settle window is paid for but never observed, and a group that died
        // during it would be misreported as degraded.
        if crate::process_group_alive(self.process_group_id) {
            Ok(ContainmentKillOutcome::DegradedBestEffort)
        } else {
            Ok(ContainmentKillOutcome::ProbedGone)
        }
    }

    /// Always [`ReapGuarantee::BestEffort`]: a process-group kill cannot reach
    /// descendants that left the group.
    fn guarantee(&self) -> ReapGuarantee {
        ReapGuarantee::BestEffort
    }
}
75 changes: 75 additions & 0 deletions crates/aionui-process/src/error.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
//! Mechanism-layer error. This crate is Foundation-layer and must not depend
//! on any domain error type; it owns a small enum covering only what the
//! spawn / lifecycle / reap mechanism produces.

/// Errors produced by the subprocess mechanism layer.
///
/// Marked `#[non_exhaustive]`: code outside this crate that matches on it needs
/// a wildcard arm. The `#[error(...)]` strings below define the `Display`
/// prefixes; the unit tests in this file pin them because callers and log
/// scrapers rely on the exact text.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum ProcessError {
/// Invalid caller input (e.g. a missing / non-directory / whitespace cwd).
#[error("bad request: {0}")]
BadRequest(String),
/// Workspace path contains a whitespace segment the bundled runtime cannot handle.
#[error("workspace path contains whitespace (runtime unsupported): {0}")]
WorkspacePathContainsWhitespaceRuntimeUnsupported(String),
/// An OS / runtime failure (spawn failed, pipe capture failed, kill failed, fs error).
#[error("internal error: {0}")]
Internal(String),
}

impl ProcessError {
pub fn bad_request(message: impl Into<String>) -> Self {
Self::BadRequest(message.into())
}

pub fn workspace_path_contains_whitespace_runtime_unsupported(path: impl Into<String>) -> Self {
Self::WorkspacePathContainsWhitespaceRuntimeUnsupported(path.into())
}

pub fn internal(message: impl Into<String>) -> Self {
Self::Internal(message.into())
}
}

/// Lets `?` lift an I/O failure into the mechanism error: the io error's
/// `Display` text becomes the payload of [`ProcessError::Internal`].
impl From<std::io::Error> for ProcessError {
    fn from(io_error: std::io::Error) -> Self {
        Self::internal(io_error.to_string())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// INPUTVAL-B4: each named constructor yields its matching variant, and
    /// `Display` renders the documented prefix (callers/log scrapers rely on
    /// these exact prefixes).
    #[test]
    fn constructors_build_matching_variant_and_render_prefix() {
        let bad_request = ProcessError::bad_request("nope");
        assert!(matches!(&bad_request, ProcessError::BadRequest(msg) if msg == "nope"));
        assert_eq!(bad_request.to_string(), "bad request: nope");

        let whitespace =
            ProcessError::workspace_path_contains_whitespace_runtime_unsupported("/a b");
        let carries_path = matches!(
            &whitespace,
            ProcessError::WorkspacePathContainsWhitespaceRuntimeUnsupported(path) if path == "/a b"
        );
        assert!(carries_path);
        assert_eq!(
            whitespace.to_string(),
            "workspace path contains whitespace (runtime unsupported): /a b"
        );

        let internal_error = ProcessError::internal("boom");
        assert!(matches!(&internal_error, ProcessError::Internal(msg) if msg == "boom"));
        assert_eq!(internal_error.to_string(), "internal error: boom");
    }

    /// Converting an `io::Error` produces `Internal` holding the io error's text.
    #[test]
    fn io_error_maps_to_internal() {
        let denied = std::io::Error::new(std::io::ErrorKind::PermissionDenied, "denied");
        let converted = ProcessError::from(denied);
        assert!(matches!(&converted, ProcessError::Internal(msg) if msg.contains("denied")));
    }
}
Loading