diff --git a/.github/workflows/record_pr.yml b/.github/workflows/record_pr.yml index ab36aeaf..0badd2b8 100644 --- a/.github/workflows/record_pr.yml +++ b/.github/workflows/record_pr.yml @@ -4,10 +4,13 @@ on: pull_request: types: [opened, reopened] -# This workflow only invokes a reusable workflow, which declares its own -# `permissions: contents: read`. The caller itself needs no permissions. +# Workflow-root default: deny everything. This caller has no steps of its +# own; the job below forwards the `contents: read` that the reusable +# workflow declares in its own `workflow_call` permissions block. permissions: {} jobs: call-record-workflow: uses: OpenJobDescription/.github/.github/workflows/reusable_record_pr_details.yml@mainline + permissions: + contents: read diff --git a/crates/openjd-sessions/src/win32.rs b/crates/openjd-sessions/src/win32.rs index 654dd18e..356a96ba 100644 --- a/crates/openjd-sessions/src/win32.rs +++ b/crates/openjd-sessions/src/win32.rs @@ -96,29 +96,70 @@ pub fn logon_user(username: &str, password: &str) -> Result`); Windows unloads +/// that hive asynchronously when the profile's refcount drops to zero. When +/// a new logon for the same user races with a not-yet-complete unload of the +/// previous session, `CreateEnvironmentBlock` can fail with: +/// +/// - `0x800703FA` `ERROR_KEY_DELETED` — "Illegal operation attempted on a +/// registry key that has been marked for deletion." This is the canonical +/// symptom observed in our cross-user test suite when tests run back-to-back +/// against the same Windows test user. +/// - `0x80070005` `E_ACCESSDENIED` — documented by Microsoft as a related +/// symptom of the same race. +/// +/// Both are transient: the hive finishes unloading (and reloading under the +/// new logon) within milliseconds. The upstream Python implementation has a +/// first-hand comment describing this same error in +/// `_win32/_locate_executable.py`. +/// +/// Retries are bounded: 5 attempts with geometric backoff (~20/40/80/160 ms, +/// worst-case ~300 ms total) so a real failure still surfaces promptly. pub fn environment_for_user( token: HANDLE, ) -> Result, windows::core::Error> { - let mut block_ptr: *mut std::ffi::c_void = std::ptr::null_mut(); - unsafe { - CreateEnvironmentBlock(&mut block_ptr, Some(token), false)?; - } - - // `CreateEnvironmentBlock` writes a pointer to the environment block into - // `block_ptr` on success. `?` above propagates any error, so a `null` - // here would indicate a Windows API misbehavior; guard defensively rather - // than dereferencing. - if block_ptr.is_null() { - return Ok(HashMap::new()); - } - - let env = parse_environment_block(block_ptr); - - unsafe { - let _ = DestroyEnvironmentBlock(block_ptr); + /// `ERROR_KEY_DELETED` as an HRESULT (`HRESULT_FROM_WIN32(1018)`). + const ERROR_KEY_DELETED_HRESULT: u32 = 0x800703FA; + /// `E_ACCESSDENIED` — documented transient symptom of the profile-unload race. + const E_ACCESSDENIED_HRESULT: u32 = 0x80070005; + const MAX_ATTEMPTS: u32 = 5; + + let mut attempt: u32 = 0; + loop { + let mut block_ptr: *mut std::ffi::c_void = std::ptr::null_mut(); + let result = unsafe { CreateEnvironmentBlock(&mut block_ptr, Some(token), false) }; + + match result { + Ok(()) => { + // `CreateEnvironmentBlock` writes a pointer to the environment + // block into `block_ptr` on success. An `Ok(())` with a null + // pointer would indicate a Windows API misbehavior; guard + // defensively rather than dereferencing or passing to + // `DestroyEnvironmentBlock`. + if block_ptr.is_null() { + return Ok(HashMap::new()); + } + let env = parse_environment_block(block_ptr); + unsafe { + let _ = DestroyEnvironmentBlock(block_ptr); + } + return Ok(env); + } + Err(e) => { + let code = e.code().0 as u32; + let transient = code == ERROR_KEY_DELETED_HRESULT || code == E_ACCESSDENIED_HRESULT; + attempt += 1; + if !transient || attempt >= MAX_ATTEMPTS { + return Err(e); + } + // Geometric backoff: 20, 40, 80, 160 ms. + let backoff_ms = 20u64 << (attempt - 1); + std::thread::sleep(std::time::Duration::from_millis(backoff_ms)); + } + } } - - Ok(env) } /// Parse a Win32 environment block (null-delimited, double-null terminated). diff --git a/specs/sessions/subprocess.md b/specs/sessions/subprocess.md index 3610a467..aef42acd 100644 --- a/specs/sessions/subprocess.md +++ b/specs/sessions/subprocess.md @@ -320,3 +320,23 @@ token is available) or `CreateProcessWithLogonW` (username + password). The - Environment block construction for the target user - Stdin/stdout pipe creation with inheritable handles - Process creation with `CREATE_NEW_PROCESS_GROUP` flag + +### CreateEnvironmentBlock profile-unload race + +`environment_for_user()` retries `CreateEnvironmentBlock` on transient +profile-unload-race errors. The API reads from the user's registry hive at +`HKEY_USERS\`; Windows unloads that hive asynchronously when the +profile's refcount drops to zero. Back-to-back logons for the same user can +race with an in-progress unload and surface as: + +- `0x800703FA` `ERROR_KEY_DELETED` — "Illegal operation attempted on a + registry key that has been marked for deletion" (the symptom observed in + the cross-user integration tests). +- `0x80070005` `E_ACCESSDENIED` — documented by Microsoft as a related + symptom of the same race. + +Retries are bounded at 5 attempts with geometric backoff (20/40/80/160 ms, +worst-case ~300 ms total). Non-transient errors return immediately so real +failures aren't hidden. The upstream Python implementation has a first-hand +comment describing this same class of error in `_win32/_locate_executable.py` +but uses a different mitigation (forcing handle cleanup between calls).