Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .github/workflows/record_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@ on:
pull_request:
types: [opened, reopened]

# This workflow only invokes a reusable workflow, which declares its own
# `permissions: contents: read`. The caller itself needs no permissions.
# Workflow-root default: deny everything. This caller has no steps of its
# own; the job below forwards the `contents: read` that the reusable
# workflow declares in its own `workflow_call` permissions block.
permissions: {}

jobs:
call-record-workflow:
uses: OpenJobDescription/.github/.github/workflows/reusable_record_pr_details.yml@mainline
permissions:
contents: read
79 changes: 60 additions & 19 deletions crates/openjd-sessions/src/win32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,29 +96,70 @@ pub fn logon_user(username: &str, password: &str) -> Result<LogonToken, windows:
}

/// Create an environment block for a logon token and return it as a HashMap.
///
/// Retries transient profile-unload-race errors. `CreateEnvironmentBlock`
/// reads from the user's registry hive (`HKEY_USERS\<SID>`); Windows unloads
/// that hive asynchronously when the profile's refcount drops to zero. When
/// a new logon for the same user races with a not-yet-complete unload of the
/// previous session, `CreateEnvironmentBlock` can fail with:
///
/// - `0x800703FA` `ERROR_KEY_DELETED` — "Illegal operation attempted on a
/// registry key that has been marked for deletion." This is the canonical
/// symptom observed in our cross-user test suite when tests run back-to-back
/// against the same Windows test user.
/// - `0x80070005` `E_ACCESSDENIED` — documented by Microsoft as a related
/// symptom of the same race.
///
/// Both are transient: the hive finishes unloading (and reloading under the
/// new logon) within milliseconds. The upstream Python implementation has a
/// first-hand comment describing this same error in
/// `_win32/_locate_executable.py`.
///
/// Retries are bounded: 5 attempts with geometric backoff (~20/40/80/160 ms,
/// worst-case ~300 ms total) so a real failure still surfaces promptly.
pub fn environment_for_user(
token: HANDLE,
) -> Result<HashMap<String, String>, windows::core::Error> {
let mut block_ptr: *mut std::ffi::c_void = std::ptr::null_mut();
unsafe {
CreateEnvironmentBlock(&mut block_ptr, Some(token), false)?;
}

// `CreateEnvironmentBlock` writes a pointer to the environment block into
// `block_ptr` on success. `?` above propagates any error, so a `null`
// here would indicate a Windows API misbehavior; guard defensively rather
// than dereferencing.
if block_ptr.is_null() {
return Ok(HashMap::new());
}

let env = parse_environment_block(block_ptr);

unsafe {
let _ = DestroyEnvironmentBlock(block_ptr);
/// `ERROR_KEY_DELETED` as an HRESULT (`HRESULT_FROM_WIN32(1018)`).
const ERROR_KEY_DELETED_HRESULT: u32 = 0x800703FA;
/// `E_ACCESSDENIED` — documented transient symptom of the profile-unload race.
const E_ACCESSDENIED_HRESULT: u32 = 0x80070005;
const MAX_ATTEMPTS: u32 = 5;

let mut attempt: u32 = 0;
loop {
let mut block_ptr: *mut std::ffi::c_void = std::ptr::null_mut();
let result = unsafe { CreateEnvironmentBlock(&mut block_ptr, Some(token), false) };

match result {
Ok(()) => {
// `CreateEnvironmentBlock` writes a pointer to the environment
// block into `block_ptr` on success. An `Ok(())` with a null
// pointer would indicate a Windows API misbehavior; guard
// defensively rather than dereferencing or passing to
// `DestroyEnvironmentBlock`.
if block_ptr.is_null() {
return Ok(HashMap::new());
}
let env = parse_environment_block(block_ptr);
unsafe {
let _ = DestroyEnvironmentBlock(block_ptr);
}
return Ok(env);
}
Err(e) => {
let code = e.code().0 as u32;
let transient = code == ERROR_KEY_DELETED_HRESULT || code == E_ACCESSDENIED_HRESULT;
attempt += 1;
if !transient || attempt >= MAX_ATTEMPTS {
return Err(e);
}
// Geometric backoff: 20, 40, 80, 160 ms.
let backoff_ms = 20u64 << (attempt - 1);
std::thread::sleep(std::time::Duration::from_millis(backoff_ms));
}
}
}

Ok(env)
}

/// Parse a Win32 environment block (null-delimited, double-null terminated).
Expand Down
20 changes: 20 additions & 0 deletions specs/sessions/subprocess.md
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,23 @@ token is available) or `CreateProcessWithLogonW` (username + password). The
- Environment block construction for the target user
- Stdin/stdout pipe creation with inheritable handles
- Process creation with `CREATE_NEW_PROCESS_GROUP` flag

### CreateEnvironmentBlock profile-unload race

`environment_for_user()` retries `CreateEnvironmentBlock` on transient
profile-unload-race errors. The API reads from the user's registry hive at
`HKEY_USERS\<SID>`; Windows unloads that hive asynchronously when the
profile's refcount drops to zero. Back-to-back logons for the same user can
race with an in-progress unload and surface as:

- `0x800703FA` `ERROR_KEY_DELETED` — "Illegal operation attempted on a
registry key that has been marked for deletion" (the symptom observed in
the cross-user integration tests).
- `0x80070005` `E_ACCESSDENIED` — documented by Microsoft as a related
symptom of the same race.

Retries are bounded at 5 attempts with geometric backoff (20/40/80/160 ms,
worst-case ~300 ms total). Non-transient errors return immediately so real
failures aren't hidden. The upstream Python implementation has a first-hand
comment describing this same class of error in `_win32/_locate_executable.py`
but uses a different mitigation (forcing handle cleanup between calls).
Loading