From b87f33e279a31061fe2f2eedd15f5c5da784cd38 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Wed, 11 Mar 2026 23:26:13 +0000 Subject: [PATCH 1/4] fix concurrency bugs --- litebox_runner_lvbs/src/lib.rs | 114 ++++++++++++++++++++++-------- litebox_shim_optee/src/lib.rs | 4 +- litebox_shim_optee/src/session.rs | 84 +++++++++++++++++++++- 3 files changed, 170 insertions(+), 32 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index ed6302601..427abb34a 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -44,7 +44,7 @@ use litebox_shim_optee::msg_handler::{ decode_ta_request, handle_optee_msg_args, handle_optee_smc_args, update_optee_msg_args, }; use litebox_shim_optee::session::{ - MAX_TA_INSTANCES, SessionIdGuard, SessionManager, TaInstance, allocate_session_id, + CreationReservation, SessionIdGuard, SessionManager, TaInstance, allocate_session_id, }; use litebox_shim_optee::{NormalWorldConstPtr, NormalWorldMutPtr, UserConstPtr}; use once_cell::race::OnceBox; @@ -486,26 +486,44 @@ fn handle_open_session( let client_identity = ta_req_info.client_identity; let params = &ta_req_info.params; + // Fast path: reuse a cached single-instance TA if one exists. if let Some(existing) = session_manager().get_single_instance(&ta_uuid) { - // Try to reuse existing single-instance TA, or create a new instance - // If the TA is busy (lock held), return EThreadLimit - driver will wait and retry - open_session_single_instance( + return open_session_single_instance( msg_args, msg_args_phys_addr, existing, params, ta_uuid, &ta_req_info, - ) - } else { - open_session_new_instance( + ); + } + + // Slow path: atomically reserve a creation slot. This re-checks the + // single-instance cache under a lock to close the TOCTOU window, prevents + // duplicate UUID loading, and enforces the instance capacity limit. + match session_manager().reserve_creation_slot(&ta_uuid)? { + CreationReservation::ExistingSingleInstance(existing) => open_session_single_instance( msg_args, msg_args_phys_addr, + existing, params, ta_uuid, - client_identity, &ta_req_info, - ) + ), + CreationReservation::SlotReserved => { + let result = open_session_new_instance( + msg_args, + msg_args_phys_addr, + params, + ta_uuid, + client_identity, + &ta_req_info, + ); + // Release the creation slot on both success and failure. + // On success the instance is now tracked by session/cache maps. + session_manager().release_creation_slot(&ta_uuid); + result + } } } @@ -624,11 +642,19 @@ fn open_session_single_instance( Some(ta_req_info), )?; + // Re-acquire the TA lock BEFORE removing from cache and + // tearing down, so concurrent open_session_single_instance + // callers that cloned this Arc will fail try_lock(). + let instance = instance_arc.lock(); + session_manager().remove_single_instance(&ta_uuid); // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. - unsafe { teardown_ta_page_table(&instance_arc.lock().shim, task_pt_id) }; + // The lock is held, so no other core can enter the TA. + unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; + + drop(instance); // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just @@ -675,6 +701,10 @@ fn open_session_single_instance( /// Create a new TA instance for a session. /// +/// The caller must have already reserved a creation slot via +/// [`SessionManager::reserve_creation_slot`] and is responsible for +/// calling [`SessionManager::release_creation_slot`] after this returns. +/// /// If ldelf loading or OpenSession entry point fails, the page table is torn down. /// Per OP-TEE OS semantics: if OpenSession returns non-success, cleanup happens. fn open_session_new_instance( @@ -685,12 +715,7 @@ fn open_session_new_instance( client_identity: Option, ta_req_info: &litebox_shim_optee::msg_handler::TaRequestInfo, ) -> Result<(), OpteeSmcReturnCode> { - // Check TA instance limit - // TODO: consider better resource management strategy - if session_manager().instance_count() >= MAX_TA_INSTANCES { - debug_serial_println!("TA instance limit reached ({} instances)", MAX_TA_INSTANCES); - return Err(OpteeSmcReturnCode::ENomem); - } + // Instance capacity is enforced by reserve_creation_slot() in the caller. // Create and switch to new page table let task_pt_id = create_task_page_table()?; @@ -704,13 +729,18 @@ fn open_session_new_instance( })?; } - // Allocate session ID before loading - return EBusy to normal world if exhausted - let runner_session_id = allocate_session_id().ok_or_else(|| { + // Allocate session ID before loading - return EBusy to normal world if exhausted. + // Use SessionIdGuard to ensure the ID is recycled on any error path + // (before it is registered with the session manager). + let session_id_guard = SessionIdGuard::new(allocate_session_id().ok_or_else(|| { + // Safety: We're switching to base page table; no user-space refs held. unsafe { switch_to_base_page_table() }; // Safety: We've switched to the base page table above. let _ = unsafe { delete_task_page_table(task_pt_id) }; OpteeSmcReturnCode::EBusy - })?; + })?); + // Safe to unwrap: guard was just created with Some(id). + let runner_session_id = session_id_guard.id().unwrap(); // Load ldelf and TA - Box immediately to keep at fixed heap address let shim = litebox_shim_optee::OpteeShimBuilder::new().build(); @@ -807,12 +837,20 @@ fn open_session_new_instance( } // Read TA output parameters from the stack buffer - let params_address = loaded_program - .params_address - .ok_or(OpteeSmcReturnCode::EBadAddr)?; + let params_address = loaded_program.params_address.ok_or_else(|| { + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + OpteeSmcReturnCode::EBadAddr + })?; let ta_params = UserConstPtr::::from_usize(params_address) .read_at_offset(0) - .ok_or(OpteeSmcReturnCode::EBadAddr)?; + .ok_or_else(|| { + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + OpteeSmcReturnCode::EBadAddr + })?; // Check the return code from the TA's OpenSession entry point let return_code: u32 = ctx.rax.truncate(); @@ -840,6 +878,7 @@ fn open_session_new_instance( // no references to user-space memory will be held afterwards. unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + // session_id_guard drops here, recycling the session ID return Ok(()); } @@ -855,7 +894,9 @@ fn open_session_new_instance( session_manager().cache_single_instance(ta_uuid, instance.clone()); } - // Register session (runner_session_id already allocated above) + // Disarm the guard: ownership transfers to session manager via register_session. + // Safe to unwrap: guard has not been disarmed yet. + let runner_session_id = session_id_guard.disarm().unwrap(); session_manager().register_session(runner_session_id, instance.clone(), ta_uuid, ta_flags); // Write success response back to normal world @@ -986,6 +1027,13 @@ fn handle_invoke_command( )?; if is_last_session { + // Re-acquire the TA lock BEFORE removing from cache and tearing + // down. While we hold this lock, any concurrent + // open_session_single_instance will fail try_lock() and return + // EThreadLimit, preventing use of a page table that is about to + // be destroyed. + let instance = instance_arc.lock(); + // Clear single-instance cache if applicable if ta_flags.is_single_instance() { session_manager().remove_single_instance(&ta_uuid); @@ -993,7 +1041,11 @@ fn handle_invoke_command( // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. - unsafe { teardown_ta_page_table(&instance_arc.lock().shim, task_pt_id) }; + // The lock is held, so no other core can enter the TA. + unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; + + drop(instance); + debug_serial_println!( "InvokeCommand: cleaned up dead TA instance, task_pt_id={}", task_pt_id @@ -1112,16 +1164,22 @@ fn handle_close_session( return Ok(()); } + // Re-acquire the TA lock BEFORE removing from cache and tearing + // down. While we hold this lock, any concurrent + // open_session_single_instance will fail try_lock() and return + // EThreadLimit, preventing use of a page table that is about to + // be destroyed. + let instance = entry.instance.lock(); + let task_pt_id = instance.task_page_table_id; + // Clear single-instance cache if this was a single-instance TA if entry.ta_flags.is_single_instance() { session_manager().remove_single_instance(&entry.ta_uuid); } - let instance = entry.instance.lock(); - let task_pt_id = instance.task_page_table_id; - // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. + // The lock is held, so no other core can enter the TA. unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; // Drop the instance to release shim/loaded_program resources diff --git a/litebox_shim_optee/src/lib.rs b/litebox_shim_optee/src/lib.rs index 9fcc82d09..9b68fb5e7 100644 --- a/litebox_shim_optee/src/lib.rs +++ b/litebox_shim_optee/src/lib.rs @@ -38,8 +38,8 @@ pub mod ptr; // Re-export session management types for convenience pub use session::{ - MAX_TA_INSTANCES, SessionEntry, SessionManager, SessionMap, SingleInstanceCache, TaInstance, - allocate_session_id, + CreationReservation, MAX_TA_INSTANCES, SessionEntry, SessionManager, SessionMap, + SingleInstanceCache, TaInstance, allocate_session_id, }; const MAX_KERNEL_BUF_SIZE: usize = 0x80_000; diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 86518a373..0e6608231 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -94,8 +94,8 @@ use crate::{LoadedProgram, OpteeShim, SessionIdPool}; use alloc::sync::Arc; -use hashbrown::HashMap; -use litebox_common_optee::{TaFlags, TeeUuid}; +use hashbrown::{HashMap, HashSet}; +use litebox_common_optee::{OpteeSmcReturnCode, TaFlags, TeeUuid}; use spin::mutex::SpinMutex; /// Maximum number of concurrent TA instances to avoid out of memory situations. @@ -315,6 +315,30 @@ impl Drop for SessionIdGuard { } } +/// Result of [`SessionManager::reserve_creation_slot`]. +pub enum CreationReservation { + /// An existing single-instance TA was found (another core cached it + /// between our initial lookup and the reservation). Reuse this instance. + ExistingSingleInstance(Arc>), + /// A slot was reserved. The caller MUST call + /// [`SessionManager::release_creation_slot`] when done (success or failure). + SlotReserved, +} + +/// State for coordinating concurrent instance creation. +/// +/// Guarded by a single lock to provide atomic capacity checks and +/// duplicate-UUID prevention. +struct CreationState { + /// UUIDs currently being loaded. Prevents two cores from simultaneously + /// creating a new instance for the same TA UUID (which would violate the + /// single-instance invariant if the TA turns out to have that flag). + pending_uuids: HashSet, + /// Number of instances currently being created (not yet registered). + /// Added to [`SessionManager::instance_count`] for accurate capacity checks. + pending_count: usize, +} + /// Session manager that coordinates session and instance lifecycle. /// /// This provides a unified interface for: @@ -326,6 +350,8 @@ pub struct SessionManager { sessions: SessionMap, /// Cache of single-instance TAs by UUID. single_instance_cache: SingleInstanceCache, + /// Coordination state for concurrent instance creation. + creation_state: SpinMutex, } impl SessionManager { @@ -334,6 +360,10 @@ impl SessionManager { Self { sessions: SessionMap::new(), single_instance_cache: SingleInstanceCache::new(), + creation_state: SpinMutex::new(CreationState { + pending_uuids: HashSet::new(), + pending_count: 0, + }), } } @@ -418,6 +448,56 @@ impl SessionManager { pub fn is_at_capacity(&self) -> bool { self.instance_count() >= MAX_TA_INSTANCES } + + /// Atomically reserve a slot for creating a new TA instance. + /// + /// This performs the following checks under a single lock: + /// 1. Re-checks the single-instance cache (handles TOCTOU with the + /// caller's earlier `get_single_instance` check). + /// 2. Rejects if another core is already loading the same UUID + /// (returns `EThreadLimit` so the driver retries). + /// 3. Rejects if the instance count (including pending) would exceed + /// `MAX_TA_INSTANCES` (returns `ENomem`). + /// + /// On `SlotReserved`, the caller **must** call [`release_creation_slot`] + /// on both success and error paths. + pub fn reserve_creation_slot( + &self, + uuid: &TeeUuid, + ) -> Result { + let mut state = self.creation_state.lock(); + + // Re-check single-instance cache under the creation lock to close + // the TOCTOU window between the caller's get_single_instance() and here. + if let Some(existing) = self.single_instance_cache.get(uuid) { + return Ok(CreationReservation::ExistingSingleInstance(existing)); + } + + // Another core is already loading this UUID — tell the driver to retry. + if state.pending_uuids.contains(uuid) { + return Err(OpteeSmcReturnCode::EThreadLimit); + } + + // Capacity check including in-flight creations. + let total = self.instance_count() + state.pending_count; + if total >= MAX_TA_INSTANCES { + return Err(OpteeSmcReturnCode::ENomem); + } + + state.pending_uuids.insert(*uuid); + state.pending_count += 1; + Ok(CreationReservation::SlotReserved) + } + + /// Release a creation slot previously reserved by [`reserve_creation_slot`]. + /// + /// Must be called exactly once for every `SlotReserved` result, on both + /// success and error paths. + pub fn release_creation_slot(&self, uuid: &TeeUuid) { + let mut state = self.creation_state.lock(); + state.pending_uuids.remove(uuid); + state.pending_count = state.pending_count.saturating_sub(1); + } } impl Default for SessionManager { From 98e17b1db98fa33e31ee551695c5f82400044aa3 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Wed, 11 Mar 2026 23:26:13 +0000 Subject: [PATCH 2/4] fix concurrency bugs --- litebox_runner_lvbs/src/lib.rs | 34 ++++++------- litebox_shim_optee/src/session.rs | 79 +++++++++++++++++-------------- 2 files changed, 58 insertions(+), 55 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 427abb34a..9f798e4c9 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -501,7 +501,17 @@ fn handle_open_session( // Slow path: atomically reserve a creation slot. This re-checks the // single-instance cache under a lock to close the TOCTOU window, prevents // duplicate UUID loading, and enforces the instance capacity limit. - match session_manager().reserve_creation_slot(&ta_uuid)? { + // The slot is automatically released when the closure returns. + match session_manager().with_creation_slot(&ta_uuid, || { + open_session_new_instance( + msg_args, + msg_args_phys_addr, + existing, + params, + ta_uuid, + &ta_req_info, + ) + })? { CreationReservation::ExistingSingleInstance(existing) => open_session_single_instance( msg_args, msg_args_phys_addr, @@ -510,20 +520,7 @@ fn handle_open_session( ta_uuid, &ta_req_info, ), - CreationReservation::SlotReserved => { - let result = open_session_new_instance( - msg_args, - msg_args_phys_addr, - params, - ta_uuid, - client_identity, - &ta_req_info, - ); - // Release the creation slot on both success and failure. - // On success the instance is now tracked by session/cache maps. - session_manager().release_creation_slot(&ta_uuid); - result - } + CreationReservation::SlotReserved => Ok(()), } } @@ -701,9 +698,8 @@ fn open_session_single_instance( /// Create a new TA instance for a session. /// -/// The caller must have already reserved a creation slot via -/// [`SessionManager::reserve_creation_slot`] and is responsible for -/// calling [`SessionManager::release_creation_slot`] after this returns. +/// The caller must invoke this inside [`SessionManager::with_creation_slot`] +/// to ensure a creation slot is held during execution and released afterward. /// /// If ldelf loading or OpenSession entry point fails, the page table is torn down. /// Per OP-TEE OS semantics: if OpenSession returns non-success, cleanup happens. @@ -715,7 +711,7 @@ fn open_session_new_instance( client_identity: Option, ta_req_info: &litebox_shim_optee::msg_handler::TaRequestInfo, ) -> Result<(), OpteeSmcReturnCode> { - // Instance capacity is enforced by reserve_creation_slot() in the caller. + // Instance capacity is enforced by with_creation_slot() in the caller. // Create and switch to new page table let task_pt_id = create_task_page_table()?; diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 0e6608231..c41c6be7f 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -315,13 +315,12 @@ impl Drop for SessionIdGuard { } } -/// Result of [`SessionManager::reserve_creation_slot`]. +/// Result of [`SessionManager::with_creation_slot`]. pub enum CreationReservation { /// An existing single-instance TA was found (another core cached it /// between our initial lookup and the reservation). Reuse this instance. ExistingSingleInstance(Arc>), - /// A slot was reserved. The caller MUST call - /// [`SessionManager::release_creation_slot`] when done (success or failure). + /// The creation closure ran successfully inside the reserved slot. SlotReserved, } @@ -449,9 +448,9 @@ impl SessionManager { self.instance_count() >= MAX_TA_INSTANCES } - /// Atomically reserve a slot for creating a new TA instance. + /// Atomically reserve a creation slot, run the closure, then release. /// - /// This performs the following checks under a single lock: + /// Performs the following checks under a single lock before calling `f`: /// 1. Re-checks the single-instance cache (handles TOCTOU with the /// caller's earlier `get_single_instance` check). /// 2. Rejects if another core is already loading the same UUID @@ -459,44 +458,52 @@ impl SessionManager { /// 3. Rejects if the instance count (including pending) would exceed /// `MAX_TA_INSTANCES` (returns `ENomem`). /// - /// On `SlotReserved`, the caller **must** call [`release_creation_slot`] - /// on both success and error paths. - pub fn reserve_creation_slot( + /// If an existing single-instance TA is found in step 1, returns + /// `CreationReservation::ExistingSingleInstance` without calling `f`. + /// Otherwise, calls `f` while the slot is reserved and guarantees the + /// slot is released when `f` returns (success or error). + pub fn with_creation_slot( &self, uuid: &TeeUuid, - ) -> Result { - let mut state = self.creation_state.lock(); - - // Re-check single-instance cache under the creation lock to close - // the TOCTOU window between the caller's get_single_instance() and here. - if let Some(existing) = self.single_instance_cache.get(uuid) { - return Ok(CreationReservation::ExistingSingleInstance(existing)); + f: F, + ) -> Result + where + F: FnOnce() -> Result<(), OpteeSmcReturnCode>, + { + { + let mut state = self.creation_state.lock(); + + // Re-check single-instance cache under the creation lock to close + // the TOCTOU window between the caller's get_single_instance() and here. + if let Some(existing) = self.single_instance_cache.get(uuid) { + return Ok(CreationReservation::ExistingSingleInstance(existing)); + } + + // Another core is already loading this UUID. Tell the VTL0 driver to retry. + if state.pending_uuids.contains(uuid) { + return Err(OpteeSmcReturnCode::EThreadLimit); + } + + // Capacity check including in-flight creations. + let total = self.instance_count() + state.pending_count; + if total >= MAX_TA_INSTANCES { + return Err(OpteeSmcReturnCode::ENomem); + } + + state.pending_uuids.insert(*uuid); + state.pending_count += 1; + // Lock drops here } - // Another core is already loading this UUID — tell the driver to retry. - if state.pending_uuids.contains(uuid) { - return Err(OpteeSmcReturnCode::EThreadLimit); - } + let result = f(); - // Capacity check including in-flight creations. - let total = self.instance_count() + state.pending_count; - if total >= MAX_TA_INSTANCES { - return Err(OpteeSmcReturnCode::ENomem); + { + let mut state = self.creation_state.lock(); + state.pending_uuids.remove(uuid); + state.pending_count = state.pending_count.saturating_sub(1); } - state.pending_uuids.insert(*uuid); - state.pending_count += 1; - Ok(CreationReservation::SlotReserved) - } - - /// Release a creation slot previously reserved by [`reserve_creation_slot`]. - /// - /// Must be called exactly once for every `SlotReserved` result, on both - /// success and error paths. - pub fn release_creation_slot(&self, uuid: &TeeUuid) { - let mut state = self.creation_state.lock(); - state.pending_uuids.remove(uuid); - state.pending_count = state.pending_count.saturating_sub(1); + result.map(|()| CreationReservation::SlotReserved) } } From 2a62475327bb96ce61211ee0a7bf5f735c7f665f Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Thu, 12 Mar 2026 00:10:44 +0000 Subject: [PATCH 3/4] separate single and multi-instance TAs --- litebox_runner_lvbs/src/lib.rs | 81 ++++++++++++------------------- litebox_shim_optee/src/session.rs | 73 +++++++++++++++++----------- 2 files changed, 78 insertions(+), 76 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 9f798e4c9..de8c3abd4 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -486,29 +486,37 @@ fn handle_open_session( let client_identity = ta_req_info.client_identity; let params = &ta_req_info.params; - // Fast path: reuse a cached single-instance TA if one exists. - if let Some(existing) = session_manager().get_single_instance(&ta_uuid) { - return open_session_single_instance( - msg_args, - msg_args_phys_addr, - existing, - params, - ta_uuid, - &ta_req_info, - ); + // Look up cached TA flags to determine single vs multi-instance. + // For the first-ever load of a UUID (no cached flags), conservatively + // assume single-instance to preserve all safety invariants. + let is_single_instance = session_manager() + .get_known_flags(&ta_uuid) + .is_none_or(|f| f.is_single_instance()); + + if is_single_instance { + // Fast path: Reuse a cached single-instance TA if one exists. + if let Some(existing) = session_manager().get_single_instance(&ta_uuid) { + return open_session_single_instance( + msg_args, + msg_args_phys_addr, + existing, + params, + ta_uuid, + &ta_req_info, + ); + } } - // Slow path: atomically reserve a creation slot. This re-checks the - // single-instance cache under a lock to close the TOCTOU window, prevents - // duplicate UUID loading, and enforces the instance capacity limit. - // The slot is automatically released when the closure returns. - match session_manager().with_creation_slot(&ta_uuid, || { + // Create a new TA instance. For single-instance TAs, this also re-checks the cache + // under the lock and prevents concurrent instance creation of the same UUID. + // For multi-instance TAs, only the global capacity limit is enforced. + match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { open_session_new_instance( msg_args, msg_args_phys_addr, - existing, params, ta_uuid, + client_identity, &ta_req_info, ) })? { @@ -603,9 +611,6 @@ fn open_session_single_instance( let return_code: u32 = ctx.rax.truncate(); let return_code = TeeResult::try_from(return_code).unwrap_or(TeeResult::GenericError); - // Drop the lock before potential cleanup - drop(instance); - // Per OP-TEE OS: if OpenSession fails, don't register the session // Reference: tee_ta_open_session() in tee_ta_manager.c if return_code != TeeResult::Success { @@ -639,16 +644,10 @@ fn open_session_single_instance( Some(ta_req_info), )?; - // Re-acquire the TA lock BEFORE removing from cache and - // tearing down, so concurrent open_session_single_instance - // callers that cloned this Arc will fail try_lock(). - let instance = instance_arc.lock(); - session_manager().remove_single_instance(&ta_uuid); // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. - // The lock is held, so no other core can enter the TA. unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; drop(instance); @@ -661,6 +660,8 @@ fn open_session_single_instance( } } + drop(instance); + // Write error response back to normal world write_msg_args_to_normal_world( msg_args, @@ -674,6 +675,8 @@ fn open_session_single_instance( return Ok(()); } + drop(instance); + // Success: register session and disarm the guard (ownership transfers to session map) // Safe to unwrap: guard has not been disarmed yet. let runner_session_id = session_id_guard.disarm().unwrap(); @@ -711,8 +714,6 @@ fn open_session_new_instance( client_identity: Option, ta_req_info: &litebox_shim_optee::msg_handler::TaRequestInfo, ) -> Result<(), OpteeSmcReturnCode> { - // Instance capacity is enforced by with_creation_slot() in the caller. - // Create and switch to new page table let task_pt_id = create_task_page_table()?; @@ -874,7 +875,6 @@ fn open_session_new_instance( // no references to user-space memory will be held afterwards. unsafe { teardown_ta_page_table(&shim, task_pt_id) }; - // session_id_guard drops here, recycling the session ID return Ok(()); } @@ -999,9 +999,6 @@ fn handle_invoke_command( let ta_flags = session_entry.ta_flags; let instance_arc = session_entry.instance.clone(); - // Drop the instance lock before cleanup - drop(instance); - // Remove the session from the map session_manager().unregister_session(session_id); @@ -1023,13 +1020,6 @@ fn handle_invoke_command( )?; if is_last_session { - // Re-acquire the TA lock BEFORE removing from cache and tearing - // down. While we hold this lock, any concurrent - // open_session_single_instance will fail try_lock() and return - // EThreadLimit, preventing use of a page table that is about to - // be destroyed. - let instance = instance_arc.lock(); - // Clear single-instance cache if applicable if ta_flags.is_single_instance() { session_manager().remove_single_instance(&ta_uuid); @@ -1050,6 +1040,8 @@ fn handle_invoke_command( // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just // cleaning it up. Currently we always clean up on panic. + } else { + drop(instance); } return Ok(()); @@ -1135,9 +1127,6 @@ fn handle_close_session( // Clone the instance Arc before dropping the lock for later cleanup check let instance_arc = session_entry.instance.clone(); - // Drop the instance lock before removing from map - drop(instance); - // Remove the session entry from the map let removed_entry = session_manager().unregister_session(session_id); @@ -1153,6 +1142,7 @@ fn handle_close_session( // If this is a single-instance TA with keep_alive flag, don't remove it from memory. // Note: keep_alive is only meaningful for single-instance TAs. if entry.ta_flags.is_single_instance() && entry.ta_flags.is_keep_alive() { + drop(instance); debug_serial_println!( "CloseSession complete: session_id={}, TA kept alive (INSTANCE_KEEP_ALIVE flag)", session_id @@ -1160,14 +1150,6 @@ fn handle_close_session( return Ok(()); } - // Re-acquire the TA lock BEFORE removing from cache and tearing - // down. While we hold this lock, any concurrent - // open_session_single_instance will fail try_lock() and return - // EThreadLimit, preventing use of a page table that is about to - // be destroyed. - let instance = entry.instance.lock(); - let task_pt_id = instance.task_page_table_id; - // Clear single-instance cache if this was a single-instance TA if entry.ta_flags.is_single_instance() { session_manager().remove_single_instance(&entry.ta_uuid); @@ -1188,6 +1170,7 @@ fn handle_close_session( ); } } else { + drop(instance); debug_serial_println!( "CloseSession complete: session_id={}, other sessions remaining on TA", session_id diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index c41c6be7f..ce758548a 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -329,9 +329,10 @@ pub enum CreationReservation { /// Guarded by a single lock to provide atomic capacity checks and /// duplicate-UUID prevention. struct CreationState { - /// UUIDs currently being loaded. Prevents two cores from simultaneously - /// creating a new instance for the same TA UUID (which would violate the - /// single-instance invariant if the TA turns out to have that flag). + /// UUIDs of single-instance TAs currently being loaded. Prevents multiple cores + /// from simultaneously creating a new instance for the same single-instance + /// TA UUID (which would violate the single-instance invariant). + /// Multi-instance TAs are not tracked here. They can be created concurrently. pending_uuids: HashSet, /// Number of instances currently being created (not yet registered). /// Added to [`SessionManager::instance_count`] for accurate capacity checks. @@ -351,6 +352,8 @@ pub struct SessionManager { single_instance_cache: SingleInstanceCache, /// Coordination state for concurrent instance creation. creation_state: SpinMutex, + /// Cached TA flags by UUID, populated on first successful session registration. + known_flags: SpinMutex>, } impl SessionManager { @@ -363,6 +366,7 @@ impl SessionManager { pending_uuids: HashSet::new(), pending_count: 0, }), + known_flags: SpinMutex::new(HashMap::new()), } } @@ -396,6 +400,14 @@ impl SessionManager { self.sessions.get_entry(session_id) } + /// Look up previously observed TA flags for a UUID. + /// + /// Returns `None` if this UUID has never been successfully loaded. + /// Callers should conservatively assume single-instance when `None`. + pub fn get_known_flags(&self, uuid: &TeeUuid) -> Option { + self.known_flags.lock().get(uuid).copied() + } + /// Register a new session. pub fn register_session( &self, @@ -404,6 +416,7 @@ impl SessionManager { ta_uuid: TeeUuid, ta_flags: TaFlags, ) { + self.known_flags.lock().entry(ta_uuid).or_insert(ta_flags); self.sessions .insert(session_id, instance, ta_uuid, ta_flags); } @@ -448,23 +461,21 @@ impl SessionManager { self.instance_count() >= MAX_TA_INSTANCES } - /// Atomically reserve a creation slot, run the closure, then release. + /// Atomically reserve a creation slot and run `f` to create a new TA instance. + /// + /// Behavior depends on whether the TA is: /// - /// Performs the following checks under a single lock before calling `f`: - /// 1. Re-checks the single-instance cache (handles TOCTOU with the - /// caller's earlier `get_single_instance` check). - /// 2. Rejects if another core is already loading the same UUID - /// (returns `EThreadLimit` so the driver retries). - /// 3. Rejects if the instance count (including pending) would exceed - /// `MAX_TA_INSTANCES` (returns `ENomem`). + /// - **Single-instance**: Re-checks the single-instance cache under the lock to + /// close TOCTOU windows, and prevents duplicate concurrent creation of + /// the same UUID via `pending_uuids`. /// - /// If an existing single-instance TA is found in step 1, returns - /// `CreationReservation::ExistingSingleInstance` without calling `f`. - /// Otherwise, calls `f` while the slot is reserved and guarantees the - /// slot is released when `f` returns (success or error). + /// - **Multi-instance**: Each session gets its own independent TA instance, + /// matching OP-TEE OS behavior. Multiple cores may create instances of + /// the same UUID concurrently. pub fn with_creation_slot( &self, uuid: &TeeUuid, + is_single_instance: bool, f: F, ) -> Result where @@ -473,15 +484,20 @@ impl SessionManager { { let mut state = self.creation_state.lock(); - // Re-check single-instance cache under the creation lock to close - // the TOCTOU window between the caller's get_single_instance() and here. - if let Some(existing) = self.single_instance_cache.get(uuid) { - return Ok(CreationReservation::ExistingSingleInstance(existing)); - } - - // Another core is already loading this UUID. Tell the VTL0 driver to retry. - if state.pending_uuids.contains(uuid) { - return Err(OpteeSmcReturnCode::EThreadLimit); + if is_single_instance { + // Re-check single-instance cache under the creation lock to close + // the TOCTOU window between the caller's get_single_instance() and here. + if let Some(existing) = self.single_instance_cache.get(uuid) { + return Ok(CreationReservation::ExistingSingleInstance(existing)); + } + + // Another core is currently in the middle of creating an instance + // for this single-instance UUID. The instance isn't cached yet, + // so we cannot reuse it. Return EThreadLimit to have the + // normal-world driver wait and retry. + if state.pending_uuids.contains(uuid) { + return Err(OpteeSmcReturnCode::EThreadLimit); + } } // Capacity check including in-flight creations. @@ -490,16 +506,19 @@ impl SessionManager { return Err(OpteeSmcReturnCode::ENomem); } - state.pending_uuids.insert(*uuid); + if is_single_instance { + state.pending_uuids.insert(*uuid); + } state.pending_count += 1; - // Lock drops here } let result = f(); { let mut state = self.creation_state.lock(); - state.pending_uuids.remove(uuid); + if is_single_instance { + state.pending_uuids.remove(uuid); + } state.pending_count = state.pending_count.saturating_sub(1); } From e6eaaf925a70204a023dc8f838ebcbb106fcb4ad Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Sat, 14 Mar 2026 23:20:08 +0000 Subject: [PATCH 4/4] comment --- litebox_shim_optee/src/session.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index ce758548a..0ab69f544 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -334,7 +334,8 @@ struct CreationState { /// TA UUID (which would violate the single-instance invariant). /// Multi-instance TAs are not tracked here. They can be created concurrently. pending_uuids: HashSet, - /// Number of instances currently being created (not yet registered). + /// Number of instances currently being created (not yet registered). This + /// covers both single-instance and multi-instance TAs. /// Added to [`SessionManager::instance_count`] for accurate capacity checks. pending_count: usize, }