From fd4dc12cbcab6a05db8e64c2fbda15e16c64016a Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Fri, 17 Apr 2026 21:30:58 +0000 Subject: [PATCH 01/12] fix UAF condition of single-instance TA --- litebox_runner_lvbs/src/lib.rs | 44 ++++++++++++++++++++++--------- litebox_shim_optee/src/session.rs | 6 ++++- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index ee598e4be..0eb56074c 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -547,10 +547,17 @@ fn open_session_single_instance( ) -> Result<(), OpteeSmcReturnCode> { // Use try_lock to avoid spinning - return EThreadLimit if TA is in use // The Linux driver will handle this by waiting and retrying - let instance = instance_arc + let mut instance = instance_arc .try_lock() .ok_or(OpteeSmcReturnCode::EThreadLimit)?; + // `task_page_table_id == None` means the instance was destroyed by a concurrent + // close/panic between our cache lookup and acquiring the instance lock. + // Return `EThreadLimit` so the Linux driver retries. + let task_pt_id = instance + .task_page_table_id + .ok_or(OpteeSmcReturnCode::EThreadLimit)?; + // Allocate session ID BEFORE calling load_ta_context so TA gets correct ID. // Use SessionIdGuard to ensure the ID is recycled on any error path // (before it is registered with the session manager). @@ -562,11 +569,10 @@ fn open_session_single_instance( debug_serial_println!( "Reusing single-instance TA: uuid={:?}, task_pt_id={}, session_id={}", ta_uuid, - instance.task_page_table_id, + task_pt_id, runner_session_id ); - let task_pt_id = instance.task_page_table_id; let ta_flags = instance.loaded_program.ta_flags; // Switch to the existing TA's page table @@ -642,6 +648,9 @@ fn open_session_single_instance( )?; session_manager().remove_single_instance(&ta_uuid); + // Invalidate the id in the struct *before* teardown so any + // future lock holder observes `None` and bails. + instance.task_page_table_id = None; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -780,9 +789,6 @@ fn open_session_new_instance( "ldelf/TA_CreateEntryPoint failed: return_code={:?}", ldelf_return_code ); - // Safety: We are about to tear down this TA instance; - // no references to user-space memory will be held afterwards. - unsafe { teardown_ta_page_table(&shim, task_pt_id) }; // Write error response back to normal world write_msg_args_to_normal_world( @@ -794,6 +800,10 @@ fn open_session_new_instance( Some(ta_req_info), )?; + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + return Ok(()); } @@ -879,7 +889,7 @@ fn open_session_new_instance( let instance = Arc::new(SpinMutex::new(TaInstance { shim, loaded_program, - task_page_table_id: task_pt_id, + task_page_table_id: Some(task_pt_id), })); // Cache single-instance TAs for future sessions @@ -936,11 +946,14 @@ fn handle_invoke_command( .ok_or(OpteeSmcReturnCode::EBadCmd)?; // Use try_lock to avoid spinning - return EThreadLimit if TA is in use // The Linux driver will handle this by waiting and retrying - let Some(instance) = session_entry.instance.try_lock() else { + let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - - let task_pt_id = instance.task_page_table_id; + // `task_page_table_id == None` means the instance was destroyed. + // Return `EBadCmd` because the client must start over with OpenSession. + let task_pt_id = instance + .task_page_table_id + .ok_or(OpteeSmcReturnCode::EBadCmd)?; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? }; @@ -1021,6 +1034,7 @@ fn handle_invoke_command( if ta_flags.is_single_instance() { session_manager().remove_single_instance(&ta_uuid); } + instance.task_page_table_id = None; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -1079,11 +1093,14 @@ fn handle_close_session( .ok_or(OpteeSmcReturnCode::EBadCmd)?; // Use try_lock to avoid spinning - return EThreadLimit if TA is in use // The Linux driver will handle this by waiting and retrying - let Some(instance) = session_entry.instance.try_lock() else { + let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - - let task_pt_id = instance.task_page_table_id; + // `task_page_table_id == None` means the instance was destroyed. + // Return `EBadCmd` because the client must start over with OpenSession. + let task_pt_id = instance + .task_page_table_id + .ok_or(OpteeSmcReturnCode::EBadCmd)?; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? }; @@ -1151,6 +1168,7 @@ fn handle_close_session( if entry.ta_flags.is_single_instance() { session_manager().remove_single_instance(&entry.ta_uuid); } + instance.task_page_table_id = None; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 0ab69f544..974a7c58e 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -116,7 +116,11 @@ pub struct TaInstance { /// after initialization because it contains internal state that may not survive moves. pub loaded_program: alloc::boxed::Box, /// The task page table ID associated with this TA instance. - pub task_page_table_id: usize, + /// + /// `None` after the page table has been torn down. Any lock holder that + /// observes `None` must treat the instance as destroyed and bail out + /// without touching the page table. + pub task_page_table_id: Option, } // SAFETY: TaInstance is protected by SpinMutex and try_lock (`SessionEntry`) From 5203fa17c644933a45e60d71da8f73a83c8a0764 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 21 Apr 2026 03:29:27 +0000 Subject: [PATCH 02/12] add fall-through path for a concurrently closed single-instance TA --- litebox_runner_lvbs/src/lib.rs | 137 ++++++++++++++++++++---------- litebox_shim_optee/src/session.rs | 25 ++++++ 2 files changed, 119 insertions(+), 43 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 0eb56074c..91c325166 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -474,6 +474,8 @@ fn handle_open_session( msg_args: &mut OpteeMsgArgs, msg_args_phys_addr: u64, ) -> Result<(), OpteeSmcReturnCode> { + const MAX_OPEN_SESSION_RETRIES: u32 = 4; + let ta_req_info = decode_ta_request(msg_args).map_err(|_| OpteeSmcReturnCode::EBadCmd)?; if ta_req_info.entry_func != UteeEntryFunc::OpenSession { return Err(OpteeSmcReturnCode::EBadCmd); @@ -493,47 +495,80 @@ fn handle_open_session( if is_single_instance { // Fast path: Reuse a cached single-instance TA if one exists. if let Some(existing) = session_manager().get_single_instance(&ta_uuid) { - return open_session_single_instance( + match open_session_single_instance( msg_args, msg_args_phys_addr, - existing, + existing.clone(), params, ta_uuid, &ta_req_info, - ); + )? { + OpenSessionOutcome::Done => return Ok(()), + // Cached instance was torn down in a race with a concurrent + // close/panic. Drop the stale entry from the cache only if + // still the same Arc and fall through. + OpenSessionOutcome::InstanceDestroyed => { + session_manager().remove_single_instance_if_same(&ta_uuid, &existing); + } + } } } - // Create a new TA instance. For single-instance TAs, this also re-checks the cache - // under the lock and prevents concurrent instance creation of the same UUID. - // For multi-instance TAs, only the global capacity limit is enforced. - match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { - open_session_new_instance( - msg_args, - msg_args_phys_addr, - params, - ta_uuid, - client_identity, - &ta_req_info, - ) - })? { - CreationReservation::ExistingSingleInstance(existing) => open_session_single_instance( - msg_args, - msg_args_phys_addr, - existing, - params, - ta_uuid, - &ta_req_info, - ), - CreationReservation::SlotReserved => Ok(()), + // Create a new TA instance, or reuse a freshly-cached one. For + // single-instance TAs, `with_creation_slot` re-checks the cache under + // its lock and serializes instance creation per UUID. + // + // If we observe a destroyed instance via `ExistingSingleInstance`, we + // clear the stale cache entry and loop. In normal operation the + // destroyer removes the cache entry before setting + // `task_page_table_id = None`, so this loop terminates in at + // most one extra iteration. The bounded retry cap guards + // against pathological cases where other cores repeatedly install and + // destroy fresh instances for the same UUID. After the cap we fall + // back to `EThreadLimit` so the driver retries the whole call. + for _ in 0..MAX_OPEN_SESSION_RETRIES { + match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { + open_session_new_instance( + msg_args, + msg_args_phys_addr, + params, + ta_uuid, + client_identity, + &ta_req_info, + ) + })? { + CreationReservation::ExistingSingleInstance(existing) => { + match open_session_single_instance( + msg_args, + msg_args_phys_addr, + existing.clone(), + params, + ta_uuid, + &ta_req_info, + )? { + OpenSessionOutcome::Done => return Ok(()), + OpenSessionOutcome::InstanceDestroyed => { + session_manager().remove_single_instance_if_same(&ta_uuid, &existing); + } + } + } + CreationReservation::SlotReserved => return Ok(()), + } } + + Err(OpteeSmcReturnCode::EThreadLimit) +} + +/// Outcome of [`open_session_single_instance`]. +enum OpenSessionOutcome { + /// Session was successfully opened (or a TA-level error was written back). + Done, + /// The cached `TaInstance` was torn down concurrently. + InstanceDestroyed, } /// Open a new session on an existing single-instance TA. /// -/// Returns `Err(OpteeSmcReturnCode::EThreadLimit)` if the TA instance is currently in use. -/// The Linux driver will wait and retry automatically. -/// /// If the TA's OpenSession entry point returns an error, the session is not registered. /// For cleanup semantics, see OP-TEE OS `tee_ta_open_session()` in `tee_ta_manager.c`. #[allow(clippy::type_complexity)] @@ -544,19 +579,19 @@ fn open_session_single_instance( params: &[litebox_common_optee::UteeParamOwned], ta_uuid: litebox_common_optee::TeeUuid, ta_req_info: &litebox_shim_optee::msg_handler::TaRequestInfo, -) -> Result<(), OpteeSmcReturnCode> { +) -> Result { // Use try_lock to avoid spinning - return EThreadLimit if TA is in use // The Linux driver will handle this by waiting and retrying let mut instance = instance_arc .try_lock() .ok_or(OpteeSmcReturnCode::EThreadLimit)?; - // `task_page_table_id == None` means the instance was destroyed by a concurrent - // close/panic between our cache lookup and acquiring the instance lock. - // Return `EThreadLimit` so the Linux driver retries. - let task_pt_id = instance - .task_page_table_id - .ok_or(OpteeSmcReturnCode::EThreadLimit)?; + // `task_page_table_id == None` means the instance was destroyed by a + // concurrent close/panic between our cache lookup and acquiring the + // instance lock. + let Some(task_pt_id) = instance.task_page_table_id else { + return Ok(OpenSessionOutcome::InstanceDestroyed); + }; // Allocate session ID BEFORE calling load_ta_context so TA gets correct ID. // Use SessionIdGuard to ensure the ID is recycled on any error path @@ -662,7 +697,7 @@ fn open_session_single_instance( // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just // cleaning it up. Currently we always clean up on panic. - return Ok(()); + return Ok(OpenSessionOutcome::Done); } } @@ -678,7 +713,7 @@ fn open_session_single_instance( Some(ta_req_info), )?; - return Ok(()); + return Ok(OpenSessionOutcome::Done); } drop(instance); @@ -702,7 +737,7 @@ fn open_session_single_instance( runner_session_id ); - Ok(()) + Ok(OpenSessionOutcome::Done) } /// Create a new TA instance for a session. @@ -1096,11 +1131,27 @@ fn handle_close_session( let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - // `task_page_table_id == None` means the instance was destroyed. - // Return `EBadCmd` because the client must start over with OpenSession. - let task_pt_id = instance - .task_page_table_id - .ok_or(OpteeSmcReturnCode::EBadCmd)?; + // `task_page_table_id == None` means the TA instance was already torn + // down (e.g., a prior InvokeCommand panicked with TARGET_DEAD and cleaned + // it up). From the client's perspective the session no longer exists, + // so CloseSession is trivially successful. + let Some(task_pt_id) = instance.task_page_table_id else { + drop(instance); + session_manager().unregister_session(session_id); + write_msg_args_to_normal_world( + msg_args, + msg_args_phys_addr, + TeeResult::Success, + None, + None, + None, + )?; + debug_serial_println!( + "CloseSession complete: session_id={}, TA instance already destroyed", + session_id + ); + return Ok(()); + }; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? }; diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 974a7c58e..04bec7616 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -246,6 +246,19 @@ impl SingleInstanceCache { self.inner.lock().remove(uuid) } + /// Remove a cached single-instance TA by UUID, but only if the currently + /// cached entry is the same `Arc` as `expected`. + pub fn remove_if_same(&self, uuid: &TeeUuid, expected: &Arc>) -> bool { + let mut guard = self.inner.lock(); + match guard.get(uuid) { + Some(current) if Arc::ptr_eq(current, expected) => { + guard.remove(uuid); + true + } + _ => false, + } + } + /// Get the number of cached single-instance TAs. pub fn len(&self) -> usize { self.inner.lock().len() @@ -440,6 +453,18 @@ impl SessionManager { self.single_instance_cache.remove(uuid) } + /// Remove a single-instance TA from the cache only if the currently + /// cached `Arc` is the same as `expected`. + /// + /// See [`SingleInstanceCache::remove_if_same`]. + pub fn remove_single_instance_if_same( + &self, + uuid: &TeeUuid, + expected: &Arc>, + ) -> bool { + self.single_instance_cache.remove_if_same(uuid, expected) + } + /// Get the total count of unique TA instances (for limit checking). /// /// This counts: From a6a4e84f00a967de32c5662ce89e22dba04a595c Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 21 Apr 2026 03:58:12 +0000 Subject: [PATCH 03/12] tear down target_dead --- litebox_runner_lvbs/src/lib.rs | 79 ++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 91c325166..2e52a6ee4 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -984,11 +984,14 @@ fn handle_invoke_command( let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - // `task_page_table_id == None` means the instance was destroyed. - // Return `EBadCmd` because the client must start over with OpenSession. - let task_pt_id = instance - .task_page_table_id - .ok_or(OpteeSmcReturnCode::EBadCmd)?; + // `task_page_table_id == None` means the TA instance was already torn + // down (e.g., a prior InvokeCommand panicked with TARGET_DEAD). The + // session is orphaned. + let Some(task_pt_id) = instance.task_page_table_id else { + drop(instance); + session_manager().unregister_session(session_id); + return Err(OpteeSmcReturnCode::EBadCmd); + }; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? }; @@ -1032,8 +1035,17 @@ fn handle_invoke_command( let return_code: u32 = ctx.rax.truncate(); let return_code = TeeResult::try_from(return_code).unwrap_or(TeeResult::GenericError); - // Per OP-TEE OS: if TA panics (TARGET_DEAD), clean up the session/instance - // Reference: tee_ta_invoke_command() in tee_ta_manager.c + // Per OP-TEE OS: if TA panics (TARGET_DEAD), the TA context is + // unrecoverable; all sessions on the same single-instance TA are + // implicitly dead (Ref: tee_ta_invoke_command() in tee_ta_manager.c). + // Tear down the instance so that any subsequent op on remaining + // sessions observes `task_page_table_id == None` and fails cleanly. + // Remaining sessions stay in the session map until their clients + // call CloseSession; at that point `handle_close_session`'s None + // path will unregister them and report Success. Note that we don't + // unregister/recycle those session IDs immediately to avoid potential + // use-after-free issues (clients don't know whether those session IDs + // are invalid). if return_code == TeeResult::TargetDead { debug_serial_println!( "InvokeCommand: TA panicked (TARGET_DEAD), session_id={}", @@ -1042,18 +1054,10 @@ fn handle_invoke_command( let ta_uuid = session_entry.ta_uuid; let ta_flags = session_entry.ta_flags; - let instance_arc = session_entry.instance.clone(); - // Remove the session from the map + // Remove this session from the map session_manager().unregister_session(session_id); - // Check if this was the last session using the TA instance by counting - // remaining sessions that reference this instance. - let remaining_sessions = session_manager() - .sessions() - .count_sessions_for_instance(&instance_arc); - let is_last_session = remaining_sessions == 0; - // Write response BEFORE switching page tables (accesses user memory) write_msg_args_to_normal_world( msg_args, @@ -1064,31 +1068,32 @@ fn handle_invoke_command( Some(&ta_req_info), )?; - if is_last_session { - // Clear single-instance cache if applicable - if ta_flags.is_single_instance() { - session_manager().remove_single_instance(&ta_uuid); - } - instance.task_page_table_id = None; + // Clear single-instance cache so new OpenSessions for this UUID + // create a fresh instance instead of hitting the zombie one. + if ta_flags.is_single_instance() { + session_manager().remove_single_instance(&ta_uuid); + } - // Safety: We are about to tear down this TA instance; - // no references to user-space memory will be held afterwards. - // The lock is held, so no other core can enter the TA. - unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; + // Invalidate the id in the struct *before* teardown so any other + // lock holder (on another session sharing this Arc) observes `None` + // and bails. + instance.task_page_table_id = None; - drop(instance); + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + // The lock is held, so no other core can enter the TA. + unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; - debug_serial_println!( - "InvokeCommand: cleaned up dead TA instance, task_pt_id={}", - task_pt_id - ); + drop(instance); - // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not - // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just - // cleaning it up. Currently we always clean up on panic. - } else { - drop(instance); - } + debug_serial_println!( + "InvokeCommand: cleaned up dead TA instance, task_pt_id={}", + task_pt_id + ); + + // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not + // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just + // cleaning it up. Currently we always clean up on panic. return Ok(()); } From 11a526eb10fbc30d0abe02acc032f8678d2fe15c Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 21 Apr 2026 04:11:55 +0000 Subject: [PATCH 04/12] let the normal world know target_dead --- litebox_runner_lvbs/src/lib.rs | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 2e52a6ee4..60ebc497e 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -518,14 +518,10 @@ fn handle_open_session( // single-instance TAs, `with_creation_slot` re-checks the cache under // its lock and serializes instance creation per UUID. // - // If we observe a destroyed instance via `ExistingSingleInstance`, we - // clear the stale cache entry and loop. In normal operation the - // destroyer removes the cache entry before setting - // `task_page_table_id = None`, so this loop terminates in at - // most one extra iteration. The bounded retry cap guards - // against pathological cases where other cores repeatedly install and - // destroy fresh instances for the same UUID. After the cap we fall - // back to `EThreadLimit` so the driver retries the whole call. + // The bounded retry cap (2 iterations suffice in steady state; 4 gives + // slack for back-to-back churn where other cores repeatedly install and + // destroy fresh instances for the same UUID) ultimately falls back to + // `EThreadLimit` so the driver retries the whole call. for _ in 0..MAX_OPEN_SESSION_RETRIES { match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { open_session_new_instance( @@ -548,6 +544,7 @@ fn handle_open_session( )? { OpenSessionOutcome::Done => return Ok(()), OpenSessionOutcome::InstanceDestroyed => { + // defense-in-depth against future reordering of destroyer steps session_manager().remove_single_instance_if_same(&ta_uuid, &existing); } } @@ -569,6 +566,9 @@ enum OpenSessionOutcome { /// Open a new session on an existing single-instance TA. /// +/// Returns `Err(OpteeSmcReturnCode::EThreadLimit)` if the TA instance is currently in use. +/// The Linux driver will wait and retry automatically. +/// /// If the TA's OpenSession entry point returns an error, the session is not registered. /// For cleanup semantics, see OP-TEE OS `tee_ta_open_session()` in `tee_ta_manager.c`. #[allow(clippy::type_complexity)] @@ -986,11 +986,23 @@ fn handle_invoke_command( }; // `task_page_table_id == None` means the TA instance was already torn // down (e.g., a prior InvokeCommand panicked with TARGET_DEAD). The - // session is orphaned. + // session is orphaned. Report TARGET_DEAD to the client. let Some(task_pt_id) = instance.task_page_table_id else { drop(instance); session_manager().unregister_session(session_id); - return Err(OpteeSmcReturnCode::EBadCmd); + write_msg_args_to_normal_world( + msg_args, + msg_args_phys_addr, + TeeResult::TargetDead, + None, + None, + None, + )?; + debug_serial_println!( + "InvokeCommand: session_id={} on already-destroyed TA instance", + session_id + ); + return Ok(()); }; // Switch to the TA instance's page table @@ -1233,7 +1245,6 @@ fn handle_close_session( // Drop the instance to release shim/loaded_program resources drop(instance); - drop(entry); debug_serial_println!( "CloseSession complete: deleted task_pt_id={} (last session)", From 4bf0677a05e9de21524ebaee018715c52023401c Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 21 Apr 2026 20:12:14 +0000 Subject: [PATCH 05/12] drop racy fast path which has less benefit --- litebox_runner_lvbs/src/lib.rs | 136 ++++++++++++------------------ litebox_shim_optee/src/session.rs | 24 +++--- 2 files changed, 64 insertions(+), 96 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 60ebc497e..23efd6ebe 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -474,8 +474,6 @@ fn handle_open_session( msg_args: &mut OpteeMsgArgs, msg_args_phys_addr: u64, ) -> Result<(), OpteeSmcReturnCode> { - const MAX_OPEN_SESSION_RETRIES: u32 = 4; - let ta_req_info = decode_ta_request(msg_args).map_err(|_| OpteeSmcReturnCode::EBadCmd)?; if ta_req_info.entry_func != UteeEntryFunc::OpenSession { return Err(OpteeSmcReturnCode::EBadCmd); @@ -492,9 +490,23 @@ fn handle_open_session( .get_known_flags(&ta_uuid) .is_none_or(|f| f.is_single_instance()); - if is_single_instance { - // Fast path: Reuse a cached single-instance TA if one exists. - if let Some(existing) = session_manager().get_single_instance(&ta_uuid) { + // Resolve or create the TA instance. + // For single-instance TAs, `with_creation_slot` re-checks the cache + // under its lock and serializes instance creation per UUID. + // If a cache hit returns a zombie (an instance torn down by a + // concurrent close/panic), we mirror OP-TEE's `tee_ta_open_session` + // behavior: evict the dead entry and report `TargetDead` to the caller. + match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { + open_session_new_instance( + msg_args, + msg_args_phys_addr, + params, + ta_uuid, + client_identity, + &ta_req_info, + ) + })? { + CreationReservation::ExistingSingleInstance(existing) => { match open_session_single_instance( msg_args, msg_args_phys_addr, @@ -503,57 +515,25 @@ fn handle_open_session( ta_uuid, &ta_req_info, )? { - OpenSessionOutcome::Done => return Ok(()), - // Cached instance was torn down in a race with a concurrent - // close/panic. Drop the stale entry from the cache only if - // still the same Arc and fall through. + OpenSessionOutcome::Done => Ok(()), OpenSessionOutcome::InstanceDestroyed => { + // Evict the zombie. Analog of OP-TEE's `maybe_release_ta_ctx` + // removing the dead ctx from `tee_ctxes`. session_manager().remove_single_instance_if_same(&ta_uuid, &existing); + write_msg_args_to_normal_world( + msg_args, + msg_args_phys_addr, + TeeResult::TargetDead, + None, + None, + Some(&ta_req_info), + )?; + Ok(()) } } } + CreationReservation::SlotReserved => Ok(()), } - - // Create a new TA instance, or reuse a freshly-cached one. For - // single-instance TAs, `with_creation_slot` re-checks the cache under - // its lock and serializes instance creation per UUID. - // - // The bounded retry cap (2 iterations suffice in steady state; 4 gives - // slack for back-to-back churn where other cores repeatedly install and - // destroy fresh instances for the same UUID) ultimately falls back to - // `EThreadLimit` so the driver retries the whole call. - for _ in 0..MAX_OPEN_SESSION_RETRIES { - match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { - open_session_new_instance( - msg_args, - msg_args_phys_addr, - params, - ta_uuid, - client_identity, - &ta_req_info, - ) - })? { - CreationReservation::ExistingSingleInstance(existing) => { - match open_session_single_instance( - msg_args, - msg_args_phys_addr, - existing.clone(), - params, - ta_uuid, - &ta_req_info, - )? { - OpenSessionOutcome::Done => return Ok(()), - OpenSessionOutcome::InstanceDestroyed => { - // defense-in-depth against future reordering of destroyer steps - session_manager().remove_single_instance_if_same(&ta_uuid, &existing); - } - } - } - CreationReservation::SlotReserved => return Ok(()), - } - } - - Err(OpteeSmcReturnCode::EThreadLimit) } /// Outcome of [`open_session_single_instance`]. @@ -586,12 +566,11 @@ fn open_session_single_instance( .try_lock() .ok_or(OpteeSmcReturnCode::EThreadLimit)?; - // `task_page_table_id == None` means the instance was destroyed by a - // concurrent close/panic between our cache lookup and acquiring the - // instance lock. - let Some(task_pt_id) = instance.task_page_table_id else { + // `dead == true` means the instance was destroyed by a concurrent close/panic + if instance.dead { return Ok(OpenSessionOutcome::InstanceDestroyed); - }; + } + let task_pt_id = instance.task_page_table_id; // Allocate session ID BEFORE calling load_ta_context so TA gets correct ID. // Use SessionIdGuard to ensure the ID is recycled on any error path @@ -683,9 +662,9 @@ fn open_session_single_instance( )?; session_manager().remove_single_instance(&ta_uuid); - // Invalidate the id in the struct *before* teardown so any - // future lock holder observes `None` and bails. - instance.task_page_table_id = None; + // Mark the instance dead *before* teardown so any future + // lock holder bails without touching the invalid page table. + instance.dead = true; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -924,7 +903,8 @@ fn open_session_new_instance( let instance = Arc::new(SpinMutex::new(TaInstance { shim, loaded_program, - task_page_table_id: Some(task_pt_id), + task_page_table_id: task_pt_id, + dead: false, })); // Cache single-instance TAs for future sessions @@ -984,10 +964,9 @@ fn handle_invoke_command( let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - // `task_page_table_id == None` means the TA instance was already torn - // down (e.g., a prior InvokeCommand panicked with TARGET_DEAD). The - // session is orphaned. Report TARGET_DEAD to the client. - let Some(task_pt_id) = instance.task_page_table_id else { + // `dead == true` means the TA instance was already torn down. + // The session is orphaned. Report TARGET_DEAD to the client. + if instance.dead { drop(instance); session_manager().unregister_session(session_id); write_msg_args_to_normal_world( @@ -1003,7 +982,8 @@ fn handle_invoke_command( session_id ); return Ok(()); - }; + } + let task_pt_id = instance.task_page_table_id; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? }; @@ -1050,14 +1030,6 @@ fn handle_invoke_command( // Per OP-TEE OS: if TA panics (TARGET_DEAD), the TA context is // unrecoverable; all sessions on the same single-instance TA are // implicitly dead (Ref: tee_ta_invoke_command() in tee_ta_manager.c). - // Tear down the instance so that any subsequent op on remaining - // sessions observes `task_page_table_id == None` and fails cleanly. - // Remaining sessions stay in the session map until their clients - // call CloseSession; at that point `handle_close_session`'s None - // path will unregister them and report Success. Note that we don't - // unregister/recycle those session IDs immediately to avoid potential - // use-after-free issues (clients don't know whether those session IDs - // are invalid). if return_code == TeeResult::TargetDead { debug_serial_println!( "InvokeCommand: TA panicked (TARGET_DEAD), session_id={}", @@ -1086,10 +1058,8 @@ fn handle_invoke_command( session_manager().remove_single_instance(&ta_uuid); } - // Invalidate the id in the struct *before* teardown so any other - // lock holder (on another session sharing this Arc) observes `None` - // and bails. - instance.task_page_table_id = None; + // Mark the instance dead *before* teardown. + instance.dead = true; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -1148,11 +1118,10 @@ fn handle_close_session( let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - // `task_page_table_id == None` means the TA instance was already torn - // down (e.g., a prior InvokeCommand panicked with TARGET_DEAD and cleaned - // it up). From the client's perspective the session no longer exists, - // so CloseSession is trivially successful. - let Some(task_pt_id) = instance.task_page_table_id else { + // `dead == true` means the TA instance was already torn down. + // From the client's perspective the session no longer exists, so + // CloseSession is trivially successful. + if instance.dead { drop(instance); session_manager().unregister_session(session_id); write_msg_args_to_normal_world( @@ -1168,7 +1137,8 @@ fn handle_close_session( session_id ); return Ok(()); - }; + } + let task_pt_id = instance.task_page_table_id; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? }; @@ -1236,7 +1206,7 @@ fn handle_close_session( if entry.ta_flags.is_single_instance() { session_manager().remove_single_instance(&entry.ta_uuid); } - instance.task_page_table_id = None; + instance.dead = true; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 04bec7616..74e8a1e0c 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -115,12 +115,14 @@ pub struct TaInstance { /// Boxed to keep it at a fixed heap address - the Task inside must not be moved /// after initialization because it contains internal state that may not survive moves. pub loaded_program: alloc::boxed::Box, - /// The task page table ID associated with this TA instance. - /// - /// `None` after the page table has been torn down. Any lock holder that - /// observes `None` must treat the instance as destroyed and bail out - /// without touching the page table. - pub task_page_table_id: Option, + /// The task page table ID associated with this TA instance. Valid only + /// while `dead == false`. + pub task_page_table_id: usize, + /// Set when the TA panics or its last session closes. Any lock holder + /// should first check whether `dead == true` and if it is, bail without + /// touching `task_page_table_id`. `shim` and `loaded_program` remain + /// valid until the last `Arc` is dropped. + pub dead: bool, } // SAFETY: TaInstance is protected by SpinMutex and try_lock (`SessionEntry`) @@ -398,11 +400,6 @@ impl SessionManager { &self.single_instance_cache } - /// Get a cached single-instance TA by UUID. - pub fn get_single_instance(&self, uuid: &TeeUuid) -> Option>> { - self.single_instance_cache.get(uuid) - } - /// Cache a single-instance TA. pub fn cache_single_instance(&self, uuid: TeeUuid, instance: Arc>) { self.single_instance_cache.insert(uuid, instance); @@ -515,8 +512,9 @@ impl SessionManager { let mut state = self.creation_state.lock(); if is_single_instance { - // Re-check single-instance cache under the creation lock to close - // the TOCTOU window between the caller's get_single_instance() and here. + // Check the single-instance cache under the creation lock. A + // hit means another core finished creating the instance for + // this UUID; reuse it instead of starting a new load. if let Some(existing) = self.single_instance_cache.get(uuid) { return Ok(CreationReservation::ExistingSingleInstance(existing)); } From fae6d03cd97444bc07778b806536222a82fcc22e Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 21 Apr 2026 20:47:47 +0000 Subject: [PATCH 06/12] fix writing message back to normal world --- litebox_runner_lvbs/src/lib.rs | 61 +++++++++++++------------------ litebox_shim_optee/src/session.rs | 6 +-- 2 files changed, 26 insertions(+), 41 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 23efd6ebe..a23ce4aa5 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -520,14 +520,9 @@ fn handle_open_session( // Evict the zombie. Analog of OP-TEE's `maybe_release_ta_ctx` // removing the dead ctx from `tee_ctxes`. session_manager().remove_single_instance_if_same(&ta_uuid, &existing); - write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - TeeResult::TargetDead, - None, - None, - Some(&ta_req_info), - )?; + msg_args.ret = TeeResult::TargetDead; + msg_args.ret_origin = TeeOrigin::Tee; + write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; Ok(()) } } @@ -652,14 +647,14 @@ fn open_session_single_instance( ); // Write error response BEFORE switching page tables (accesses user memory) - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, return_code, None, // No session ID on failure Some(&ta_params), Some(ta_req_info), - )?; + ); session_manager().remove_single_instance(&ta_uuid); // Mark the instance dead *before* teardown so any future @@ -676,6 +671,7 @@ fn open_session_single_instance( // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just // cleaning it up. Currently we always clean up on panic. + write_result?; return Ok(OpenSessionOutcome::Done); } } @@ -805,19 +801,20 @@ fn open_session_new_instance( ); // Write error response back to normal world - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, ldelf_return_code, None, // No session ID on failure None, Some(ta_req_info), - )?; + ); // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + write_result?; return Ok(()); } @@ -883,19 +880,20 @@ fn open_session_new_instance( ); // Write error response back to normal world - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, return_code, None, // No session ID on failure Some(&ta_params), Some(ta_req_info), - )?; + ); // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + write_result?; return Ok(()); } @@ -969,14 +967,9 @@ fn handle_invoke_command( if instance.dead { drop(instance); session_manager().unregister_session(session_id); - write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - TeeResult::TargetDead, - None, - None, - None, - )?; + msg_args.ret = TeeResult::TargetDead; + msg_args.ret_origin = TeeOrigin::Tee; + write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; debug_serial_println!( "InvokeCommand: session_id={} on already-destroyed TA instance", session_id @@ -1043,14 +1036,14 @@ fn handle_invoke_command( session_manager().unregister_session(session_id); // Write response BEFORE switching page tables (accesses user memory) - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, return_code, None, Some(&ta_params), Some(&ta_req_info), - )?; + ); // Clear single-instance cache so new OpenSessions for this UUID // create a fresh instance instead of hitting the zombie one. @@ -1077,6 +1070,7 @@ fn handle_invoke_command( // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just // cleaning it up. Currently we always clean up on panic. + write_result?; return Ok(()); } @@ -1124,14 +1118,9 @@ fn handle_close_session( if instance.dead { drop(instance); session_manager().unregister_session(session_id); - write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - TeeResult::Success, - None, - None, - None, - )?; + msg_args.ret = TeeResult::Success; + msg_args.ret_origin = TeeOrigin::TrustedApp; + write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; debug_serial_println!( "CloseSession complete: session_id={}, TA instance already destroyed", session_id @@ -1167,14 +1156,14 @@ fn handle_close_session( } // CloseSession always succeeds (TA_CloseSessionEntryPoint returns void) - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, TeeResult::Success, None, None, None, - )?; + ); // Clone the instance Arc before dropping the lock for later cleanup check let instance_arc = session_entry.instance.clone(); @@ -1199,7 +1188,7 @@ fn handle_close_session( "CloseSession complete: session_id={}, TA kept alive (INSTANCE_KEEP_ALIVE flag)", session_id ); - return Ok(()); + return write_result; } // Clear single-instance cache if this was a single-instance TA @@ -1229,7 +1218,7 @@ fn handle_close_session( ); } - Ok(()) + write_result } /// Update msg_args with return values and write back to normal world memory. diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 74e8a1e0c..d1efd4985 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -248,9 +248,7 @@ impl SingleInstanceCache { self.inner.lock().remove(uuid) } - /// Remove a cached single-instance TA by UUID, but only if the currently - /// cached entry is the same `Arc` as `expected`. - pub fn remove_if_same(&self, uuid: &TeeUuid, expected: &Arc>) -> bool { + fn remove_if_same(&self, uuid: &TeeUuid, expected: &Arc>) -> bool { let mut guard = self.inner.lock(); match guard.get(uuid) { Some(current) if Arc::ptr_eq(current, expected) => { @@ -452,8 +450,6 @@ impl SessionManager { /// Remove a single-instance TA from the cache only if the currently /// cached `Arc` is the same as `expected`. - /// - /// See [`SingleInstanceCache::remove_if_same`]. pub fn remove_single_instance_if_same( &self, uuid: &TeeUuid, From ab4ab56c2a46e556a1db5753b2a4179001b5bc0a Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 12 May 2026 19:02:16 +0000 Subject: [PATCH 07/12] retry instead of target_dead --- litebox_runner_lvbs/src/lib.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index a23ce4aa5..bad62771e 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -520,10 +520,7 @@ fn handle_open_session( // Evict the zombie. Analog of OP-TEE's `maybe_release_ta_ctx` // removing the dead ctx from `tee_ctxes`. session_manager().remove_single_instance_if_same(&ta_uuid, &existing); - msg_args.ret = TeeResult::TargetDead; - msg_args.ret_origin = TeeOrigin::Tee; - write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; - Ok(()) + Err(OpteeSmcReturnCode::EThreadLimit) } } } From 28872a25c74b05ed97785d44c0c70cc275070885 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Thu, 14 May 2026 00:14:42 +0000 Subject: [PATCH 08/12] feedback --- litebox_runner_lvbs/src/lib.rs | 109 ++++++++++++++---------------- litebox_shim_optee/src/session.rs | 14 ++-- 2 files changed, 58 insertions(+), 65 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index bad62771e..e3933d470 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -469,7 +469,7 @@ fn optee_smc_handler(smc_args_addr: usize) -> OpteeSmcArgs { /// On success, the session is registered and msg_args is updated with the session ID. /// On failure (including TA returning error), msg_args is updated with the error code /// and appropriate cleanup is performed (page table teardown for new instances, -/// instance cleanup for TARGET_DEAD on single-instance TAs with no other sessions). +/// instance cleanup for TARGET_DEAD on single-instance TAs). fn handle_open_session( msg_args: &mut OpteeMsgArgs, msg_args_phys_addr: u64, @@ -494,8 +494,8 @@ fn handle_open_session( // For single-instance TAs, `with_creation_slot` re-checks the cache // under its lock and serializes instance creation per UUID. // If a cache hit returns a zombie (an instance torn down by a - // concurrent close/panic), we mirror OP-TEE's `tee_ta_open_session` - // behavior: evict the dead entry and report `TargetDead` to the caller. + // concurrent close/panic), evict the dead entry and ask the Linux driver + // to retry so it can create a fresh TA instance. match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { open_session_new_instance( msg_args, @@ -519,7 +519,7 @@ fn handle_open_session( OpenSessionOutcome::InstanceDestroyed => { // Evict the zombie. Analog of OP-TEE's `maybe_release_ta_ctx` // removing the dead ctx from `tee_ctxes`. - session_manager().remove_single_instance_if_same(&ta_uuid, &existing); + let _ = session_manager().remove_single_instance_if_same(&ta_uuid, &existing); Err(OpteeSmcReturnCode::EThreadLimit) } } @@ -532,7 +532,7 @@ fn handle_open_session( enum OpenSessionOutcome { /// Session was successfully opened (or a TA-level error was written back). Done, - /// The cached `TaInstance` was torn down concurrently. + /// The cached `TaInstance` is closed and must not be entered. InstanceDestroyed, } @@ -540,6 +540,7 @@ enum OpenSessionOutcome { /// /// Returns `Err(OpteeSmcReturnCode::EThreadLimit)` if the TA instance is currently in use. /// The Linux driver will wait and retry automatically. +/// Returns [`OpenSessionOutcome::InstanceDestroyed`] if the cached TA is closed. /// /// If the TA's OpenSession entry point returns an error, the session is not registered. /// For cleanup semantics, see OP-TEE OS `tee_ta_open_session()` in `tee_ta_manager.c`. @@ -558,8 +559,8 @@ fn open_session_single_instance( .try_lock() .ok_or(OpteeSmcReturnCode::EThreadLimit)?; - // `dead == true` means the instance was destroyed by a concurrent close/panic - if instance.dead { + // `closed == true` means the instance is terminal and must not be entered. + if instance.closed { return Ok(OpenSessionOutcome::InstanceDestroyed); } let task_pt_id = instance.task_page_table_id; @@ -632,49 +633,37 @@ fn open_session_single_instance( // Regular errors (access denied, bad params, etc.) don't mean the TA is dead - // it can still serve future OpenSession requests from other clients. if return_code == TeeResult::TargetDead { - // Check if any other sessions are using this instance by counting sessions - // in the session map that reference this TA instance. - let session_count = session_manager() - .sessions() - .count_sessions_for_instance(&instance_arc); + debug_serial_println!("Single-instance TA panicked during OpenSession, cleaning up"); - if session_count == 0 { - debug_serial_println!( - "Single-instance TA panicked with no other sessions, cleaning up" - ); - - // Write error response BEFORE switching page tables (accesses user memory) - let write_result = write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - return_code, - None, // No session ID on failure - Some(&ta_params), - Some(ta_req_info), - ); + // Write error response BEFORE switching page tables (accesses user memory). + // Keep the instance lock held until this completes so another core cannot + // tear down the active page table while this core is copying TA outputs. + let write_result = write_msg_args_to_normal_world( + msg_args, + msg_args_phys_addr, + return_code, + None, // No session ID on failure + Some(&ta_params), + Some(ta_req_info), + ); - session_manager().remove_single_instance(&ta_uuid); - // Mark the instance dead *before* teardown so any future - // lock holder bails without touching the invalid page table. - instance.dead = true; + let _ = session_manager().remove_single_instance_if_same(&ta_uuid, &instance_arc); + instance.closed = true; - // Safety: We are about to tear down this TA instance; - // no references to user-space memory will be held afterwards. - unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; - drop(instance); + drop(instance); - // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not - // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just - // cleaning it up. Currently we always clean up on panic. + // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not + // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just + // cleaning it up. Currently we always clean up on panic. - write_result?; - return Ok(OpenSessionOutcome::Done); - } + write_result?; + return Ok(OpenSessionOutcome::Done); } - drop(instance); - // Write error response back to normal world write_msg_args_to_normal_world( msg_args, @@ -685,11 +674,11 @@ fn open_session_single_instance( Some(ta_req_info), )?; + drop(instance); + return Ok(OpenSessionOutcome::Done); } - drop(instance); - // Success: register session and disarm the guard (ownership transfers to session map) // Safe to unwrap: guard has not been disarmed yet. let runner_session_id = session_id_guard.disarm().unwrap(); @@ -704,6 +693,8 @@ fn open_session_single_instance( Some(ta_req_info), )?; + drop(instance); + debug_serial_println!( "OpenSession complete on single-instance TA: session_id={}", runner_session_id @@ -899,7 +890,7 @@ fn open_session_new_instance( shim, loaded_program, task_page_table_id: task_pt_id, - dead: false, + closed: false, })); // Cache single-instance TAs for future sessions @@ -936,8 +927,7 @@ fn open_session_new_instance( /// Looks up the session by ID, switches to its page table, and runs the command. /// /// Per OP-TEE OS semantics: if the TA panics (returns TARGET_DEAD), the session -/// should be cleaned up. For single-instance TAs with no other sessions, the -/// entire instance is destroyed. +/// should be cleaned up. For single-instance TAs, the entire instance is destroyed. fn handle_invoke_command( msg_args: &mut OpteeMsgArgs, msg_args_phys_addr: u64, @@ -959,16 +949,16 @@ fn handle_invoke_command( let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - // `dead == true` means the TA instance was already torn down. + // `closed == true` means the TA instance is terminal and must not be entered. // The session is orphaned. Report TARGET_DEAD to the client. - if instance.dead { + if instance.closed { drop(instance); session_manager().unregister_session(session_id); msg_args.ret = TeeResult::TargetDead; msg_args.ret_origin = TeeOrigin::Tee; write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; debug_serial_println!( - "InvokeCommand: session_id={} on already-destroyed TA instance", + "InvokeCommand: session_id={} on closed TA instance", session_id ); return Ok(()); @@ -1045,11 +1035,11 @@ fn handle_invoke_command( // Clear single-instance cache so new OpenSessions for this UUID // create a fresh instance instead of hitting the zombie one. if ta_flags.is_single_instance() { - session_manager().remove_single_instance(&ta_uuid); + let _ = + session_manager().remove_single_instance_if_same(&ta_uuid, &session_entry.instance); } - // Mark the instance dead *before* teardown. - instance.dead = true; + instance.closed = true; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -1109,17 +1099,17 @@ fn handle_close_session( let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - // `dead == true` means the TA instance was already torn down. + // `closed == true` means the TA instance is terminal and must not be entered. // From the client's perspective the session no longer exists, so // CloseSession is trivially successful. - if instance.dead { + if instance.closed { drop(instance); session_manager().unregister_session(session_id); msg_args.ret = TeeResult::Success; - msg_args.ret_origin = TeeOrigin::TrustedApp; + msg_args.ret_origin = TeeOrigin::Tee; write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; debug_serial_println!( - "CloseSession complete: session_id={}, TA instance already destroyed", + "CloseSession complete: session_id={}, TA instance closed", session_id ); return Ok(()); @@ -1190,9 +1180,10 @@ fn handle_close_session( // Clear single-instance cache if this was a single-instance TA if entry.ta_flags.is_single_instance() { - session_manager().remove_single_instance(&entry.ta_uuid); + let _ = + session_manager().remove_single_instance_if_same(&entry.ta_uuid, &instance_arc); } - instance.dead = true; + instance.closed = true; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index d1efd4985..47b8647ba 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -116,13 +116,14 @@ pub struct TaInstance { /// after initialization because it contains internal state that may not survive moves. pub loaded_program: alloc::boxed::Box, /// The task page table ID associated with this TA instance. Valid only - /// while `dead == false`. + /// while `closed == false`. pub task_page_table_id: usize, - /// Set when the TA panics or its last session closes. Any lock holder - /// should first check whether `dead == true` and if it is, bail without - /// touching `task_page_table_id`. `shim` and `loaded_program` remain - /// valid until the last `Arc` is dropped. - pub dead: bool, + /// Set when the TA is committed to teardown because it panicked or its last + /// session closed. Any lock holder should first check whether + /// `closed == true` and if it is, bail without touching + /// `task_page_table_id`. `shim` and `loaded_program` remain valid until + /// the last `Arc` is dropped. + pub closed: bool, } // SAFETY: TaInstance is protected by SpinMutex and try_lock (`SessionEntry`) @@ -248,6 +249,7 @@ impl SingleInstanceCache { self.inner.lock().remove(uuid) } + /// Remove a cached single-instance TA only if it is the expected instance. fn remove_if_same(&self, uuid: &TeeUuid, expected: &Arc>) -> bool { let mut guard = self.inner.lock(); match guard.get(uuid) { From 658a6c2b939cf4070173e702f2d76ecc8c0a200e Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Thu, 14 May 2026 21:19:09 +0000 Subject: [PATCH 09/12] revise --- litebox_runner_lvbs/src/lib.rs | 27 +++++++++++++++++---------- litebox_shim_optee/src/session.rs | 10 +++++----- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index e3933d470..4903b4f7f 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -530,9 +530,10 @@ fn handle_open_session( /// Outcome of [`open_session_single_instance`]. enum OpenSessionOutcome { - /// Session was successfully opened (or a TA-level error was written back). + /// Session was successfully opened, TA returned a non-fatal error, or TA panicked + /// and the instance was destroyed inline. No extra cleanup effort is needed. Done, - /// The cached `TaInstance` is closed and must not be entered. + /// The cached `TaInstance` is `closed` and must not be entered. InstanceDestroyed, } @@ -543,6 +544,8 @@ enum OpenSessionOutcome { /// Returns [`OpenSessionOutcome::InstanceDestroyed`] if the cached TA is closed. /// /// If the TA's OpenSession entry point returns an error, the session is not registered. +/// On TARGET_DEAD the cached instance is destroyed unconditionally; any sibling sessions +/// become orphans that fail-fast on next access via the `instance.closed` check. /// For cleanup semantics, see OP-TEE OS `tee_ta_open_session()` in `tee_ta_manager.c`. #[allow(clippy::type_complexity)] fn open_session_single_instance( @@ -893,16 +896,16 @@ fn open_session_new_instance( closed: false, })); - // Cache single-instance TAs for future sessions - if ta_flags.is_single_instance() { - session_manager().cache_single_instance(ta_uuid, instance.clone()); - } - // Disarm the guard: ownership transfers to session manager via register_session. // Safe to unwrap: guard has not been disarmed yet. let runner_session_id = session_id_guard.disarm().unwrap(); session_manager().register_session(runner_session_id, instance.clone(), ta_uuid, ta_flags); + // Cache single-instance TAs only after the opening session owns the instance. + if ta_flags.is_single_instance() { + session_manager().cache_single_instance(ta_uuid, instance.clone()); + } + // Write success response back to normal world write_msg_args_to_normal_world( msg_args, @@ -1019,7 +1022,9 @@ fn handle_invoke_command( let ta_uuid = session_entry.ta_uuid; let ta_flags = session_entry.ta_flags; - // Remove this session from the map + // Remove this session from the map. Sibling sessions on the same + // single-instance TA will be cleaned up lazily on their next + // invoke/close via the `instance.closed` check. session_manager().unregister_session(session_id); // Write response BEFORE switching page tables (accesses user memory) @@ -1035,8 +1040,9 @@ fn handle_invoke_command( // Clear single-instance cache so new OpenSessions for this UUID // create a fresh instance instead of hitting the zombie one. if ta_flags.is_single_instance() { - let _ = + let removed = session_manager().remove_single_instance_if_same(&ta_uuid, &session_entry.instance); + debug_assert!(removed, "single-instance cache entry unexpectedly missing"); } instance.closed = true; @@ -1180,8 +1186,9 @@ fn handle_close_session( // Clear single-instance cache if this was a single-instance TA if entry.ta_flags.is_single_instance() { - let _ = + let removed = session_manager().remove_single_instance_if_same(&entry.ta_uuid, &instance_arc); + debug_assert!(removed, "single-instance cache entry unexpectedly missing"); } instance.closed = true; diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 47b8647ba..949f41b66 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -118,11 +118,11 @@ pub struct TaInstance { /// The task page table ID associated with this TA instance. Valid only /// while `closed == false`. pub task_page_table_id: usize, - /// Set when the TA is committed to teardown because it panicked or its last - /// session closed. Any lock holder should first check whether - /// `closed == true` and if it is, bail without touching - /// `task_page_table_id`. `shim` and `loaded_program` remain valid until - /// the last `Arc` is dropped. + /// Set when the TA is committed to teardown (panic or last session closed). Any lock + /// holders should check `closed` before touching `task_page_table_id` and bail if true. + /// + /// The per-instance lock must be held when setting `closed = true` and across + /// the subsequent `teardown_ta_page_table`. pub closed: bool, } From 9b02d155087e1e554a8461687753abe0127151ac Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Thu, 14 May 2026 23:08:13 +0000 Subject: [PATCH 10/12] clean up and deduplicate --- litebox_runner_lvbs/src/lib.rs | 156 +++++++++++++++--------------- litebox_shim_optee/src/session.rs | 5 - 2 files changed, 77 insertions(+), 84 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 4903b4f7f..9d608292f 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -515,7 +515,7 @@ fn handle_open_session( ta_uuid, &ta_req_info, )? { - OpenSessionOutcome::Done => Ok(()), + OpenSessionOutcome::Handled => Ok(()), OpenSessionOutcome::InstanceDestroyed => { // Evict the zombie. Analog of OP-TEE's `maybe_release_ta_ctx` // removing the dead ctx from `tee_ctxes`. @@ -532,7 +532,7 @@ fn handle_open_session( enum OpenSessionOutcome { /// Session was successfully opened, TA returned a non-fatal error, or TA panicked /// and the instance was destroyed inline. No extra cleanup effort is needed. - Done, + Handled, /// The cached `TaInstance` is `closed` and must not be entered. InstanceDestroyed, } @@ -632,24 +632,24 @@ fn open_session_single_instance( return_code ); + // Write error response BEFORE switching page tables (accesses user memory). + // Keep the instance lock held until this completes so another core cannot + // tear down the active page table while this core is copying TA outputs. + let write_result = write_msg_args_to_normal_world( + msg_args, + msg_args_phys_addr, + return_code, + None, // No session ID on failure + Some(&ta_params), + Some(ta_req_info), + ); + // For single-instance TAs, only clean up on TARGET_DEAD (panic). // Regular errors (access denied, bad params, etc.) don't mean the TA is dead - // it can still serve future OpenSession requests from other clients. if return_code == TeeResult::TargetDead { debug_serial_println!("Single-instance TA panicked during OpenSession, cleaning up"); - // Write error response BEFORE switching page tables (accesses user memory). - // Keep the instance lock held until this completes so another core cannot - // tear down the active page table while this core is copying TA outputs. - let write_result = write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - return_code, - None, // No session ID on failure - Some(&ta_params), - Some(ta_req_info), - ); - let _ = session_manager().remove_single_instance_if_same(&ta_uuid, &instance_arc); instance.closed = true; @@ -657,44 +657,46 @@ fn open_session_single_instance( // no references to user-space memory will be held afterwards. unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; - drop(instance); - // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just // cleaning it up. Currently we always clean up on panic. - - write_result?; - return Ok(OpenSessionOutcome::Done); } - // Write error response back to normal world - write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - return_code, - None, // No session ID on failure - Some(&ta_params), - Some(ta_req_info), - )?; - drop(instance); - - return Ok(OpenSessionOutcome::Done); + write_result?; + return Ok(OpenSessionOutcome::Handled); } - // Success: register session and disarm the guard (ownership transfers to session map) - // Safe to unwrap: guard has not been disarmed yet. - let runner_session_id = session_id_guard.disarm().unwrap(); - session_manager().register_session(runner_session_id, instance_arc.clone(), ta_uuid, ta_flags); - - write_msg_args_to_normal_world( + // Treat write-back failure as OpenSession failure: do not publish the session. + let runner_session_id = session_id_guard.id().unwrap(); + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, return_code, Some(runner_session_id), Some(&ta_params), Some(ta_req_info), - )?; + ); + + if write_result.is_err() + && !ta_flags.is_keep_alive() + && session_manager() + .sessions() + .count_sessions_for_instance(&instance_arc) + == 0 + { + let _ = session_manager().remove_single_instance_if_same(&ta_uuid, &instance_arc); + instance.closed = true; + + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; + } + write_result?; + + // Success: register session and disarm the guard (ownership transfers to session map) + session_manager().register_session(runner_session_id, instance_arc.clone(), ta_uuid, ta_flags); + session_id_guard.disarm(); drop(instance); @@ -703,7 +705,7 @@ fn open_session_single_instance( runner_session_id ); - Ok(OpenSessionOutcome::Done) + Ok(OpenSessionOutcome::Handled) } /// Create a new TA instance for a session. @@ -888,6 +890,25 @@ fn open_session_new_instance( return Ok(()); } + // Write back BEFORE publishing the instance. If the write fails, the + // session is neither registered nor cached, so we just tear down the + // local resources and let `session_id_guard` recycle the ID on drop. + // Safe to unwrap: guard has not been disarmed yet. + let runner_session_id = session_id_guard.id().unwrap(); + write_msg_args_to_normal_world( + msg_args, + msg_args_phys_addr, + return_code, + Some(runner_session_id), + Some(&ta_params), + Some(ta_req_info), + ) + .inspect_err(|_| { + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + })?; + // Success: create TA instance - loaded_program is already boxed, no move happens let instance = Arc::new(SpinMutex::new(TaInstance { shim, @@ -896,9 +917,8 @@ fn open_session_new_instance( closed: false, })); - // Disarm the guard: ownership transfers to session manager via register_session. - // Safe to unwrap: guard has not been disarmed yet. - let runner_session_id = session_id_guard.disarm().unwrap(); + // Ownership of the session ID transfers to the session manager. + session_id_guard.disarm(); session_manager().register_session(runner_session_id, instance.clone(), ta_uuid, ta_flags); // Cache single-instance TAs only after the opening session owns the instance. @@ -906,16 +926,6 @@ fn open_session_new_instance( session_manager().cache_single_instance(ta_uuid, instance.clone()); } - // Write success response back to normal world - write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - return_code, - Some(runner_session_id), - Some(&ta_params), - Some(ta_req_info), - )?; - debug_serial_println!( "OpenSession complete: session_id={}, single_instance={}", runner_session_id, @@ -1010,6 +1020,18 @@ fn handle_invoke_command( let return_code: u32 = ctx.rax.truncate(); let return_code = TeeResult::try_from(return_code).unwrap_or(TeeResult::GenericError); + // Write response BEFORE switching page tables (accesses user memory). + // Keep the instance lock held until this completes so another core cannot + // tear down the active page table while this core is copying TA outputs. + let write_result = write_msg_args_to_normal_world( + msg_args, + msg_args_phys_addr, + return_code, + None, + Some(&ta_params), + Some(&ta_req_info), + ); + // Per OP-TEE OS: if TA panics (TARGET_DEAD), the TA context is // unrecoverable; all sessions on the same single-instance TA are // implicitly dead (Ref: tee_ta_invoke_command() in tee_ta_manager.c). @@ -1027,22 +1049,11 @@ fn handle_invoke_command( // invoke/close via the `instance.closed` check. session_manager().unregister_session(session_id); - // Write response BEFORE switching page tables (accesses user memory) - let write_result = write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - return_code, - None, - Some(&ta_params), - Some(&ta_req_info), - ); - // Clear single-instance cache so new OpenSessions for this UUID // create a fresh instance instead of hitting the zombie one. if ta_flags.is_single_instance() { - let removed = + let _ = session_manager().remove_single_instance_if_same(&ta_uuid, &session_entry.instance); - debug_assert!(removed, "single-instance cache entry unexpectedly missing"); } instance.closed = true; @@ -1062,21 +1073,9 @@ fn handle_invoke_command( // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just // cleaning it up. Currently we always clean up on panic. - - write_result?; - return Ok(()); } - write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - return_code, - None, - Some(&ta_params), - Some(&ta_req_info), - )?; - - Ok(()) + write_result } /// Handle CloseSession command. @@ -1186,9 +1185,8 @@ fn handle_close_session( // Clear single-instance cache if this was a single-instance TA if entry.ta_flags.is_single_instance() { - let removed = + let _ = session_manager().remove_single_instance_if_same(&entry.ta_uuid, &instance_arc); - debug_assert!(removed, "single-instance cache entry unexpectedly missing"); } instance.closed = true; diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 949f41b66..0e98f633d 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -445,11 +445,6 @@ impl SessionManager { entry } - /// Remove a single-instance TA from the cache. - pub fn remove_single_instance(&self, uuid: &TeeUuid) -> Option>> { - self.single_instance_cache.remove(uuid) - } - /// Remove a single-instance TA from the cache only if the currently /// cached `Arc` is the same as `expected`. pub fn remove_single_instance_if_same( From 2f4c699ed52883051110229374d6caf9896d97d6 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Fri, 15 May 2026 21:14:38 +0000 Subject: [PATCH 11/12] nits --- litebox_runner_lvbs/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 9d608292f..e69c8b246 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -917,9 +917,9 @@ fn open_session_new_instance( closed: false, })); - // Ownership of the session ID transfers to the session manager. - session_id_guard.disarm(); + // Success: register session and disarm the guard (ownership transfers to session map) session_manager().register_session(runner_session_id, instance.clone(), ta_uuid, ta_flags); + session_id_guard.disarm(); // Cache single-instance TAs only after the opening session owns the instance. if ta_flags.is_single_instance() { From 36c5089e5f8e25d3262b2853f15a06d6233e8c45 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Fri, 15 May 2026 23:21:58 +0000 Subject: [PATCH 12/12] remove dead code and add clarification --- litebox_runner_lvbs/src/lib.rs | 38 ++++++++++++++++++++----------- litebox_shim_optee/src/session.rs | 5 ---- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index e69c8b246..ac2fb47a2 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -678,21 +678,33 @@ fn open_session_single_instance( Some(ta_req_info), ); - if write_result.is_err() - && !ta_flags.is_keep_alive() - && session_manager() - .sessions() - .count_sessions_for_instance(&instance_arc) - == 0 - { - let _ = session_manager().remove_single_instance_if_same(&ta_uuid, &instance_arc); - instance.closed = true; + // Write-back failure: OpenSession succeeded inside the TA, but we cannot + // deliver the session id to the normal world, so it will never issue a + // matching CloseSession. For a non-keep-alive instance with no siblings + // we tear the whole instance down, reclaiming the TA-side state, and the + // session id can be recycled normally. For keep-alive or shared + // instances the TA still holds session-local state tagged with this id, + // so we forget the id (disarm the guard) to prevent a future OpenSession + // from reusing it and colliding with the orphaned TA-side bookkeeping. + if let Err(e) = write_result { + if !ta_flags.is_keep_alive() + && session_manager() + .sessions() + .count_sessions_for_instance(&instance_arc) + == 0 + { + let _ = session_manager().remove_single_instance_if_same(&ta_uuid, &instance_arc); + instance.closed = true; - // Safety: We are about to tear down this TA instance; - // no references to user-space memory will be held afterwards. - unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; + } else { + let _ = session_id_guard.disarm(); + } + drop(instance); + return Err(e); } - write_result?; // Success: register session and disarm the guard (ownership transfers to session map) session_manager().register_session(runner_session_id, instance_arc.clone(), ta_uuid, ta_flags); diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 0e98f633d..5d21b404a 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -244,11 +244,6 @@ impl SingleInstanceCache { self.inner.lock().insert(uuid, instance); } - /// Remove a cached single-instance TA by UUID. - pub fn remove(&self, uuid: &TeeUuid) -> Option>> { - self.inner.lock().remove(uuid) - } - /// Remove a cached single-instance TA only if it is the expected instance. fn remove_if_same(&self, uuid: &TeeUuid, expected: &Arc>) -> bool { let mut guard = self.inner.lock();