From 9a06387d5e77e6062a0ea7cf680cbd7a50958316 Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 29 Apr 2026 11:21:19 +0800 Subject: [PATCH 01/28] feat(callable): per-callable_id orch SO dispatch on AICPU (a2a3 trb) Foundation for the callable.md design: lift each-run dlclose+dlopen on AICPU (caused by alternating callables) to a one-time-per-callable_id load. Adds active_callable_id_/register_new_callable_id_ to the Runtime struct and a 64-slot orch_so_table_ on the AICPU executor. active_callable_id_ < 0 keeps the legacy single-slot path (governed by has_new_orch_so_) untouched, so existing run_runtime() callers and all six other variants continue to work without changes. Verified: - tests/ut/py/test_chip_worker.py: 12/12 pass on a2a3sim - examples/.../vector_example: pass on a2a3sim --- .../aicpu/aicpu_executor.cpp | 141 ++++++++++++------ .../runtime/runtime.h | 16 ++ .../runtime/shared/runtime.cpp | 11 ++ 3 files changed, 124 insertions(+), 44 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index ab795b6f8..e7164b78e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -89,6 +89,24 @@ static int32_t read_pto2_runtime_status(Runtime *runtime) { static PTO2Runtime *rt{nullptr}; +// Per-callable_id orchestration SO table. AICPU side of the callable.md +// design: when `runtime->active_callable_id_ >= 0` the executor dispatches +// to `orch_so_table_[callable_id]` (created on first sighting of that +// callable_id, kept warm across runs); when `active_callable_id_ < 0` it +// falls back to the legacy single slot governed by `has_new_orch_so_`. +// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values +// (mailbox uint32 callable_id, register() returns small ints). +static constexpr int32_t MAX_REGISTERED_CALLABLE_IDS = 64; + +struct OrchSoEntry { + bool in_use{false}; + void *handle{nullptr}; + char path[256]{}; + DeviceOrchestrationFunc func{nullptr}; + DeviceOrchestrationBindRuntimeFunc bind{nullptr}; + DeviceOrchestrationConfigFunc config_func{nullptr}; +}; + struct AicpuExecutor { int32_t sched_thread_num_; bool orch_to_sched_{false}; @@ -107,9 +125,9 @@ struct AicpuExecutor { std::atomic finished_count_{0}; std::atomic runtime_init_ready_{false}; - // Orchestration SO handle - defer dlclose until all tasks complete + // Legacy single-slot orch SO cache (active_callable_id_ == -1 path). void *orch_so_handle_{nullptr}; - char orch_so_path_[256]{}; // Path to orchestration SO file for cleanup + char orch_so_path_[256]{}; // Shared orchestration function pointer (loaded by first orch thread, used by all) DeviceOrchestrationFunc orch_func_{nullptr}; @@ -117,6 +135,11 @@ struct AicpuExecutor { DeviceOrchestrationConfigFunc orch_config_func_{nullptr}; const ChipStorageTaskArgs *orch_args_cached_{nullptr}; + // Per-callable_id table (active_callable_id_ >= 0 path). Single orch thread today, so + // first-write/read race is not possible; if multiple orch threads are + // ever introduced, guard the in_use=false→true transition with a mutex. + OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]; + // ===== Scheduler context (owns all dispatch/completion/drain state) ===== SchedulerContext sched_ctx_; @@ -126,8 +149,9 @@ struct AicpuExecutor { void deinit(Runtime *runtime); ~AicpuExecutor() { - // Process-wide teardown (the single static instance dies here). 
The - // handle is otherwise kept alive across runs for cache-hit reuse. + // Process-wide teardown (the single static instance dies here). Both + // the legacy slot and every in-use callable_id slot are dlclose()'d here; + // each is otherwise kept alive across runs for cache-hit reuse. if (orch_so_handle_ != nullptr) { dlclose(orch_so_handle_); orch_so_handle_ = nullptr; @@ -136,6 +160,12 @@ struct AicpuExecutor { unlink(orch_so_path_); orch_so_path_[0] = '\0'; } + for (auto &e : orch_so_table_) { + if (!e.in_use) continue; + if (e.handle != nullptr) dlclose(e.handle); + if (e.path[0] != '\0') unlink(e.path); + e = OrchSoEntry{}; + } } }; @@ -197,29 +227,44 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (runtime->get_orch_built_on_host()) { LOG_INFO_V0("Thread %d: Host orchestration mode, no-op", thread_idx); } else { - // Two paths: - // 1) has_new_orch_so == true → host believes the SO identity - // changed, so we drop the cached handle (if any), write the - // new bytes to disk, and dlopen + dlsym a fresh handle. - // 2) has_new_orch_so == false → host detected a cache hit, so - // we reuse `orch_so_handle_` / `orch_func_` / `orch_bind_runtime_` - // from the previous run untouched. sm_handle / rt below are - // always recreated because they bind this run's memory. - const bool reload_so = runtime->has_new_orch_so(); + // Per-callable_id dispatch (callable.md): when active_callable_id_ >= 0 the orch + // SO state lives in `orch_so_table_[callable_id]` keyed by registration + // order; reload is governed by `register_new_callable_id_`. When + // active_callable_id_ < 0 we fall back to the legacy single-slot cache + // governed by `has_new_orch_so_`. The local pointers below let + // the rest of this branch ignore the choice. + const int32_t callable_id = runtime->get_active_callable_id(); + const bool use_table = (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS); + if (callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + DEV_ERROR( + "Thread %d: callable_id %d exceeds MAX_REGISTERED_CALLABLE_IDS=%d", thread_idx, callable_id, + MAX_REGISTERED_CALLABLE_IDS + ); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + void **p_handle = use_table ? &orch_so_table_[callable_id].handle : &orch_so_handle_; + char *p_path = use_table ? orch_so_table_[callable_id].path : orch_so_path_; + DeviceOrchestrationFunc *p_func = use_table ? &orch_so_table_[callable_id].func : &orch_func_; + DeviceOrchestrationBindRuntimeFunc *p_bind = + use_table ? &orch_so_table_[callable_id].bind : &orch_bind_runtime_; + DeviceOrchestrationConfigFunc *p_config_func = + use_table ? &orch_so_table_[callable_id].config_func : &orch_config_func_; + const bool reload_so = use_table ? runtime->register_new_callable_id() : runtime->has_new_orch_so(); if (reload_so) { - LOG_INFO_V0("Thread %d: New orch SO detected, (re)loading", thread_idx); - if (orch_so_handle_ != nullptr) { - dlclose(orch_so_handle_); - orch_so_handle_ = nullptr; - orch_func_ = nullptr; - orch_bind_runtime_ = nullptr; - if (orch_so_path_[0] != '\0') { + LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id); + if (*p_handle != nullptr) { + dlclose(*p_handle); + *p_handle = nullptr; + *p_func = nullptr; + *p_bind = nullptr; + if (p_path[0] != '\0') { // Unlink the old file so the new open() lands on a // fresh inode — protects against SIGBUS / ETXTBSY when // the kernel still has the old mapping pinned. 
-                    unlink(orch_so_path_);
-                    orch_so_path_[0] = '\0';
+                    unlink(p_path);
+                    p_path[0] = '\0';
                 }
             }
 
@@ -333,15 +378,22 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                     bind_runtime_func = nullptr;
                 }
 
-                orch_so_handle_ = handle;
-                orch_func_ = orch_func;
-                orch_bind_runtime_ = bind_runtime_func;
-                orch_config_func_ = config_func;
-                snprintf(orch_so_path_, sizeof(orch_so_path_), "%s", so_path);
+                *p_handle = handle;
+                *p_func = orch_func;
+                *p_bind = bind_runtime_func;
+                *p_config_func = config_func;
+                snprintf(p_path, 256, "%s", so_path);
+                if (use_table) orch_so_table_[callable_id].in_use = true;
             } else {
-                LOG_INFO_V0("Thread %d: Reusing cached orch SO handle=%p", thread_idx, orch_so_handle_);
-                if (orch_so_handle_ == nullptr || orch_func_ == nullptr) {
-                    LOG_ERROR("Thread %d: has_new_orch_so=false but no cached SO handle/func", thread_idx);
+                LOG_INFO_V0(
+                    "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle,
+                    callable_id
+                );
+                if (*p_handle == nullptr || *p_func == nullptr) {
+                    LOG_ERROR(
+                        "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx,
+                        callable_id
+                    );
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -349,8 +401,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             }
 
             // Validate arg count on every run (reload or cache hit).
-            if (orch_config_func_ != nullptr) {
-                PTO2OrchestrationConfig cfg = orch_config_func_(runtime->get_orch_args());
+            if (*p_config_func != nullptr) {
+                PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args());
                 LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count);
                 if (cfg.expected_arg_count > 0) {
                     const ChipStorageTaskArgs &args_validate = runtime->get_orch_args();

                         cfg.expected_arg_count
                     );
                     // Clean up cached state so a subsequent run does a full reload.
-                    if (orch_so_handle_ != nullptr) {
-                        dlclose(orch_so_handle_);
-                        orch_so_handle_ = nullptr;
+                    if (*p_handle != nullptr) {
+                        dlclose(*p_handle);
+                        *p_handle = nullptr;
                     }
-                    if (orch_so_path_[0] != '\0') {
-                        unlink(orch_so_path_);
-                        orch_so_path_[0] = '\0';
+                    if (p_path[0] != '\0') {
+                        unlink(p_path);
+                        p_path[0] = '\0';
                     }
-                    orch_func_ = nullptr;
-                    orch_bind_runtime_ = nullptr;
-                    orch_config_func_ = nullptr;
+                    *p_func = nullptr;
+                    *p_bind = nullptr;
+                    *p_config_func = nullptr;
+                    if (use_table) orch_so_table_[callable_id].in_use = false;
                     // Unblock scheduler threads before returning so they don't spin forever.
runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -473,11 +526,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) { orch_cycle_start = get_sys_cnt_aicpu(); #endif framework_bind_runtime(rt); - if (orch_bind_runtime_ != nullptr) { - orch_bind_runtime_(rt); + if (*p_bind != nullptr) { + (*p_bind)(rt); } rt_scope_begin(rt); - orch_func_(*orch_args_cached_); + (*p_func)(*orch_args_cached_); rt_scope_end(rt); #if PTO2_PROFILING uint64_t orch_cycle_end = get_sys_cnt_aicpu(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 105f1601f..5cdf3e1f4 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -195,6 +195,16 @@ class Runtime { uint64_t dev_orch_so_addr_; uint64_t dev_orch_so_size_; bool has_new_orch_so_; + // Per-callable_id dispatch (callable.md design). When + // `active_callable_id_ >= 0`, AICPU dispatches via + // `orch_so_table_[active_callable_id_]` instead of the legacy single-slot + // cache; `register_new_callable_id_` then signals whether the host is + // delivering a freshly-registered callable_id (write+dlopen) or reusing an + // already-loaded one. `active_callable_id_ == -1` keeps the legacy fast + // path (run_runtime() compatibility shim) — has_new_orch_so_ governs + // reload. + int32_t active_callable_id_; + bool register_new_callable_id_; char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; @@ -251,6 +261,12 @@ class Runtime { uint64_t get_dev_orch_so_addr() const; uint64_t get_dev_orch_so_size() const; bool has_new_orch_so() const; + // Per-callable_id dispatch (callable.md). callable_id < 0 disables and + // falls back to the legacy single-slot orch SO cache governed by + // has_new_orch_so_. 
+ void set_active_callable_id(int32_t callable_id, bool is_new); + int32_t get_active_callable_id() const; + bool register_new_callable_id() const; void set_device_orch_func_name(const char *name); const char *get_device_orch_func_name() const; void set_device_orch_config_name(const char *name); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 68d374e32..abfffd9aa 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -53,6 +53,8 @@ Runtime::Runtime() { dev_orch_so_addr_ = 0; dev_orch_so_size_ = 0; has_new_orch_so_ = false; + active_callable_id_ = -1; + register_new_callable_id_ = false; device_orch_func_name_[0] = '\0'; device_orch_config_name_[0] = '\0'; @@ -115,6 +117,15 @@ uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; } bool Runtime::has_new_orch_so() const { return has_new_orch_so_; } +void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) { + active_callable_id_ = callable_id; + register_new_callable_id_ = is_new; +} + +int32_t Runtime::get_active_callable_id() const { return active_callable_id_; } + +bool Runtime::register_new_callable_id() const { return register_new_callable_id_; } + void Runtime::set_device_orch_func_name(const char *name) { if (name == nullptr) { device_orch_func_name_[0] = '\0'; From d931ec226234597047755f75bc4dcca9386c071c Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 29 Apr 2026 14:27:56 +0800 Subject: [PATCH 02/28] Add: prepare_callable / run_prepared / unregister_callable C ABI Implement Layer 3 of the per-callable_id dispatch protocol described in docs/callable.md. Splits the legacy run_runtime path into a one-time prepare phase (uploads orch SO + kernels, builds the per-cid metadata) and a per-call run phase (binds cached state to a fresh Runtime, then launches without re-uploading bytes). - Extract prepare_callable_impl / bind_prepared_to_runtime_impl out of init_runtime_impl in trb runtime_maker.cpp so the c_api layer can drive the prepare/run split independently. - DeviceRunner (onboard + sim) gains prepared_callables_ keyed by callable_id, an orch_so_dedup_ table that refcounts identical SO bytes by Build-ID hash, and aicpu_seen_callable_ids_ to drive register_new_callable_id_ on first sighting per cid. - prepare_orch_so resolves the active callable_id when present and short-circuits the H2D upload to the cached buffer; legacy callers with cid<0 still take the original pending_orch_so path. - New ABI exported from pto_runtime_c_api.{h,cpp} on both platforms. Variants without callable.md support (host_build_graph, aicpu_build_graph) export stubs that return -1, gated by RUNTIME_HAS_CALLABLE_ID defined only in the trb runtime.h, so the shared device_runner.cpp compiles cleanly across all six variants. 
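For orientation, the intended host-side call shape under the new ABI (a
sketch only — `ctx`, `rt_buf`, `callable_blob`, `args_blob` and the executor
binaries stand in for what existing run_runtime callers already hold; error
handling trimmed):

    // One-time prep per callable_id, then N cheap launches.
    int rc = prepare_callable(ctx, /*callable_id=*/0, callable_blob, device_id,
                              aicpu_bin, aicpu_len, aicore_bin, aicore_len);
    // Each launch skips kernel upload + orch SO H2D; only per-run args,
    // GM heap and shared memory are rebuilt.
    for (int i = 0; rc == 0 && i < num_runs; ++i) {
        rc = run_prepared(ctx, rt_buf, /*callable_id=*/0, args_blob, block_dim,
                          aicpu_thread_num, device_id, aicpu_bin, aicpu_len,
                          aicore_bin, aicore_len, /*enable_l2_swimlane=*/0,
                          /*enable_dump_tensor=*/0, /*enable_pmu=*/0,
                          /*output_prefix=*/nullptr);
    }
    // Release the per-id orch SO share before finalize_device().
    unregister_callable(ctx, /*callable_id=*/0);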
--- .../platform/onboard/host/device_runner.cpp | 158 ++++++++++++++++ .../platform/onboard/host/device_runner.h | 89 +++++++++ .../onboard/host/pto_runtime_c_api.cpp | 176 ++++++++++++++++++ src/a2a3/platform/sim/host/device_runner.cpp | 138 ++++++++++++++ src/a2a3/platform/sim/host/device_runner.h | 32 ++++ .../platform/sim/host/pto_runtime_c_api.cpp | 162 ++++++++++++++++ .../host/runtime_maker.cpp | 80 +++++--- .../runtime/runtime.h | 6 + src/common/worker/pto_runtime_c_api.h | 61 ++++++ 9 files changed, 875 insertions(+), 27 deletions(-) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index b43e5c5d7..acee84a1b 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -746,6 +746,36 @@ void DeviceRunner::print_handshake_results() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { +#ifdef RUNTIME_HAS_CALLABLE_ID + // Per-callable_id path (callable.md): when run_prepared bound a known + // callable_id, the SO bytes were already H2D'd at prepare_callable time. + // We just stamp dev_orch_so on the runtime, plus mark `is_new` based on + // whether the AICPU has seen this id since registration. + const int32_t cid = runtime.get_active_callable_id(); + if (cid >= 0) { + auto it = prepared_callables_.find(cid); + if (it == prepared_callables_.end()) { + LOG_ERROR("prepare_orch_so: callable_id=%d not registered", cid); + return -1; + } + const auto &state = it->second; + const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size, first_sighting); + // The c_api caller passed is_new=false; refresh with the authoritative + // first_sighting flag before AICPU consumes register_new_callable_id_. + runtime.set_active_callable_id(cid, first_sighting); + // Pending fields must be empty in the prepared path — runtime_maker's + // bind_prepared_to_runtime_impl never stages them. Defensive clear: + runtime.pending_orch_so_data_ = nullptr; + runtime.pending_orch_so_size_ = 0; + LOG_INFO_V0( + "Orch SO prepared cid=%d hash=0x%lx %zu bytes (is_new=%d)", cid, state.hash, state.dev_orch_so_size, + first_sighting ? 1 : 0 + ); + return 0; + } +#endif // RUNTIME_HAS_CALLABLE_ID + const void *host_so_data = runtime.pending_orch_so_data_; const size_t host_so_size = runtime.pending_orch_so_size_; runtime.pending_orch_so_data_ = nullptr; @@ -802,6 +832,120 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return 0; } +#ifdef RUNTIME_HAS_CALLABLE_ID +int DeviceRunner::register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, + std::vector> kernel_addrs +) { + if (callable_id < 0) { + LOG_ERROR("register_prepared_callable: negative callable_id=%d", callable_id); + return -1; + } + if (orch_so_data == nullptr || orch_so_size == 0) { + LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id); + return -1; + } + + const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size); + + // Hash dedup: share device buffer across callable_ids that carry the same + // SO bytes. Refcount drops in unregister_prepared_callable; we only free + // when the count hits zero. 
+ auto buf_it = orch_so_dedup_.find(hash); + uint64_t dev_addr = 0; + if (buf_it == orch_so_dedup_.end()) { + void *buf = mem_alloc_.alloc(orch_so_size); + if (buf == nullptr) { + LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size); + return -1; + } + int rc = rtMemcpy(buf, orch_so_size, orch_so_data, orch_so_size, RT_MEMCPY_HOST_TO_DEVICE); + if (rc != 0) { + LOG_ERROR("register_prepared_callable: rtMemcpy failed: %d", rc); + mem_alloc_.free(buf); + return rc; + } + OrchSoBuffer entry; + entry.dev_addr = buf; + entry.capacity = orch_so_size; + entry.refcount = 1; + orch_so_dedup_.emplace(hash, entry); + dev_addr = reinterpret_cast(buf); + LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size); + } else { + buf_it->second.refcount++; + dev_addr = reinterpret_cast(buf_it->second.dev_addr); + LOG_INFO_V0( + "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount + ); + } + + PreparedCallableState state; + state.hash = hash; + state.dev_orch_so_addr = dev_addr; + state.dev_orch_so_size = orch_so_size; + state.func_name = (func_name != nullptr) ? func_name : ""; + state.config_name = (config_name != nullptr) ? config_name : ""; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + return 0; +} + +int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + return 0; + } + const uint64_t hash = it->second.hash; + prepared_callables_.erase(it); + aicpu_seen_callable_ids_.erase(callable_id); + + auto buf_it = orch_so_dedup_.find(hash); + if (buf_it != orch_so_dedup_.end()) { + if (--buf_it->second.refcount <= 0) { + mem_alloc_.free(buf_it->second.dev_addr); + orch_so_dedup_.erase(buf_it); + } + } + return 0; +} + +bool DeviceRunner::has_prepared_callable(int32_t callable_id) const { + return prepared_callables_.count(callable_id) != 0; +} + +int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id); + return -1; + } + const auto &state = it->second; + + // Replay kernel addresses directly into runtime->func_id_to_addr_ without + // going through set_function_bin_addr. The latter records func_ids in + // registered_kernel_func_ids_, which validate_runtime_impl iterates to + // free kernel binaries — but prepared kernels must survive across runs. + for (const auto &kv : state.kernel_addrs) { + if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); + return -1; + } + runtime.func_id_to_addr_[kv.first] = kv.second; + } + runtime.set_device_orch_func_name(state.func_name.c_str()); + runtime.set_device_orch_config_name(state.config_name.c_str()); + // Stamp callable_id with is_new=false; prepare_orch_so refreshes the flag + // with the authoritative first_sighting answer right before launch. 
+ runtime.set_active_callable_id(callable_id, /*is_new=*/false); + return 0; +} +#endif // RUNTIME_HAS_CALLABLE_ID + int DeviceRunner::finalize() { if (device_id_ == -1) { return 0; @@ -844,6 +988,20 @@ int DeviceRunner::finalize() { host_orch_so_copy_.clear(); host_orch_so_copy_.shrink_to_fit(); + // Release any prepared-callable orch SO buffers that callers forgot to + // unregister. Refcounts no longer matter at this point — the device is + // about to be reset. +#ifdef RUNTIME_HAS_CALLABLE_ID + for (auto &kv : orch_so_dedup_) { + if (kv.second.dev_addr != nullptr) { + mem_alloc_.free(kv.second.dev_addr); + } + } + orch_so_dedup_.clear(); + prepared_callables_.clear(); + aicpu_seen_callable_ids_.clear(); +#endif // RUNTIME_HAS_CALLABLE_ID + // Cleanup performance profiling if (l2_perf_collector_.is_initialized()) { auto unregister_cb = [](void *dev_ptr, int device_id) -> int { diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 8dd4dc816..cf127a544 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include "common/kernel_args.h" @@ -420,6 +422,65 @@ class DeviceRunner { */ void release_run_context(); +#ifdef RUNTIME_HAS_CALLABLE_ID + /** + * Stage a per-callable_id orchestration SO into device memory and remember + * the supporting metadata (entry/config symbol names, kernel func_id ↔ + * dev_addr table). Identical SO bytes across two callable_ids share one + * device buffer (refcounted by hash) so the worst case for an N-cid pool + * is N distinct device buffers, not N copies of the same SO. + * + * @param callable_id Caller-stable id, must be in [0, MAX_REGISTERED_CALLABLE_IDS). + * @param orch_so_data Host pointer to orchestration SO bytes (owned by caller). + * @param orch_so_size Size of orchestration SO in bytes. + * @param func_name Entry symbol name (copied). + * @param config_name Config symbol name (copied). + * @param kernel_addrs func_id ↔ dev_addr pairs already uploaded by the + * caller. Stored verbatim so run_prepared can replay + * them onto a fresh Runtime without re-uploading. + * @return 0 on success, negative on failure. + */ + int register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, + const char *config_name, std::vector> kernel_addrs + ); + + /** + * Drop the prepared state for `callable_id` and decrement the SO buffer's + * hash-keyed refcount; frees the device buffer when the count hits zero. + * Kernel binaries are shared across callables and only released by + * finalize(). + * + * @param callable_id Id previously passed to register_prepared_callable. + * @return 0 on success or if the id was not registered. + */ + int unregister_prepared_callable(int32_t callable_id); + + /** + * True iff `callable_id` has prepared state staged via + * register_prepared_callable. Lets the c_api layer reject `run_prepared` + * calls without a matching `prepare_callable`. + */ + bool has_prepared_callable(int32_t callable_id) const; + + /** + * Replay the prepared state for `callable_id` onto a freshly-constructed + * Runtime: restores kernel func_id ↔ dev_addr table, the orch entry/config + * symbol names, and stamps `runtime.set_active_callable_id` so the + * subsequent `run` dispatches via the AICPU per-cid table. 
The kernel + * addresses are written directly into func_id_to_addr_ (bypassing + * registered_kernel_func_ids_) so validate_runtime_impl will not free them + * — they survive until unregister_prepared_callable / finalize(). + * + * Marks the cid as seen so the upcoming prepare_orch_so resolves + * `register_new_callable_id_` correctly (true exactly on first sighting + * after registration). + * + * @return 0 on success, -1 if the cid is not registered. + */ + int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); +#endif // RUNTIME_HAS_CALLABLE_ID + private: // Internal state int device_id_{-1}; @@ -451,6 +512,34 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; + // Per-callable_id prepared state (callable.md design). + // + // `prepared_callables_` maps the caller-stable callable_id to the orch + // SO slice + symbol names needed to launch it. `orch_so_dedup_` shares + // device buffers across callable_ids whose orch SO bytes have the same + // ELF Build-ID hash (refcounted; freed when the count hits zero). + // `aicpu_seen_callable_ids_` tracks which ids have already been delivered + // to the AICPU at least once so prepare_orch_so can set + // register_new_callable_id_ correctly on first sighting. +#ifdef RUNTIME_HAS_CALLABLE_ID + struct PreparedCallableState { + uint64_t hash{0}; + uint64_t dev_orch_so_addr{0}; + size_t dev_orch_so_size{0}; + std::string func_name; + std::string config_name; + std::vector> kernel_addrs; + }; + struct OrchSoBuffer { + void *dev_addr{nullptr}; + size_t capacity{0}; + int refcount{0}; + }; + std::unordered_map prepared_callables_; + std::unordered_map orch_so_dedup_; + std::unordered_set aicpu_seen_callable_ids_; +#endif // RUNTIME_HAS_CALLABLE_ID + // ACL lifecycle (process-wide). aclInit must run exactly once; ensure_acl_ready // gates it behind this flag. finalize() drives aclFinalize only if we observed // acl_ready_, so runtimes that never ask for ACL (e.g. pure rt-layer) stay unaffected. 
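The refcount contract documented above, in miniature (a sketch: `runner`,
`so_bytes`/`so_len`, the symbol names and the `kernels_*` tables are
placeholders; this is the sequence the prepared_callable e2e test added
later in this series drives):

    // Byte-identical orch SOs hash to the same Build-ID: one device buffer.
    runner.register_prepared_callable(0, so_bytes, so_len, "orch_entry", "orch_config", kernels_a);
    runner.register_prepared_callable(1, so_bytes, so_len, "orch_entry", "orch_config", kernels_b);
    // orch_so_dedup_ now holds {hash -> refcount 2}.
    runner.unregister_prepared_callable(0);  // refcount 2 -> 1, buffer kept
    runner.unregister_prepared_callable(1);  // refcount 1 -> 0, buffer freed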
diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
index e4b7d3b20..75c467f54 100644
--- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
@@ -40,6 +40,10 @@ extern "C" {
  * Runtime Implementation Functions (defined in runtime_maker.cpp)
  * =========================================================================== */
 int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args);
+#ifdef RUNTIME_HAS_CALLABLE_ID
+int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable);
+int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args);
+#endif
 int validate_runtime_impl(Runtime *runtime);
 
 /* ===========================================================================
@@ -296,4 +300,176 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) {
     runner->set_log_info_v(log_info_v);
 }
 
+/* ===========================================================================
+ * Per-callable_id preparation (callable.md design)
+ * =========================================================================== */
+
+#ifdef RUNTIME_HAS_CALLABLE_ID
+int prepare_callable(
+    DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary,
+    size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size
+) {
+    if (ctx == NULL || callable == NULL) return -1;
+    DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
+
+    // AICPU/AICore executor binaries are only consumed by run()/run_prepared();
+    // prepare_callable just uploads kernel + orch SO state.
+    (void)aicpu_binary;
+    (void)aicpu_size;
+    (void)aicore_binary;
+    (void)aicore_size;
+
+    pthread_once(&g_runner_key_once, create_runner_key);
+    pthread_setspecific(g_runner_key, ctx);
+    auto tsd_guard = RAIIScopeGuard([]() {
+        pthread_setspecific(g_runner_key, nullptr);
+    });
+
+    try {
+        int rc = runner->prepare_run_context(device_id);
+        if (rc != 0) return rc;
+        auto run_context_guard = RAIIScopeGuard([runner]() {
+            runner->release_run_context();
+        });
+
+        // Temp Runtime so prepare_callable_impl can upload kernels via host_api.
+        alignas(Runtime) uint8_t rt_buf[sizeof(Runtime)];
+        Runtime *r = new (rt_buf) Runtime();
+        r->host_api.device_malloc = device_malloc;
+        r->host_api.device_free = device_free;
+        r->host_api.copy_to_device = copy_to_device;
+        r->host_api.copy_from_device = copy_from_device;
+        r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
+        r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;
+
+        rc = prepare_callable_impl(r, reinterpret_cast<const ChipCallable *>(callable));
+        if (rc != 0) {
+            r->~Runtime();
+            return rc;
+        }
+
+        // Extract kernel func_id ↔ dev_addr pairs uploaded by prepare_callable_impl.
+        std::vector<std::pair<int, uint64_t>> kernel_addrs;
+        int kcount = r->get_registered_kernel_count();
+        kernel_addrs.reserve(kcount);
+        for (int i = 0; i < kcount; i++) {
+            int fid = r->get_registered_kernel_func_id(i);
+            kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid));
+        }
+        // Clear registered kernels so the Runtime destructor (or any accidental
+        // validate call) does NOT free the kernel binaries we just uploaded —
+        // they belong to the prepared state now.
+        r->clear_registered_kernels();
+
+        rc = runner->register_prepared_callable(
+            callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(),
+            r->get_device_orch_config_name(), std::move(kernel_addrs)
+        );
+        r->~Runtime();
+        return rc;
+    } catch (...) {
+        return -1;
+    }
+}
+
+int run_prepared(
+    DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
+    int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary,
+    size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix
+) {
+    if (ctx == NULL || runtime == NULL) return -1;
+    DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
+
+    if (!runner->has_prepared_callable(callable_id)) {
+        LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id);
+        return -1;
+    }
+
+    pthread_once(&g_runner_key_once, create_runner_key);
+    pthread_setspecific(g_runner_key, ctx);
+    auto tsd_guard = RAIIScopeGuard([]() {
+        pthread_setspecific(g_runner_key, nullptr);
+    });
+
+    try {
+        int rc = runner->prepare_run_context(device_id);
+        if (rc != 0) return rc;
+        auto run_context_guard = RAIIScopeGuard([runner]() {
+            runner->release_run_context();
+        });
+
+        Runtime *r = new (runtime) Runtime();
+        r->host_api.device_malloc = device_malloc;
+        r->host_api.device_free = device_free;
+        r->host_api.copy_to_device = copy_to_device;
+        r->host_api.copy_from_device = copy_from_device;
+        r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
+        r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;
+
+        // Restore kernel addrs + orch symbol names + active_callable_id
+        rc = runner->bind_prepared_callable_to_runtime(*r, callable_id);
+        if (rc != 0) {
+            r->~Runtime();
+            return rc;
+        }
+
+        // Per-run binding (tensor args, GM heap, SM alloc)
+        rc = bind_prepared_to_runtime_impl(r, reinterpret_cast<const ChipStorageTaskArgs *>(args));
+        if (rc != 0) {
+            r->set_gm_sm_ptr(nullptr);
+            validate_runtime_impl(r);
+            r->~Runtime();
+            return rc;
+        }
+
+        runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0);
+        runner->set_dump_tensor_enabled(enable_dump_tensor != 0);
+        runner->set_pmu_enabled(enable_pmu);
+        runner->set_output_prefix(output_prefix);
+
+        std::vector<uint8_t> aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size);
+        std::vector<uint8_t> aicore_vec(aicore_binary, aicore_binary + aicore_size);
+        rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num);
+        if (rc != 0) {
+            validate_runtime_impl(r);
+            r->~Runtime();
+            return rc;
+        }
+
+        rc = validate_runtime_impl(r);
+        r->~Runtime();
+        return rc;
+    } catch (...) {
+        return -1;
+    }
+}
+
+int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) {
+    if (ctx == NULL) return -1;
+    try {
+        return static_cast<DeviceRunner *>(ctx)->unregister_prepared_callable(callable_id);
+    } catch (...) {
+        return -1;
+    }
+}
+#else // RUNTIME_HAS_CALLABLE_ID
+// Stubs so the dlsym surface is uniform across runtime variants. ChipWorker
+// resolves these unconditionally; variants that lack callable.md support
+// reject the calls at runtime instead of failing to load the library.
+int prepare_callable( + DeviceContextHandle, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t +) { + LOG_ERROR("prepare_callable not supported by this runtime variant"); + return -1; +} +int run_prepared( + DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, + size_t, int, int, int, const char * +) { + LOG_ERROR("run_prepared not supported by this runtime variant"); + return -1; +} +int unregister_callable(DeviceContextHandle, int32_t) { return 0; } +#endif // RUNTIME_HAS_CALLABLE_ID + } // extern "C" diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 26db1e3d6..bf50487f9 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -673,6 +673,31 @@ void DeviceRunner::unload_executor_binaries() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { +#ifdef RUNTIME_HAS_CALLABLE_ID + // Per-callable_id path (callable.md): mirror onboard. Bytes were staged + // at register_prepared_callable time; here we only stamp metadata onto + // the runtime and resolve `register_new_callable_id_` from first sighting. + const int32_t cid = runtime.get_active_callable_id(); + if (cid >= 0) { + auto it = prepared_callables_.find(cid); + if (it == prepared_callables_.end()) { + LOG_ERROR("prepare_orch_so: callable_id=%d not registered", cid); + return -1; + } + const auto &state = it->second; + const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size, first_sighting); + runtime.set_active_callable_id(cid, first_sighting); + runtime.pending_orch_so_data_ = nullptr; + runtime.pending_orch_so_size_ = 0; + LOG_INFO_V0( + "Orch SO prepared cid=%d hash=0x%lx %zu bytes (is_new=%d)", cid, state.hash, state.dev_orch_so_size, + first_sighting ? 1 : 0 + ); + return 0; + } +#endif // RUNTIME_HAS_CALLABLE_ID + const void *host_so_data = runtime.pending_orch_so_data_; const size_t host_so_size = runtime.pending_orch_so_size_; runtime.pending_orch_so_data_ = nullptr; @@ -720,6 +745,107 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return 0; } +#ifdef RUNTIME_HAS_CALLABLE_ID +int DeviceRunner::register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, + std::vector> kernel_addrs +) { + if (callable_id < 0) { + LOG_ERROR("register_prepared_callable: negative callable_id=%d", callable_id); + return -1; + } + if (orch_so_data == nullptr || orch_so_size == 0) { + LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id); + return -1; + } + + const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size); + + auto buf_it = orch_so_dedup_.find(hash); + uint64_t dev_addr = 0; + if (buf_it == orch_so_dedup_.end()) { + void *buf = mem_alloc_.alloc(orch_so_size); + if (buf == nullptr) { + LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size); + return -1; + } + // Sim shares an address space with the simulated AICPU thread, so a + // plain memcpy is the moral equivalent of rtMemcpy on hardware. 
+ std::memcpy(buf, orch_so_data, orch_so_size); + OrchSoBuffer entry; + entry.dev_addr = buf; + entry.capacity = orch_so_size; + entry.refcount = 1; + orch_so_dedup_.emplace(hash, entry); + dev_addr = reinterpret_cast(buf); + LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size); + } else { + buf_it->second.refcount++; + dev_addr = reinterpret_cast(buf_it->second.dev_addr); + LOG_INFO_V0( + "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount + ); + } + + PreparedCallableState state; + state.hash = hash; + state.dev_orch_so_addr = dev_addr; + state.dev_orch_so_size = orch_so_size; + state.func_name = (func_name != nullptr) ? func_name : ""; + state.config_name = (config_name != nullptr) ? config_name : ""; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + return 0; +} + +int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + return 0; + } + const uint64_t hash = it->second.hash; + prepared_callables_.erase(it); + aicpu_seen_callable_ids_.erase(callable_id); + + auto buf_it = orch_so_dedup_.find(hash); + if (buf_it != orch_so_dedup_.end()) { + if (--buf_it->second.refcount <= 0) { + mem_alloc_.free(buf_it->second.dev_addr); + orch_so_dedup_.erase(buf_it); + } + } + return 0; +} + +bool DeviceRunner::has_prepared_callable(int32_t callable_id) const { + return prepared_callables_.count(callable_id) != 0; +} + +int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id); + return -1; + } + const auto &state = it->second; + for (const auto &kv : state.kernel_addrs) { + if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); + return -1; + } + runtime.func_id_to_addr_[kv.first] = kv.second; + } + runtime.set_device_orch_func_name(state.func_name.c_str()); + runtime.set_device_orch_config_name(state.config_name.c_str()); + runtime.set_active_callable_id(callable_id, /*is_new=*/false); + return 0; +} +#endif // RUNTIME_HAS_CALLABLE_ID + int DeviceRunner::finalize() { // Skip if already finalized if (device_id_ == -1 && aicpu_so_handle_ == nullptr && aicore_so_handle_ == nullptr) { @@ -769,6 +895,18 @@ int DeviceRunner::finalize() { host_orch_so_copy_.clear(); host_orch_so_copy_.shrink_to_fit(); + // Release any prepared-callable orch SO buffers callers forgot to drop. 
+#ifdef RUNTIME_HAS_CALLABLE_ID + for (auto &kv : orch_so_dedup_) { + if (kv.second.dev_addr != nullptr) { + mem_alloc_.free(kv.second.dev_addr); + } + } + orch_so_dedup_.clear(); + prepared_callables_.clear(); + aicpu_seen_callable_ids_.clear(); +#endif // RUNTIME_HAS_CALLABLE_ID + // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 210aeb9ba..f14fcc333 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include #include "common/core_type.h" @@ -210,6 +212,16 @@ class DeviceRunner { */ void remove_kernel_binary(int func_id); +#ifdef RUNTIME_HAS_CALLABLE_ID + int register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, + const char *config_name, std::vector> kernel_addrs + ); + int unregister_prepared_callable(int32_t callable_id); + bool has_prepared_callable(int32_t callable_id) const; + int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); +#endif // RUNTIME_HAS_CALLABLE_ID + private: // Configuration int device_id_{-1}; @@ -232,6 +244,26 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; + // Per-callable_id prepared state (callable.md design). Mirrors onboard. +#ifdef RUNTIME_HAS_CALLABLE_ID + struct PreparedCallableState { + uint64_t hash{0}; + uint64_t dev_orch_so_addr{0}; + size_t dev_orch_so_size{0}; + std::string func_name; + std::string config_name; + std::vector> kernel_addrs; + }; + struct OrchSoBuffer { + void *dev_addr{nullptr}; + size_t capacity{0}; + int refcount{0}; + }; + std::unordered_map prepared_callables_; + std::unordered_map orch_so_dedup_; + std::unordered_set aicpu_seen_callable_ids_; +#endif // RUNTIME_HAS_CALLABLE_ID + // AICPU executor SO: load-once, matching onboard's binaries_loaded_ pattern. // The aicpu_executor g_aicpu_executor static lives inside the dlopen'd DSO; // reloading it destroys orch_so_handle_ and breaks the orch-SO cache-hit path. 
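How `register_new_callable_id_` resolves across runs, traced as host-side
pseudocode (a sketch — `prepare_orch_so` is normally driven from the run
path rather than called directly, and `rt`/`rt2` are fresh Runtime objects
built by run_prepared):

    // Run 1 after prepare: cid 0 absent from aicpu_seen_callable_ids_.
    runner.bind_prepared_callable_to_runtime(rt, /*callable_id=*/0);  // stamps is_new=false
    runner.prepare_orch_so(rt);   // first sighting -> set_active_callable_id(0, true)
                                  // AICPU: dlopen into orch_so_table_[0]

    // Run 2+: cid already seen, the flag stays false.
    runner.bind_prepared_callable_to_runtime(rt2, /*callable_id=*/0);
    runner.prepare_orch_so(rt2);  // -> set_active_callable_id(0, false)
                                  // AICPU: reuse orch_so_table_[0].handle, no dlopen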
diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
index b8315b31a..06b3fda71 100644
--- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
@@ -36,6 +36,10 @@ extern "C" {
  * Runtime Implementation Functions (defined in runtime_maker.cpp)
  * =========================================================================== */
 int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args);
+#ifdef RUNTIME_HAS_CALLABLE_ID
+int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable);
+int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args);
+#endif
 int validate_runtime_impl(Runtime *runtime);
 
 /* ===========================================================================
@@ -282,4 +286,162 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) {
     runner->set_log_info_v(log_info_v);
 }
 
+/* ===========================================================================
+ * Per-callable_id preparation (callable.md design)
+ * =========================================================================== */
+
+#ifdef RUNTIME_HAS_CALLABLE_ID
+int prepare_callable(
+    DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary,
+    size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size
+) {
+    if (ctx == NULL || callable == NULL) return -1;
+    DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
+
+    (void)aicpu_binary;
+    (void)aicpu_size;
+    (void)aicore_binary;
+    (void)aicore_size;
+    (void)device_id;
+
+    pthread_once(&g_runner_key_once, create_runner_key);
+    pthread_setspecific(g_runner_key, ctx);
+
+    try {
+        // Temp Runtime so prepare_callable_impl can upload kernels via host_api.
+        alignas(Runtime) uint8_t rt_buf[sizeof(Runtime)];
+        Runtime *r = new (rt_buf) Runtime();
+        r->host_api.device_malloc = device_malloc;
+        r->host_api.device_free = device_free;
+        r->host_api.copy_to_device = copy_to_device;
+        r->host_api.copy_from_device = copy_from_device;
+        r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
+        r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;
+
+        int rc = prepare_callable_impl(r, reinterpret_cast<const ChipCallable *>(callable));
+        if (rc != 0) {
+            r->~Runtime();
+            pthread_setspecific(g_runner_key, nullptr);
+            return rc;
+        }
+
+        std::vector<std::pair<int, uint64_t>> kernel_addrs;
+        int kcount = r->get_registered_kernel_count();
+        kernel_addrs.reserve(kcount);
+        for (int i = 0; i < kcount; i++) {
+            int fid = r->get_registered_kernel_func_id(i);
+            kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid));
+        }
+        r->clear_registered_kernels();
+
+        rc = runner->register_prepared_callable(
+            callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(),
+            r->get_device_orch_config_name(), std::move(kernel_addrs)
+        );
+        r->~Runtime();
+        pthread_setspecific(g_runner_key, nullptr);
+        return rc;
+    } catch (...)
{
+        pthread_setspecific(g_runner_key, nullptr);
+        return -1;
+    }
+}
+
+int run_prepared(
+    DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
+    int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary,
+    size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix
+) {
+    if (ctx == NULL || runtime == NULL) return -1;
+    DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
+
+    if (!runner->has_prepared_callable(callable_id)) {
+        LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id);
+        return -1;
+    }
+
+    pthread_once(&g_runner_key_once, create_runner_key);
+    pthread_setspecific(g_runner_key, ctx);
+
+    try {
+        Runtime *r = new (runtime) Runtime();
+        r->host_api.device_malloc = device_malloc;
+        r->host_api.device_free = device_free;
+        r->host_api.copy_to_device = copy_to_device;
+        r->host_api.copy_from_device = copy_from_device;
+        r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
+        r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;
+
+        int rc = runner->bind_prepared_callable_to_runtime(*r, callable_id);
+        if (rc != 0) {
+            r->~Runtime();
+            pthread_setspecific(g_runner_key, nullptr);
+            return rc;
+        }
+
+        rc = bind_prepared_to_runtime_impl(r, reinterpret_cast<const ChipStorageTaskArgs *>(args));
+        if (rc != 0) {
+            r->set_gm_sm_ptr(nullptr);
+            validate_runtime_impl(r);
+            r->~Runtime();
+            pthread_setspecific(g_runner_key, nullptr);
+            return rc;
+        }
+
+        runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0);
+        runner->set_dump_tensor_enabled(enable_dump_tensor != 0);
+        runner->set_pmu_enabled(enable_pmu);
+        runner->set_output_prefix(output_prefix);
+
+        std::vector<uint8_t> aicpu_vec;
+        std::vector<uint8_t> aicore_vec;
+        if (aicpu_binary != NULL && aicpu_size > 0) {
+            aicpu_vec.assign(aicpu_binary, aicpu_binary + aicpu_size);
+        }
+        if (aicore_binary != NULL && aicore_size > 0) {
+            aicore_vec.assign(aicore_binary, aicore_binary + aicore_size);
+        }
+        rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num);
+        if (rc != 0) {
+            validate_runtime_impl(r);
+            r->~Runtime();
+            pthread_setspecific(g_runner_key, nullptr);
+            return rc;
+        }
+
+        rc = validate_runtime_impl(r);
+        r->~Runtime();
+        pthread_setspecific(g_runner_key, nullptr);
+        return rc;
+    } catch (...) {
+        pthread_setspecific(g_runner_key, nullptr);
+        return -1;
+    }
+}
+
+int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) {
+    if (ctx == NULL) return -1;
+    try {
+        return static_cast<DeviceRunner *>(ctx)->unregister_prepared_callable(callable_id);
+    } catch (...)
{ + return -1; + } +} +#else // RUNTIME_HAS_CALLABLE_ID +int prepare_callable( + DeviceContextHandle, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t +) { + LOG_ERROR("prepare_callable not supported by this runtime variant"); + return -1; +} +int run_prepared( + DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, + size_t, int, int, int, const char * +) { + LOG_ERROR("run_prepared not supported by this runtime variant"); + return -1; +} +int unregister_callable(DeviceContextHandle, int32_t) { return 0; } +#endif // RUNTIME_HAS_CALLABLE_ID + } // extern "C" diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 5b1ca640b..aac9072a2 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -92,31 +92,29 @@ static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader } /** - * Initialize a pre-allocated runtime for device orchestration. + * Stage the per-callable resources (kernel binaries + orchestration SO) into + * the supplied runtime so a subsequent bind_prepared_to_runtime_impl can use + * them. This is the cacheable half of init_runtime_impl: nothing here depends + * on per-run argument values, so callable.md's prepare_callable / run_prepared + * split lets us run this once per callable_id and amortize across runs. * - * For rt2 runtime, orchestration runs on AICPU thread 3 (device-side). - * This function: - * - Copies tensor metadata and replaces host pointers with device pointers - * - Copies all tensor data to device - * - Records all tensors for copy-back - * - Copies orchestration SO to device memory - * - Sets up runtime state for device orchestration - * - * @param runtime Pointer to pre-constructed Runtime - * @param callable ChipCallable containing orch binary, func_name, and child kernels - * @param orch_args Separated tensor/scalar arguments + * @param runtime Pointer to pre-constructed Runtime (host_api populated) + * @param callable ChipCallable carrying the orch SO + child kernel binaries * @return 0 on success, -1 on failure */ -extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - // Validate inputs +extern "C" int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) { if (runtime == nullptr) { LOG_ERROR("Runtime pointer is null"); return -1; } + if (callable == nullptr) { + LOG_ERROR("Callable pointer is null"); + return -1; + } // Register kernel binaries from ChipCallable children if (callable->child_count() > 0) { - LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count()); + LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count()); for (int32_t i = 0; i < callable->child_count(); i++) { int func_id = callable->child_func_id(i); if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { @@ -146,6 +144,32 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, return -1; } + // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume. 
+ runtime->pending_orch_so_data_ = orch_so_binary; + runtime->pending_orch_so_size_ = orch_so_size; + LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size); + return 0; +} + +/** + * Per-run binding: build device-side argument storage (tensor copy-out, GM + * heap, PTO2 shared memory) and publish it to the runtime. Assumes the + * callable-side state (kernel binaries, orch SO bytes, func/config names) + * is already populated by prepare_callable_impl. + * + * Splitting this from prepare_callable_impl matches the callable.md design: + * register/run_prepared invokes this every call, while the prep half runs + * only once per callable_id. + * + * @param runtime Pointer to pre-constructed Runtime (host_api populated) + * @param orch_args Separated tensor/scalar arguments for this run + * @return 0 on success, -1 on failure + */ +extern "C" int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } if (orch_args == nullptr) { LOG_ERROR("orch_args pointer is null"); return -1; @@ -153,7 +177,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, int tensor_count = orch_args->tensor_count(); int scalar_count = orch_args->scalar_count(); - LOG_INFO_V0("RT2 init: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); + LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); int64_t t_total_start = _now_ms(); @@ -196,16 +220,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, } int64_t t_args_end = _now_ms(); - // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume. - // DeviceRunner hashes the bytes, skips the rtMemcpy when the identity is - // unchanged, and overwrites dev_orch_so_addr_ / size / has_new_orch_so_ - // on Runtime before the struct is sent to device. - int64_t t_so_start = _now_ms(); - runtime->pending_orch_so_data_ = orch_so_binary; - runtime->pending_orch_so_size_ = orch_so_size; - LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size); - int64_t t_so_end = _now_ms(); - // Read ready queue shard count from environment for AICPU scheduler { const char *env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS"); @@ -285,7 +299,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, int64_t t_total_end = _now_ms(); LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start); - LOG_INFO_V0("TIMING: orch_so_copy = %" PRId64 "ms", t_so_end - t_so_start); LOG_INFO_V0("TIMING: gm_heap_alloc(1GB) = %" PRId64 "ms", t_heap_end - t_heap_start); LOG_INFO_V0("TIMING: shared_mem_alloc = %" PRId64 "ms", t_sm_end - t_sm_start); LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); @@ -293,6 +306,19 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, return 0; } +/** + * Compatibility shim: `init_runtime_impl` is the legacy single-call path that + * still drives every `run_runtime` invocation today. The callable.md split + * keeps it as `prepare_callable_impl + bind_prepared_to_runtime_impl` so the + * legacy path stays one function to platform code, while `run_prepared` can + * skip the prepare half once a callable_id is staged. 
+ */ +extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { + int rc = prepare_callable_impl(runtime, callable); + if (rc != 0) return rc; + return bind_prepared_to_runtime_impl(runtime, orch_args); +} + /** * Validate runtime results and cleanup. * diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 5cdf3e1f4..722231a8a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -29,6 +29,12 @@ #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ +// This variant supports the per-callable_id dispatch protocol (callable.md). +// DeviceRunner and pto_runtime_c_api.cpp check this at compile time to guard +// callable_id-specific code paths so the same sources compile cleanly against +// variants that lack the protocol (host_build_graph, aicpu_build_graph). +#define RUNTIME_HAS_CALLABLE_ID 1 + #include #include #include // for fprintf, printf diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index b6588dc45..88c797ba1 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -127,6 +127,67 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v); */ int finalize_device(DeviceContextHandle ctx); +/* =========================================================================== + * Per-callable_id preparation (callable.md design) + * + * The triplet below decouples the one-shot prep work (kernel upload + orch SO + * H2D + caching keyed by `callable_id`) from each `run_prepared` invocation, + * so the per-run cost shrinks to "rebuild Runtime args + launch". Callers + * keep a stable small-int `callable_id` per ChipCallable; the platform side + * caches the prepared state in a fixed-size table (cap 64, see + * MAX_REGISTERED_CALLABLE_IDS in the AICPU executor) and rejects ids outside + * `[0, 64)`. Lifetime: caller must `unregister_callable` before + * `finalize_device` to release the device-side orch SO buffer; kernels stay + * resident until finalize regardless. + * =========================================================================== */ + +/** + * Stage a callable for repeated cheap launches under the given `callable_id`. + * + * Uploads child kernels into the DeviceRunner's func_id-keyed cache and + * copies the orchestration SO bytes into a device-resident buffer keyed by + * the SO's ELF Build-ID hash (so two callable_ids with identical SO share + * one buffer). Subsequent `run_prepared(callable_id, ...)` calls reuse this + * state. + * + * @return 0 on success, negative on error (NULL ctx, callable_id out of + * range, or upload/copy failure). + */ +int prepare_callable( + DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, + size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size +); + +/** + * Launch a callable previously staged via `prepare_callable`. + * + * Same effective behavior as `run_runtime` but skips the per-run kernel + * upload + orch SO H2D, looking up the prepared state by `callable_id`. The + * AICPU side dispatches via `orch_so_table_[callable_id]` (see + * runtime.h::set_active_callable_id). 
The first run for a given callable_id + * sets `register_new_callable_id_` so the AICPU does its one-time dlopen. + * + * @return 0 on success, negative on error (no prep state, NULL ctx, etc.). + */ +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix +); + +/** + * Drop the prepared state for `callable_id` and release the per-id share of + * the device orch SO buffer. The buffer itself is freed only when its + * hash-keyed refcount drops to zero (different callable_ids with identical + * SO share one allocation). + * + * Kernel binaries uploaded by `prepare_callable` remain resident — they are + * shared across callables by func_id and only released by `finalize_device`. + * + * @return 0 on success or if callable_id was not registered, negative on error. + */ +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id); + #ifdef __cplusplus } #endif From 97e56d159214126133d2cb0940af5f1e10c231c8 Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 29 Apr 2026 15:08:56 +0800 Subject: [PATCH 03/28] feat(callable): expose prepare/run_prepared/unregister via ChipWorker + Python Layer 4 of the callable.md migration: drive the per-callable_id C ABI (introduced in fc721150) end-to-end through ChipWorker, the nanobind surface, and the Python wrapper, plus a sticky flag in DeviceRunner that keeps finalize's "kernel still cached" leak signal honest now that the prepared-callable path legitimately keeps kernels resident until finalize. - ChipWorker (src/common/worker): dlsym the new symbols and add prepare_callable / run_prepared / unregister_callable methods with device-not-set guards. Stubs in non-trb variants surface the runtime rejection as a thrown error. - nanobind: bind the three methods on _ChipWorker so the Python wrapper can drive them without a separate raw-pointer path. - Python wrapper (simpler.task_interface.ChipWorker): thin pass-through that mirrors run()'s **kwargs config-override pattern. - DeviceRunner.finalize: distinguish legacy-path "still-cached kernels" leaks from prepared-callable kernels that live until finalize by design. Uses a sticky prepared_callable_path_used_ flag set by register_prepared_callable (never cleared, so a post-unregister finalize still routes to DEBUG instead of ERROR). - tests/ut/py/test_chip_worker.py: 3 new state-machine guards covering the new methods before set_device. - tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable: new e2e test that prepares two callable_ids sharing the vector_example orch SO, runs each via run_prepared (cid=0 twice to hit the dedup path), then unregisters both. 
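Illustrative Python round trip (sketch; `chip_callable` and `chip_args`
construction elided, cid 0 chosen arbitrarily):

    worker.prepare_callable(0, chip_callable)        # one-time upload
    worker.run_prepared(0, chip_args, block_dim=24)  # cheap launch
    worker.run_prepared(0, chip_args, block_dim=24)  # cache hit, no re-upload
    worker.unregister_callable(0)                    # drop the orch SO share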
Verified: - tests/ut/py/test_chip_worker.py: 15/15 PASSED - prepared_callable test: PASSED on a2a3sim - paged_attention_unroll on a2a3 hardware (--device 9): PASSED --- python/bindings/task_interface.cpp | 27 ++++ python/simpler/task_interface.py | 28 ++++ .../platform/onboard/host/device_runner.cpp | 22 ++- .../platform/onboard/host/device_runner.h | 4 + src/a2a3/platform/sim/host/device_runner.cpp | 20 ++- src/a2a3/platform/sim/host/device_runner.h | 7 + src/common/worker/chip_worker.cpp | 55 ++++++++ src/common/worker/chip_worker.h | 23 ++- .../test_prepared_callable.py | 132 ++++++++++++++++++ tests/ut/py/test_chip_worker.py | 22 +++ 10 files changed, 328 insertions(+), 12 deletions(-) create mode 100644 tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 344758b78..7872197ba 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -648,6 +648,33 @@ NB_MODULE(_task_interface, m) { "blob_ptr and dispatch to the runtime. Used from forked chip processes " "reading the WorkerThread mailbox." ) + .def( + "prepare_callable", + [](ChipWorker &self, int32_t callable_id, const PyChipCallable &callable) { + self.prepare_callable(callable_id, callable.buffer_.data()); + }, + nb::arg("callable_id"), nb::arg("callable"), + "Stage a ChipCallable under callable_id for cheap repeated launches " + "via run_prepared. Variants without callable.md support raise." + ) + .def( + "run_prepared", + [](ChipWorker &self, int32_t callable_id, ChipStorageTaskArgs &args, const CallConfig &config) { + self.run_prepared(callable_id, &args, config); + }, + nb::arg("callable_id"), nb::arg("args"), nb::arg("config"), + "Launch a callable_id previously staged via prepare_callable." + ) + .def( + "unregister_callable", + [](ChipWorker &self, int32_t callable_id) { + self.unregister_callable(callable_id); + }, + nb::arg("callable_id"), + "Drop the prepared state for callable_id; releases the per-id share " + "of the device orch SO buffer (kernel binaries stay resident until " + "finalize)." + ) .def_prop_ro("device_id", &ChipWorker::device_id) .def_prop_ro("initialized", &ChipWorker::initialized) .def_prop_ro("device_set", &ChipWorker::device_set) diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index 71ac81122..5e9830329 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -329,6 +329,34 @@ def run_from_blob(self, callable, blob_ptr, config): """ self._impl.run_from_blob(int(callable), int(blob_ptr), config) + def prepare_callable(self, callable_id, callable): + """Stage a ChipCallable under ``callable_id`` for repeated cheap launches. + + Uploads the kernel binaries + the orchestration SO once; subsequent + ``run_prepared(callable_id, ...)`` skips that work. ``callable_id`` + must be in ``[0, 64)``. Requires ``set_device()``. + """ + self._impl.prepare_callable(int(callable_id), callable) + + def run_prepared(self, callable_id, args, config=None, **kwargs): + """Launch a ``callable_id`` previously staged via ``prepare_callable``. + + Args: + callable_id: Stable id passed to a prior ``prepare_callable``. + args: ChipStorageTaskArgs for this invocation. + config: Optional CallConfig. If None, a default is created. + **kwargs: Overrides applied to config (e.g. block_dim=24). 
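+
+        Example (sketch; assumes ``cid`` came from a prior
+        ``prepare_callable``)::
+
+            worker.run_prepared(cid, chip_args, block_dim=24)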
+ """ + if config is None: + config = CallConfig() + for k, v in kwargs.items(): + setattr(config, k, v) + self._impl.run_prepared(int(callable_id), args, config) + + def unregister_callable(self, callable_id): + """Drop prepared state for ``callable_id`` and release its orch SO share.""" + self._impl.unregister_callable(int(callable_id)) + def malloc(self, size): """Allocate memory. Returns a pointer (uint64).""" return int(self._impl.malloc(int(size))) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index acee84a1b..a8e0a68a8 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -892,6 +892,7 @@ int DeviceRunner::register_prepared_callable( state.config_name = (config_name != nullptr) ? config_name : ""; state.kernel_addrs = std::move(kernel_addrs); prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; return 0; } @@ -965,14 +966,27 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); - // Kernel binaries should have been removed by validate_runtime_impl() + // Kernel binaries are normally released by validate_runtime_impl on the + // legacy run() path. The callable.md prepared-callable path intentionally + // leaves them resident across runs (shared by func_id) and relies on + // finalize() to reclaim them; that is not a leak. Emit at DEBUG so the + // legacy regression signal is preserved for callers that never went + // through prepare_callable. if (!func_id_to_addr_.empty()) { - LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size()); - // Cleanup leaked binaries to prevent memory leaks +#ifdef RUNTIME_HAS_CALLABLE_ID + const bool prepared_path_used = prepared_callable_path_used_; +#else + const bool prepared_path_used = false; +#endif + if (prepared_path_used) { + LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); + } else { + LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size()); + } for (const auto &pair : func_id_to_addr_) { void *gm_addr = reinterpret_cast(pair.second); mem_alloc_.free(gm_addr); - LOG_DEBUG("Freed leaked kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second); + LOG_DEBUG("Freed kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second); } } func_id_to_addr_.clear(); diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index cf127a544..b592bbb7e 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -538,6 +538,10 @@ class DeviceRunner { std::unordered_map prepared_callables_; std::unordered_map orch_so_dedup_; std::unordered_set aicpu_seen_callable_ids_; + // Sticky flag: prepare_callable was called at least once. Distinguishes + // legacy-path "kernel still cached at finalize" leaks from prepared-path + // kernels that legitimately live until finalize. + bool prepared_callable_path_used_{false}; #endif // RUNTIME_HAS_CALLABLE_ID // ACL lifecycle (process-wide). 
aclInit must run exactly once; ensure_acl_ready diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index bf50487f9..d1b11a527 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -799,6 +799,7 @@ int DeviceRunner::register_prepared_callable( state.config_name = (config_name != nullptr) ? config_name : ""; state.kernel_addrs = std::move(kernel_addrs); prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; return 0; } @@ -870,15 +871,26 @@ int DeviceRunner::finalize() { pmu_collector_.finalize(nullptr, free_cb, &mem_alloc_); } - // Kernel binaries should have been removed by validate_runtime_impl() + // Kernel binaries are normally released by validate_runtime_impl on the + // legacy run() path. The callable.md prepared-callable path intentionally + // leaves them resident across runs and relies on finalize() to reclaim + // them; that is not a leak. if (!func_id_to_addr_.empty()) { - LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); - // Cleanup leaked handles and host copies +#ifdef RUNTIME_HAS_CALLABLE_ID + const bool prepared_path_used = prepared_callable_path_used_; +#else + const bool prepared_path_used = false; +#endif + if (prepared_path_used) { + LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); + } else { + LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); + } for (auto &pair : func_id_to_addr_) { MappedKernel &kernel = pair.second; if (kernel.dl_handle != nullptr) { dlclose(kernel.dl_handle); - LOG_DEBUG("Closed leaked kernel: func_id=%d", pair.first); + LOG_DEBUG("Closed kernel: func_id=%d", pair.first); } delete[] kernel.callable_buf; } diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index f14fcc333..2981e3b89 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -262,6 +262,13 @@ class DeviceRunner { std::unordered_map prepared_callables_; std::unordered_map orch_so_dedup_; std::unordered_set aicpu_seen_callable_ids_; + // Sticky flag: prepare_callable was called at least once in this + // DeviceRunner's lifetime. unregister_prepared_callable clears the maps + // above, so we cannot use them at finalize() time to decide whether a + // remaining func_id_to_addr_ entry is a legacy-path leak or a kernel + // legitimately staged by prepare_callable (which is owned until finalize + // by design). + bool prepared_callable_path_used_{false}; #endif // RUNTIME_HAS_CALLABLE_ID // AICPU executor SO: load-once, matching onboard's binaries_loaded_ pattern. diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 38680e77a..360b81d67 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -150,6 +150,9 @@ void ChipWorker::init( get_runtime_size_fn_ = load_symbol(handle, "get_runtime_size"); run_runtime_fn_ = load_symbol(handle, "run_runtime"); simpler_init_fn_ = load_symbol(handle, "simpler_init"); + prepare_callable_fn_ = load_symbol(handle, "prepare_callable"); + run_prepared_fn_ = load_symbol(handle, "run_prepared"); + unregister_callable_fn_ = load_symbol(handle, "unregister_callable"); finalize_device_fn_ = load_symbol(handle, "finalize_device"); // ACL lifecycle + comm_* are part of the uniform host_runtime.so ABI. 
// Every platform runtime exports all of them — runtimes that do not @@ -243,6 +246,9 @@ void ChipWorker::finalize() { copy_from_device_ctx_fn_ = nullptr; get_runtime_size_fn_ = nullptr; run_runtime_fn_ = nullptr; + prepare_callable_fn_ = nullptr; + run_prepared_fn_ = nullptr; + unregister_callable_fn_ = nullptr; finalize_device_fn_ = nullptr; ensure_acl_ready_fn_ = nullptr; create_comm_stream_fn_ = nullptr; @@ -287,6 +293,55 @@ void ChipWorker::run(const void *callable, const void *args, const CallConfig &c } } +void ChipWorker::prepare_callable(int32_t callable_id, const void *callable) { + if (!device_set_) { + throw std::runtime_error("ChipWorker device not set; call set_device() first"); + } + if (callable == nullptr) { + throw std::runtime_error("prepare_callable: callable must not be null"); + } + int rc = prepare_callable_fn_( + device_ctx_, callable_id, callable, device_id_, aicpu_binary_.data(), aicpu_binary_.size(), + aicore_binary_.data(), aicore_binary_.size() + ); + if (rc != 0) { + throw std::runtime_error("prepare_callable failed with code " + std::to_string(rc)); + } +} + +void ChipWorker::run_prepared(int32_t callable_id, TaskArgsView args, const CallConfig &config) { + ChipStorageTaskArgs chip_storage = view_to_chip_storage(args); + run_prepared(callable_id, &chip_storage, config); +} + +void ChipWorker::run_prepared(int32_t callable_id, const void *args, const CallConfig &config) { + config.validate(); + if (!device_set_) { + throw std::runtime_error("ChipWorker device not set; call set_device() first"); + } + + void *rt = runtime_buf_.data(); + + int rc = run_prepared_fn_( + device_ctx_, rt, callable_id, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(), + aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_l2_swimlane, + config.enable_dump_tensor, config.enable_pmu, config.output_prefix + ); + if (rc != 0) { + throw std::runtime_error("run_prepared failed with code " + std::to_string(rc)); + } +} + +void ChipWorker::unregister_callable(int32_t callable_id) { + if (!device_set_) { + throw std::runtime_error("ChipWorker device not set; call set_device() first"); + } + int rc = unregister_callable_fn_(device_ctx_, callable_id); + if (rc != 0) { + throw std::runtime_error("unregister_callable failed with code " + std::to_string(rc)); + } +} + uint64_t ChipWorker::malloc(size_t size) { if (!device_set_) { throw std::runtime_error("ChipWorker device not set; call set_device() first"); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index 3e529a511..72d8cd492 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -61,6 +61,15 @@ class ChipWorker : public IWorker { // the TaskArgsView path and takes a ready-made ChipStorageTaskArgs POD. void run(const void *callable, const void *args, const CallConfig &config); + // Per-callable_id preparation (callable.md design). The runtime variant + // bound at init() may export real implementations or stubs that return + // -1; ChipWorker forwards the result to the caller. callable_id must be + // in [0, MAX_REGISTERED_CALLABLE_IDS) (cap 64). Requires set_device(). 
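+    //
+    // Sketch of the intended sequence (cid 7 arbitrary; failures surface as
+    // thrown std::runtime_error):
+    //   w.prepare_callable(7, callable);   // one-time upload
+    //   w.run_prepared(7, args, config);   // repeated cheap launches
+    //   w.unregister_callable(7);          // before finalize()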
+ void prepare_callable(int32_t callable_id, const void *callable); + void run_prepared(int32_t callable_id, TaskArgsView args, const CallConfig &config); + void run_prepared(int32_t callable_id, const void *args, const CallConfig &config); + void unregister_callable(int32_t callable_id); + uint64_t malloc(size_t size); void free(uint64_t ptr); void copy_to(uint64_t dst, uint64_t src, size_t size); @@ -102,11 +111,14 @@ class ChipWorker : public IWorker { using CopyToDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using CopyFromDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using GetRuntimeSizeFn = size_t (*)(); - using RunRuntimeFn = int (*)( - void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, - int, int, int, const char * - ); + using RunRuntimeFn = + int (*)(void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, int, int, const char *); using SimplerInitFn = void (*)(void *, int, int); + using PrepareCallableFn = + int (*)(void *, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t); + using RunPreparedFn = + int (*)(void *, void *, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, int, int, const char *); + using UnregisterCallableFn = int (*)(void *, int32_t); using FinalizeDeviceFn = int (*)(void *); using EnsureAclReadyFn = int (*)(void *, int); using CreateCommStreamFn = void *(*)(void *); @@ -129,6 +141,9 @@ class ChipWorker : public IWorker { GetRuntimeSizeFn get_runtime_size_fn_ = nullptr; RunRuntimeFn run_runtime_fn_ = nullptr; SimplerInitFn simpler_init_fn_ = nullptr; + PrepareCallableFn prepare_callable_fn_ = nullptr; + RunPreparedFn run_prepared_fn_ = nullptr; + UnregisterCallableFn unregister_callable_fn_ = nullptr; FinalizeDeviceFn finalize_device_fn_ = nullptr; EnsureAclReadyFn ensure_acl_ready_fn_ = nullptr; CreateCommStreamFn create_comm_stream_fn_ = nullptr; diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py new file mode 100644 index 000000000..14bb8a792 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""End-to-end test for ChipWorker.prepare_callable / run_prepared / unregister_callable. + +Reuses the vector_example orchestration + AIV kernels. Exercises: + - prepare_callable once, then run_prepared twice (second run proves the + AICPU-side dlopen cache / host-side orch SO dedup is working — no re-upload). 
+ - Two distinct callable_ids sharing the same orch SO binary: verifies both + produce correct output independently. + - unregister_callable after runs complete: should not raise. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +_VECTOR_KERNELS = "../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels" + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestPreparedCallable(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable ABI.""" + + CALLABLE = { + "orchestration": { + "source": f"{_VECTOR_KERNELS}/orchestration/example_orchestration.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3} + _PLATFORMS = ["a2a3sim", "a2a3"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + # 1) prepare two callable_ids with the SAME callable (shared orch SO) + worker.prepare_callable(0, callable_obj) + worker.prepare_callable(1, callable_obj) + + # 2) run_prepared cid=0 twice (second run proves dedup/cache hit) + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(0, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 3) run_prepared cid=1 — different slot, same SO, must also work + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(1, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 4) unregister both — should not raise + worker.unregister_callable(0) + worker.unregister_callable(1) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git 
a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 520254cc5..adeeeb6fc 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -110,6 +110,28 @@ def test_init_with_nonexistent_lib_raises(self): with pytest.raises(RuntimeError, match="dlopen"): worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "/nonexistent/libsimpler_log.so") + def test_prepare_callable_before_set_device_raises(self): + from _task_interface import ChipCallable # noqa: PLC0415 + + worker = _ChipWorker() + callable_obj = ChipCallable.build(signature=[], func_name="test", binary=b"\x00", children=[]) + with pytest.raises(RuntimeError, match="device not set"): + worker.prepare_callable(0, callable_obj) + + def test_run_prepared_before_set_device_raises(self): + from _task_interface import ChipStorageTaskArgs # noqa: PLC0415 + + worker = _ChipWorker() + config = CallConfig() + args = ChipStorageTaskArgs() + with pytest.raises(RuntimeError, match="device not set"): + worker.run_prepared(0, args, config) + + def test_unregister_callable_before_set_device_raises(self): + worker = _ChipWorker() + with pytest.raises(RuntimeError, match="device not set"): + worker.unregister_callable(0) + # ============================================================================ # Python-level ChipWorker wrapper tests From 6621829b2bf527119cc4a2d11a0b929fe3c7233f Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 29 Apr 2026 16:51:38 +0800 Subject: [PATCH 04/28] feat(callable): port Stage 1 ABI surface to remaining variants Stage 2 of docs/callable.md: make the prepare/run_prepared/unregister ABI uniform across all 5 valid runtime variants (3 a2a3 + 2 a5) so ChipWorker dlsym is independent of which variant is loaded. - a5/platform/{onboard,sim}/host/pto_runtime_c_api.cpp: add unconditional stubs (prepare_callable/run_prepared return -1 with LOG_ERROR; unregister_callable returns 0). a5 has no RUNTIME_HAS_CALLABLE_ID-aware path yet, so the stubs are the entire surface; full support is deferred until a5 picks up the per-cid orch SO dispatch. - python/simpler/worker.py: add L2 facade methods on Worker that forward to the underlying ChipWorker. The ST framework's conftest.st_worker fixture wraps ChipWorker in Worker(level=2), so prepared_callable e2e tests (and any future caller going through Worker) need this thin pass-through. L3+ still raises NotImplementedError pending Stage 3 (mailbox protocol switch to cid). a2a3/{host_build_graph,aicpu_build_graph} required no source changes: the platform code is shared across the three a2a3 variants and was already gated by `#ifdef RUNTIME_HAS_CALLABLE_ID`, which only tensormap_and_ringbuffer's runtime.h defines. The non-trb variants fall through to the existing `#else` stub branch automatically. Verified on sim only: - 5 variants compile clean (a2a3sim x3, a5sim x2; a5 has no aicpu_build_graph). - UT test_chip_worker.py 15/15. - a2a3sim ST sample: host_build_graph 4/4, aicpu_build_graph 3/3, tensormap_and_ringbuffer 4/4 (incl. prepared_callable e2e). - a5sim ST: host_build_graph 1/1, tensormap_and_ringbuffer 10/10. 
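Illustrative L2 facade usage (sketch; Worker construction elided, forwards
straight to the underlying ChipWorker):

    w.prepare_callable(0, chip_callable)
    w.run_prepared(0, chip_args)   # config=None -> default CallConfig()
    w.unregister_callable(0)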
--- python/simpler/worker.py | 39 +++++++++++++++++++ .../onboard/host/pto_runtime_c_api.cpp | 23 +++++++++++ .../platform/sim/host/pto_runtime_c_api.cpp | 20 ++++++++++ 3 files changed, 82 insertions(+) diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 073084dc6..69d4fcdad 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -1075,6 +1075,45 @@ def run(self, callable, args=None, config=None) -> None: self._orch._scope_end() self._orch._drain() + def prepare_callable(self, callable_id: int, callable) -> None: + """L2 only: pre-stage a callable under ``callable_id`` (see + ``ChipWorker.prepare_callable``). Subsequent ``run_prepared`` skips + per-run kernel/orch SO upload. + """ + assert self._initialized, "Worker not initialized; call init() first" + if self.level != 2: + raise NotImplementedError("prepare_callable is L2-only (callable.md Stage 2)") + assert self._chip_worker is not None + self._chip_worker.prepare_callable(callable_id, callable) + + def run_prepared(self, callable_id: int, args=None, config=None) -> None: + """L2 only: launch a callable previously staged via ``prepare_callable``.""" + assert self._initialized, "Worker not initialized; call init() first" + if self.level != 2: + raise NotImplementedError("run_prepared is L2-only (callable.md Stage 2)") + assert self._chip_worker is not None + cfg = config if config is not None else CallConfig() + self._chip_worker.run_prepared(callable_id, args, cfg) + + def unregister_callable(self, callable_id: int) -> None: + """L2 only: drop the prepared state for ``callable_id``.""" + assert self._initialized, "Worker not initialized; call init() first" + if self.level != 2: + raise NotImplementedError("unregister_callable is L2-only (callable.md Stage 2)") + assert self._chip_worker is not None + self._chip_worker.unregister_callable(callable_id) + + def _run_as_child(self, cid: int, args, config) -> None: + """Called from C++ _Worker::run when this Worker is a THREAD-mode child. + + Looks up the orch function from the callable registry and delegates + to ``self.run(orch_fn, args, config)``. + """ + orch_fn = self._callable_registry.get(cid) + if orch_fn is None: + raise KeyError(f"callable id {cid} not found in registry") + self.run(orch_fn, args, config) + # ------------------------------------------------------------------ # close # ------------------------------------------------------------------ diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index fa151b1ab..a89dab14a 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -330,5 +330,28 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { runner->set_log_level(log_level); runner->set_log_info_v(log_info_v); } +/* =========================================================================== + * Prepared-callable ABI stubs. + * + * a5 runtimes do not yet implement the per-callable_id orchestration SO + * dispatch path described in docs/callable.md (only a2a3/tensormap_and_ringbuffer + * does). ChipWorker dlsym's these symbols unconditionally, so we expose stubs + * that fail loudly at call time rather than failing to load the library. 
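+ *
+ * Caller-visible effect (per ChipWorker::prepare_callable): the -1 return
+ * surfaces as a thrown std::runtime_error("prepare_callable failed with
+ * code -1") instead of a dlopen/dlsym failure at init().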
+ * =========================================================================== */ + +int prepare_callable( + DeviceContextHandle, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t +) { + LOG_ERROR("prepare_callable not supported by this runtime variant"); + return -1; +} +int run_prepared( + DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, + size_t, int, int, int, const char * +) { + LOG_ERROR("run_prepared not supported by this runtime variant"); + return -1; +} +int unregister_callable(DeviceContextHandle, int32_t) { return 0; } } // extern "C" diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index cd16e3734..1693783d8 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -281,5 +281,25 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { runner->set_log_level(log_level); runner->set_log_info_v(log_info_v); } +/* =========================================================================== + * Prepared-callable ABI stubs. + * + * a5 runtimes do not yet implement the per-callable_id orchestration SO + * dispatch path described in docs/callable.md (only a2a3/tensormap_and_ringbuffer + * does). ChipWorker dlsym's these symbols unconditionally, so we expose stubs + * that fail loudly at call time rather than failing to load the library. + * =========================================================================== */ + +int prepare_callable( + DeviceContextHandle, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t +) { + LOG_ERROR("prepare_callable not supported by this runtime variant"); + return -1; +} +int run_prepared(DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, int, int, const char *) { + LOG_ERROR("run_prepared not supported by this runtime variant"); + return -1; +} +int unregister_callable(DeviceContextHandle, int32_t) { return 0; } } // extern "C" From 8690d32fe80f84f9da6749ec53352752df065e8e Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 6 May 2026 10:35:04 +0800 Subject: [PATCH 05/28] =?UTF-8?q?feat(callable):=20Stage=203=20=E2=80=94?= =?UTF-8?q?=20mailbox=20cid=20protocol=20+=20L3+=20lazy-register=20+=20pre?= =?UTF-8?q?-warm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the NEXT_LEVEL raw ChipCallable* pointer path with a unified callable_id (cid) protocol: C++ core: - Remove TaskSlotState::callable (uint64_t ptr) field; unify on callable_id - Orchestrator::submit_next_level now takes int32_t callable_id - dispatch_thread/dispatch_process write cid into mailbox for both NEXT_LEVEL and SUB worker types Python runtime: - Worker.register() accepts ChipCallable in addition to Python fns; returns cid from a single shared id space - _chip_process_loop / _chip_process_loop_with_bootstrap: accept registry dict, read cid from mailbox, lazy-prepare + run_prepared - New _CTRL_PREPARE (=4) control command for explicit pre-warming - _start_hierarchical: after init(), pushes _CTRL_PREPARE to every chip child for each registered ChipCallable (fixes first-run latency spike) - Orchestrator.submit_next_level raises TypeError on raw ChipCallable (migration guide: use Worker.register + pass cid) Nanobind: - _Orchestrator binding: submit_next_level takes int32_t callable_id - 
_ChipWorker.run_prepared: add TaskArgs overload (chip child path) Test infrastructure: - conftest.py st_worker L3: register ChipCallable entries before init - scene_test.py _create_standalone_worker: compile + register ChipCallable before init; CallableNamespace exposes cid (int) not ChipCallable - Migrate 7 L3 examples/demos to register + cid pattern - C++ UTs: submit_next_level(int32_t, ...) signatures Verified: C++ UT 17/17, Python UT 70/70 (65+5), a2a3sim L3 ST 3/3, a5sim ST 10/10, prepared_callable L2 e2e 1/1. --- conftest.py | 11 +++ .../test_async_notify_demo.py | 3 +- .../test_deferred_notify_demo.py | 3 +- .../test_async_notify_demo.py | 3 +- .../test_deferred_notify_demo.py | 3 +- .../workers/l3/allreduce_distributed/main.py | 3 +- examples/workers/l3/ffn_tp_parallel/main.py | 6 +- .../workers/l3/multi_chip_dispatch/main.py | 5 +- python/bindings/task_interface.cpp | 9 ++ python/bindings/worker_bind.h | 21 ++-- python/simpler/orchestrator.py | 39 +++++--- python/simpler/worker.py | 95 ++++++++++++++++--- simpler_setup/scene_test.py | 28 +++++- src/common/hierarchical/orchestrator.cpp | 17 ++-- src/common/hierarchical/orchestrator.h | 17 ++-- src/common/hierarchical/types.cpp | 1 - src/common/hierarchical/types.h | 9 +- src/common/hierarchical/worker_manager.cpp | 2 +- .../ut/cpp/hierarchical/test_orchestrator.cpp | 30 +++--- tests/ut/cpp/hierarchical/test_scheduler.cpp | 22 ++--- 20 files changed, 237 insertions(+), 90 deletions(-) diff --git a/conftest.py b/conftest.py index bde0b7800..5d029caaf 100644 --- a/conftest.py +++ b/conftest.py @@ -920,11 +920,22 @@ def st_worker(request, st_platform, device_pool, _l2_worker_pool): # Register SubCallable entries from cls.CALLABLE sub_ids = {} + chip_cids = {} for entry in cls.CALLABLE.get("callables", []): if "callable" in entry: cid = w.register(entry["callable"]) sub_ids[entry["name"]] = cid + elif "orchestration" in entry: + from simpler_setup.scene_test import _compile_chip_callable_from_spec # noqa: PLC0415 + + name = entry["name"] + cache_key = (cls.__qualname__, name, st_platform, runtime) + chip = _compile_chip_callable_from_spec(entry, st_platform, runtime, cache_key) + cid = w.register(chip) + chip_cids[name] = cid + chip_cids[f"{name}_sig"] = entry["orchestration"].get("signature", []) cls._st_sub_ids = sub_ids + cls._st_chip_cids = chip_cids w.init() yield w diff --git a/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py b/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py index 9977a3a4b..7461f1b7e 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py +++ b/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py @@ -137,6 +137,7 @@ def run( chip_bootstrap_configs=cfgs, build=build, ) + chip_cid = worker.register(chip_callable) try: worker.init() contexts: list[ChipContext] = worker.chip_contexts @@ -157,7 +158,7 @@ def orch_fn(orch, _args, cfg): TensorArgType.INPUT, ) args.add_scalar(ctx.device_ctx) - orch.submit_next_level(chip_callable, args, cfg, worker=rank) + orch.submit_next_level(chip_cid, args, cfg, worker=rank) worker.run(orch_fn, args=None, config=CallConfig()) diff --git a/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py b/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py index 6045efe4d..31cd3c479 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py +++ 
b/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py @@ -159,6 +159,7 @@ def run( chip_bootstrap_configs=cfgs, build=build, ) + chip_cid = worker.register(chip_callable) try: worker.init() contexts: list[ChipContext] = worker.chip_contexts @@ -187,7 +188,7 @@ def orch_fn(orch, _args, cfg): TensorArgType.INPUT, ) args.add_scalar(ctx.device_ctx) - orch.submit_next_level(chip_callable, args, cfg, worker=rank) + orch.submit_next_level(chip_cid, args, cfg, worker=rank) worker.run(orch_fn, args=None, config=CallConfig()) diff --git a/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py b/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py index 2bfab2131..c71fe5498 100644 --- a/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py +++ b/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py @@ -131,6 +131,7 @@ def run(platform: str = "a5", device_ids: list[int] | None = None, pto_isa_commi num_sub_workers=0, chip_bootstrap_configs=cfgs, ) + chip_cid = worker.register(chip_callable) try: worker.init() contexts: list[ChipContext] = worker.chip_contexts @@ -151,7 +152,7 @@ def orch_fn(orch, _args, cfg): TensorArgType.INPUT, ) args.add_scalar(ctx.device_ctx) - orch.submit_next_level(chip_callable, args, cfg, worker=rank) + orch.submit_next_level(chip_cid, args, cfg, worker=rank) worker.run(orch_fn, args=None, config=CallConfig()) diff --git a/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py b/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py index e20b2ecec..d05e19b13 100644 --- a/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py +++ b/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py @@ -159,6 +159,7 @@ def run( chip_bootstrap_configs=cfgs, build=build, ) + chip_cid = worker.register(chip_callable) try: worker.init() contexts: list[ChipContext] = worker.chip_contexts @@ -187,7 +188,7 @@ def orch_fn(orch, _args, cfg): TensorArgType.INPUT, ) args.add_scalar(ctx.device_ctx) - orch.submit_next_level(chip_callable, args, cfg, worker=rank) + orch.submit_next_level(chip_cid, args, cfg, worker=rank) worker.run(orch_fn, args=None, config=CallConfig()) diff --git a/examples/workers/l3/allreduce_distributed/main.py b/examples/workers/l3/allreduce_distributed/main.py index bd646df82..0dfa3d4de 100644 --- a/examples/workers/l3/allreduce_distributed/main.py +++ b/examples/workers/l3/allreduce_distributed/main.py @@ -194,6 +194,7 @@ def run(device_ids: list[int]) -> int: num_sub_workers=0, chip_bootstrap_configs=cfgs, ) + chip_cid = worker.register(chip_callable) try: print("[allreduce] init worker (forks chip children + bootstraps HCCL)...") @@ -227,7 +228,7 @@ def orch_fn(orch, _args, cfg): ) chip_args.add_scalar(ctx.nranks) chip_args.add_scalar(ctx.device_ctx) - orch.submit_next_level(chip_callable, chip_args, cfg, worker=i) + orch.submit_next_level(chip_cid, chip_args, cfg, worker=i) print("[allreduce] running 2-chip allreduce DAG...") worker.run(orch_fn, args=None, config=CallConfig()) diff --git a/examples/workers/l3/ffn_tp_parallel/main.py b/examples/workers/l3/ffn_tp_parallel/main.py index b41dd561b..aa2bb2d2c 100644 --- a/examples/workers/l3/ffn_tp_parallel/main.py +++ b/examples/workers/l3/ffn_tp_parallel/main.py @@ -209,6 +209,8 @@ def run(device_ids: list[int]) -> int: num_sub_workers=0, 
chip_bootstrap_configs=cfgs, ) + ffn_cid = worker.register(ffn_local_cc) + allreduce_cid = worker.register(allreduce_cc) try: print("[ffn_tp_parallel] init worker (forks chip children + bootstraps HCCL)...") @@ -231,7 +233,7 @@ def orch_fn(orch, _args, cfg): a1.add_tensor(make_tensor_arg(host_x_shards[i]), TensorArgType.INPUT) a1.add_tensor(make_tensor_arg(host_w_shards[i]), TensorArgType.INPUT) a1.add_tensor(make_tensor_arg(host_partial[i]), TensorArgType.OUTPUT_EXISTING) - orch.submit_next_level(ffn_local_cc, a1, cfg, worker=i) + orch.submit_next_level(ffn_cid, a1, cfg, worker=i) # Stage 2: AIV cross-rank sum. Tagging partial_local INPUT # with the same buffer.addr makes TensorMap auto-link this @@ -250,7 +252,7 @@ def orch_fn(orch, _args, cfg): ) a2.add_scalar(ctx.nranks) a2.add_scalar(ctx.device_ctx) - orch.submit_next_level(allreduce_cc, a2, cfg, worker=i) + orch.submit_next_level(allreduce_cid, a2, cfg, worker=i) print("[ffn_tp_parallel] running 2-chip 2-stage DAG...") worker.run(orch_fn, args=None, config=CallConfig()) diff --git a/examples/workers/l3/multi_chip_dispatch/main.py b/examples/workers/l3/multi_chip_dispatch/main.py index b92a6fc10..1b5278877 100644 --- a/examples/workers/l3/multi_chip_dispatch/main.py +++ b/examples/workers/l3/multi_chip_dispatch/main.py @@ -146,6 +146,9 @@ def subworker(sub_args: TaskArgs) -> None: print(f"[multi_chip_dispatch] compiling kernels for {platform}...") chip_callable = build_chip_callable(platform) + # Register the ChipCallable so submit_next_level takes a cid. + chip_cid = worker.register(chip_callable) + # --- 5. init() forks chip + sub child processes, starts C++ scheduler. print("[multi_chip_dispatch] init worker...") worker.init() @@ -165,7 +168,7 @@ def orch_fn(orch, _args, cfg): chip_args.add_tensor(make_tensor_arg(host_a[i]), TensorArgType.INPUT) chip_args.add_tensor(make_tensor_arg(host_b[i]), TensorArgType.INPUT) chip_args.add_tensor(make_tensor_arg(host_out[i]), TensorArgType.OUTPUT_EXISTING) - orch.submit_next_level(chip_callable, chip_args, cfg, worker=i) + orch.submit_next_level(chip_cid, chip_args, cfg, worker=i) # Sub task that depends on both chip outputs. Tagging the two # host_out[i] tensors INPUT tells the scheduler to wait for diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 7872197ba..7355b6493 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -665,6 +665,15 @@ NB_MODULE(_task_interface, m) { nb::arg("callable_id"), nb::arg("args"), nb::arg("config"), "Launch a callable_id previously staged via prepare_callable." ) + .def( + "run_prepared", + [](ChipWorker &self, int32_t callable_id, TaskArgs &args, const CallConfig &config) { + TaskArgsView view = make_view(args); + self.run_prepared(callable_id, view, config); + }, + nb::arg("callable_id"), nb::arg("args"), nb::arg("config"), + "Launch a callable_id from a TaskArgs (used by chip child loops)." 
+ ) .def( "unregister_callable", [](ChipWorker &self, int32_t callable_id) { diff --git a/python/bindings/worker_bind.h b/python/bindings/worker_bind.h index f9824980f..fc1e0c909 100644 --- a/python/bindings/worker_bind.h +++ b/python/bindings/worker_bind.h @@ -98,20 +98,23 @@ inline void bind_worker(nb::module_ &m) { nb::class_(m, "_Orchestrator") .def( "submit_next_level", - [](Orchestrator &self, uint64_t callable, const TaskArgs &args, const CallConfig &config, int8_t worker) { - return self.submit_next_level(callable, args, config, worker); + [](Orchestrator &self, int32_t callable_id, const TaskArgs &args, const CallConfig &config, int8_t worker) { + return self.submit_next_level(callable_id, args, config, worker); }, - nb::arg("callable"), nb::arg("args"), nb::arg("config"), nb::arg("worker") = int8_t(-1), - "Submit a NEXT_LEVEL (chip) task. worker= pins to a specific next-level worker (-1 = any)." + nb::arg("callable_id"), nb::arg("args"), nb::arg("config"), nb::arg("worker") = int8_t(-1), + "Submit a NEXT_LEVEL (chip) task by registered callable id " + "(Stage 3, callable.md). worker= pins to a specific next-level " + "worker (-1 = any)." ) .def( "submit_next_level_group", - [](Orchestrator &self, uint64_t callable, const std::vector &args_list, const CallConfig &config, - const std::vector &workers) { - return self.submit_next_level_group(callable, args_list, config, workers); + [](Orchestrator &self, int32_t callable_id, const std::vector &args_list, + const CallConfig &config, const std::vector &workers) { + return self.submit_next_level_group(callable_id, args_list, config, workers); }, - nb::arg("callable"), nb::arg("args_list"), nb::arg("config"), nb::arg("workers") = std::vector{}, - "Submit a group of NEXT_LEVEL tasks. workers= per-args affinity (empty = any)." + nb::arg("callable_id"), nb::arg("args_list"), nb::arg("config"), nb::arg("workers") = std::vector{}, + "Submit a group of NEXT_LEVEL tasks by registered callable id. " + "workers= per-args affinity (empty = any)." ) .def( "submit_sub", diff --git a/python/simpler/orchestrator.py b/python/simpler/orchestrator.py index 4a8bec655..d3f679de2 100644 --- a/python/simpler/orchestrator.py +++ b/python/simpler/orchestrator.py @@ -17,11 +17,11 @@ def my_orch(orch, args, cfg): a = TaskArgs() a.add_tensor(make_tensor_arg(input_tensor), TensorArgType.INPUT) a.add_tensor(make_tensor_arg(output_tensor), TensorArgType.OUTPUT) - orch.submit_next_level(chip_callable, a, cfg) + orch.submit_next_level(chip_cid, a, cfg) # cid from Worker.register(chip_callable) sub_args = TaskArgs() sub_args.add_tensor(make_tensor_arg(output_tensor), TensorArgType.INPUT) - orch.submit_sub(cid, sub_args) + orch.submit_sub(sub_cid, sub_args) w.run(my_orch, my_args, my_config) @@ -35,6 +35,7 @@ def my_orch(orch, args, cfg): from .task_interface import ( CallConfig, + ChipCallable, ContinuousTensor, DataType, TaskArgs, @@ -44,11 +45,21 @@ def my_orch(orch, args, cfg): ) -def _resolve_callable_ptr(callable_: Any) -> int: - """Accept either a ChipCallable (has buffer_ptr()) or a raw int pointer.""" - if hasattr(callable_, "buffer_ptr"): - return callable_.buffer_ptr() - return int(callable_) +def _require_cid(callable_or_cid: Any, *, kind: str) -> int: + """Coerce a submit argument to a registered cid (Stage 3, callable.md). 
+ + Raises a clear migration error when the caller still passes a + ``ChipCallable`` directly — the Stage 3 contract requires every chip + callable to be registered via ``Worker.register(callable)`` *before* + ``init()`` so each chip child can pre-warm it on its own device. + """ + if isinstance(callable_or_cid, ChipCallable) or hasattr(callable_or_cid, "buffer_ptr"): + raise TypeError( + f"{kind} now takes a registered cid, not a ChipCallable. " + "Register the callable before init() via " + "`cid = worker.register(chip_callable)` and pass `cid` here." + ) + return int(callable_or_cid) class Orchestrator: @@ -68,18 +79,21 @@ def __init__(self, c_orchestrator: _COrchestrator) -> None: # ------------------------------------------------------------------ def submit_next_level( - self, callable_: Any, args: TaskArgs, config: Optional[CallConfig] = None, *, worker: int = -1 + self, callable_id: Any, args: TaskArgs, config: Optional[CallConfig] = None, *, worker: int = -1 ): - """Submit a NEXT_LEVEL (chip) task. Tags inside ``args`` drive deps. + """Submit a NEXT_LEVEL (chip) task by registered callable id. + ``callable_id`` must be the int returned by + ``Worker.register(chip_callable)``. Tags inside ``args`` drive deps. ``worker``: logical worker id for affinity (-1 = unconstrained). """ cfg = config if config is not None else CallConfig() - return self._o.submit_next_level(_resolve_callable_ptr(callable_), args, cfg, int(worker)) + cid = _require_cid(callable_id, kind="orch.submit_next_level") + return self._o.submit_next_level(cid, args, cfg, int(worker)) def submit_next_level_group( self, - callable_: Any, + callable_id: Any, args_list: list, config: Optional[CallConfig] = None, *, @@ -91,7 +105,8 @@ def submit_next_level_group( """ cfg = config if config is not None else CallConfig() w = [int(x) for x in workers] if workers else [] - return self._o.submit_next_level_group(_resolve_callable_ptr(callable_), args_list, cfg, w) + cid = _require_cid(callable_id, kind="orch.submit_next_level_group") + return self._o.submit_next_level_group(cid, args_list, cfg, w) def submit_sub(self, callable_id: int, args: Optional[TaskArgs] = None): """Submit a SUB task by registered callable id. diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 69d4fcdad..af1243414 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -54,7 +54,7 @@ def my_l4_orch(orch, args, config): import time import traceback from multiprocessing.shared_memory import SharedMemory -from typing import Any, Callable, Optional +from typing import Any, Optional from _task_interface import ( # pyright: ignore[reportMissingImports] CHIP_BOOTSTRAP_MAILBOX_SIZE, @@ -72,6 +72,7 @@ def my_l4_orch(orch, args, config): MAILBOX_SIZE, CallConfig, ChipBootstrapConfig, + ChipCallable, ChipContext, ChipWorker, ContinuousTensor, @@ -127,6 +128,11 @@ def my_l4_orch(orch, args, config): _CTRL_FREE = 1 _CTRL_COPY_TO = 2 _CTRL_COPY_FROM = 3 +# Stage 3 (callable.md): pre-warm a chip child for cid=arg0 by calling +# `prepare_callable(cid, registry[cid])` so the first run_prepared() does +# not pay the H2D upload cost. Sent from the parent right after init() +# (or whenever a new ChipCallable cid is registered). 
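+# Wire sketch (control-args layout documented just below): state=_CONTROL_*,
+# _OFF_CALLABLE carries the uint64 sub-command (here _CTRL_PREPARE) and
+# _CTRL_OFF_ARG0 the cid; the child masks the cid to its low 32 bits.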
+_CTRL_PREPARE = 4 # Control args layout (reuses task mailbox fields when state == _CONTROL_*): # offset 8 (_OFF_CALLABLE): uint64 sub-command @@ -260,6 +266,7 @@ def _chip_process_loop( buf: memoryview, bins, device_id: int, + registry: dict, log_level: int = 1, log_info_v: int = 5, ) -> None: @@ -271,6 +278,13 @@ def _chip_process_loop( `log_level` / `log_info_v` are the parent's snapshot of the simpler logger (computed via `_log.get_current_config()`); the child cannot read the parent's logger after fork, so the values are passed explicitly. + + Stage 3 (callable.md): TASK_READY carries a cid in OFF_CALLABLE; the + child looks the cid up in the COW-inherited Python ``registry`` to get + the ChipCallable, calls ``cw.prepare_callable(cid, callable)`` once, + then ``cw.run_prepared(cid, args, cfg)``. ``_CTRL_PREPARE`` is the + explicit pre-warm path (parent pushes after init() to amortise the + first H2D upload). """ import traceback as _tb # noqa: PLC0415 @@ -289,20 +303,35 @@ def _chip_process_loop( mailbox_addr = ctypes.addressof(ctypes.c_char.from_buffer(buf)) state_addr = mailbox_addr + _OFF_STATE - args_ptr = mailbox_addr + _OFF_ARGS sys.stderr.write(f"[chip_process pid={os.getpid()} dev={device_id}] ready\n") sys.stderr.flush() + # Per-child set of cids already prepared on this device. The parent + # pre-warms via _CTRL_PREPARE, but TASK_READY also lazy-prepares as a + # safety net (e.g. registrations that bypassed the prefetch path). + prepared: set[int] = set() + + def _ensure_prepared(cid: int) -> None: + if cid in prepared: + return + callable_obj = registry.get(cid) + if callable_obj is None: + raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry") + cw.prepare_callable(cid, callable_obj) + prepared.add(cid) + while True: state = _mailbox_load_i32(state_addr) if state == _TASK_READY: - callable_ptr = struct.unpack_from("Q", buf, _OFF_CALLABLE)[0] + cid = int(struct.unpack_from("Q", buf, _OFF_CALLABLE)[0]) & 0xFFFFFFFF cfg = _read_config_from_mailbox(buf) code = 0 msg = "" try: - cw.run_from_blob(callable_ptr, args_ptr, cfg) + _ensure_prepared(cid) + args = _read_args_from_mailbox(buf) + cw.run_prepared(cid, args, cfg) except Exception as e: # noqa: BLE001 code = 1 msg = _format_exc(f"chip_process dev={device_id}", e) @@ -330,6 +359,9 @@ def _chip_process_loop( src = struct.unpack_from("Q", buf, _CTRL_OFF_ARG1)[0] n = struct.unpack_from("Q", buf, _CTRL_OFF_ARG2)[0] cw.copy_from(dst, src, n) + elif sub_cmd == _CTRL_PREPARE: + cid = int(struct.unpack_from("Q", buf, _CTRL_OFF_ARG0)[0]) & 0xFFFFFFFF + _ensure_prepared(cid) except Exception as e: # noqa: BLE001 code = 1 msg = _format_exc(f"chip_process dev={device_id} ctrl={int(sub_cmd)}", e) @@ -347,6 +379,7 @@ def _chip_process_loop_with_bootstrap( # noqa: PLR0912 bootstrap_cfg: ChipBootstrapConfig, bootstrap_mailbox_addr: int, max_buffer_count: int, + registry: dict, log_level: int = 1, log_info_v: int = 5, ) -> None: @@ -395,21 +428,35 @@ def _chip_process_loop_with_bootstrap( # noqa: PLR0912 mailbox_addr = ctypes.addressof(ctypes.c_char.from_buffer(buf)) state_addr = mailbox_addr + _OFF_STATE - args_ptr = mailbox_addr + _OFF_ARGS sys.stderr.write(f"[chip_process pid={os.getpid()} dev={device_id} bootstrap] ready\n") sys.stderr.flush() + # Per-child set of cids already prepared on this device (Stage 3, + # callable.md). Mirrors `_chip_process_loop`'s `prepared`. 
+ prepared: set[int] = set() + + def _ensure_prepared(cid: int) -> None: + if cid in prepared: + return + callable_obj = registry.get(cid) + if callable_obj is None: + raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry") + cw._impl.prepare_callable(cid, callable_obj) + prepared.add(cid) + try: while True: state = _mailbox_load_i32(state_addr) if state == _TASK_READY: - callable_ptr = struct.unpack_from("Q", buf, _OFF_CALLABLE)[0] + cid = int(struct.unpack_from("Q", buf, _OFF_CALLABLE)[0]) & 0xFFFFFFFF cfg = _read_config_from_mailbox(buf) code = 0 msg = "" try: - cw._impl.run_from_blob(callable_ptr, args_ptr, cfg) + _ensure_prepared(cid) + args = _read_args_from_mailbox(buf) + cw._impl.run_prepared(cid, args, cfg) except Exception as e: # noqa: BLE001 code = 1 msg = _format_exc(f"chip_process dev={device_id}", e) @@ -467,6 +514,9 @@ def _chip_process_loop_with_bootstrap( # noqa: PLR0912 src = struct.unpack_from("Q", buf, _CTRL_OFF_ARG1)[0] n = struct.unpack_from("Q", buf, _CTRL_OFF_ARG2)[0] cw._impl.copy_from(dst, src, n) + elif sub_cmd == _CTRL_PREPARE: + cid = int(struct.unpack_from("Q", buf, _CTRL_OFF_ARG0)[0]) & 0xFFFFFFFF + _ensure_prepared(cid) except Exception as e: # noqa: BLE001 code = 1 msg = _format_exc(f"chip_process dev={device_id} ctrl={int(sub_cmd)}", e) @@ -561,7 +611,7 @@ def __init__( ) -> None: self.level = level self._config = config - self._callable_registry: dict[int, Callable] = {} + self._callable_registry: dict[int, Any] = {} self._initialized = False # Level-2 internals @@ -601,14 +651,24 @@ def __init__( # Callable registration (before init) # ------------------------------------------------------------------ - def register(self, fn: Callable) -> int: - """Register a callable (sub or orch fn). Must be called before init().""" + def register(self, target) -> int: + """Register a callable. Must be called before init(). + + Stage 3 (callable.md): a unified id space serves both Python + functions (sub fn / orch fn) and ``ChipCallable`` instances. + Returns the cid the orch function must pass to + ``orch.submit_next_level(cid, …)`` / ``orch.submit_sub(cid, …)``. + + ChipCallables are pre-warmed on every chip child during ``init()`` + via the ``_CTRL_PREPARE`` mailbox command so the first + ``submit_next_level`` does not pay the H2D upload cost. + """ if self.level < 3: raise RuntimeError("Worker.register() is only available at level 3+") if self._initialized: raise RuntimeError("Worker.register() must be called before init()") cid = len(self._callable_registry) - self._callable_registry[cid] = fn + self._callable_registry[cid] = target return cid def add_worker(self, worker: "Worker") -> None: @@ -778,6 +838,7 @@ def _start_hierarchical(self) -> None: # noqa: PLR0912 -- three parallel fork l bootstrap_cfg, bootstrap_addr, max_buffer_count, + registry, chip_log_level, chip_log_info_v, ) @@ -786,6 +847,7 @@ def _start_hierarchical(self) -> None: # noqa: PLR0912 -- three parallel fork l buf, self._l3_bins, dev_id, + registry, chip_log_level, chip_log_info_v, ) @@ -846,6 +908,17 @@ def _start_hierarchical(self) -> None: # noqa: PLR0912 -- three parallel fork l self._orch = Orchestrator(dw.get_orchestrator()) + # Pre-warm every chip child: for each registered ChipCallable cid, + # send `_CTRL_PREPARE` to all chip children so the first + # `submit_next_level` does not pay the H2D upload cost (callable.md + # §3.3). Sub fns / orch fns do not need pre-warming — the + # registry is already COW-inherited. 
+ if device_ids: + for cid, target in self._callable_registry.items(): + if isinstance(target, ChipCallable): + for worker_id in range(len(self._chip_shms)): + self._chip_control(worker_id, _CTRL_PREPARE, arg0=cid) + # ------------------------------------------------------------------ # Bootstrap plumbing # ------------------------------------------------------------------ diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index 54c6519ef..1a7ab3870 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -1057,6 +1057,11 @@ def test_run(self, st_platform, st_worker, request): cls_name = type(self).__name__ callable_obj = self.build_callable(st_platform) sub_ids = getattr(type(self), "_st_sub_ids", {}) + # Stage 3 (callable.md): for L3, use pre-registered chip cids + # instead of raw ChipCallable objects. + chip_cids = getattr(type(self), "_st_chip_cids", {}) + if self._st_level == 3 and chip_cids: + callable_obj = {**chip_cids} # Primary device id: prefer the one actually allocated by st_worker # (each test class can hold a different slot from DevicePool); fall back @@ -1319,12 +1324,19 @@ def run_module(module_name): # noqa: PLR0912, PLR0915 -- CLI parsing + dispatch ok = True for (runtime, level), group in by_rt_level.items(): print(f"\n=== Runtime: {runtime} Level: {level} ===") - worker, per_class_sub_ids = _create_standalone_worker(group, level, args, selected_by_cls) + worker, per_class_sub_ids, per_class_chip_cids = _create_standalone_worker( + group, level, args, selected_by_cls + ) try: for cls in group: inst = cls() callable_obj = inst.build_callable(args.platform) sub_ids = per_class_sub_ids.get(cls, {}) + chip_cids = per_class_chip_cids.get(cls, {}) + # For L3: merge chip cids into callable_obj (replacing + # ChipCallable objects with their registered cid). + if level == 3 and chip_cids: + callable_obj = {**chip_cids} for case in selected_by_cls[cls]: label = f"{cls.__name__}::{case['name']}" print(f" {label} ... ", end="", flush=True) @@ -1590,12 +1602,24 @@ def _create_standalone_worker(group, level, args, selected_by_cls): ) # Register sub callables per-class to avoid name collisions per_class_sub_ids: dict[type, dict] = {} + # Stage 3 (callable.md): also register ChipCallables here (before init) + # so the chip children pre-warm them via _CTRL_PREPARE. 
+    per_class_chip_cids: dict[type, dict] = {}
     for cls in group:
         cls_sub_ids = {}
+        cls_chip_cids = {}
         for entry in cls.CALLABLE.get("callables", []):
             if "callable" in entry:
                 cid = worker.register(entry["callable"])
                 cls_sub_ids[entry["name"]] = cid
+            elif "orchestration" in entry:
+                name = entry["name"]
+                cache_key = (cls.__qualname__, name, args.platform, cls._st_runtime)
+                chip = _compile_chip_callable_from_spec(entry, args.platform, cls._st_runtime, cache_key)
+                cid = worker.register(chip)
+                cls_chip_cids[name] = cid
+                cls_chip_cids[f"{name}_sig"] = entry["orchestration"].get("signature", [])
         per_class_sub_ids[cls] = cls_sub_ids
+        per_class_chip_cids[cls] = cls_chip_cids
     worker.init()
-    return worker, per_class_sub_ids
+    return worker, per_class_sub_ids, per_class_chip_cids
diff --git a/src/common/hierarchical/orchestrator.cpp b/src/common/hierarchical/orchestrator.cpp
index c5912a5b9..5a6e710f9 100644
--- a/src/common/hierarchical/orchestrator.cpp
+++ b/src/common/hierarchical/orchestrator.cpp
@@ -137,25 +137,25 @@ ContinuousTensor Orchestrator::alloc(const std::vector &shape, DataType
 // =============================================================================
 
 SubmitResult
-Orchestrator::submit_next_level(uint64_t callable, const TaskArgs &args, const CallConfig &config, int8_t worker) {
+Orchestrator::submit_next_level(int32_t callable_id, const TaskArgs &args, const CallConfig &config, int8_t worker) {
     std::vector<int8_t> affinities;
     if (worker >= 0) affinities = {worker};
-    return submit_impl(WorkerType::NEXT_LEVEL, callable, /*callable_id=*/-1, config, {args}, std::move(affinities));
+    return submit_impl(WorkerType::NEXT_LEVEL, callable_id, config, {args}, std::move(affinities));
 }
 
 SubmitResult Orchestrator::submit_next_level_group(
-    uint64_t callable, const std::vector<TaskArgs> &args_list, const CallConfig &config,
+    int32_t callable_id, const std::vector<TaskArgs> &args_list, const CallConfig &config,
     const std::vector<int8_t> &workers
 ) {
-    return submit_impl(WorkerType::NEXT_LEVEL, callable, /*callable_id=*/-1, config, args_list, workers);
+    return submit_impl(WorkerType::NEXT_LEVEL, callable_id, config, args_list, workers);
 }
 
 SubmitResult Orchestrator::submit_sub(int32_t callable_id, const TaskArgs &args) {
-    return submit_impl(WorkerType::SUB, /*callable_ptr=*/0, callable_id, CallConfig{}, {args});
+    return submit_impl(WorkerType::SUB, callable_id, CallConfig{}, {args});
 }
 
 SubmitResult Orchestrator::submit_sub_group(int32_t callable_id, const std::vector<TaskArgs> &args_list) {
-    return submit_impl(WorkerType::SUB, /*callable_ptr=*/0, callable_id, CallConfig{}, args_list);
+    return submit_impl(WorkerType::SUB, callable_id, CallConfig{}, args_list);
 }
 
 // =============================================================================
@@ -163,8 +163,8 @@ SubmitResult Orchestrator::submit_sub_group(int32_t callable_id, const std::vector<TaskArgs> &args_list) {
 // =============================================================================
 
 SubmitResult Orchestrator::submit_impl(
-    WorkerType worker_type, uint64_t callable_ptr, int32_t callable_id, const CallConfig &config,
-    std::vector<TaskArgs> args_list, std::vector<int8_t> affinities
+    WorkerType worker_type, int32_t callable_id, const CallConfig &config, std::vector<TaskArgs> args_list,
+    std::vector<int8_t> affinities
 ) {
     if (args_list.empty()) throw std::invalid_argument("Orchestrator: args_list must not be empty");
     config.validate();
@@ -198,7 +198,6 @@ SubmitResult Orchestrator::submit_impl(
 
     s.reset();
     s.worker_type = worker_type;
-    s.callable = callable_ptr;
     s.callable_id = callable_id;
     s.config = config;
 
diff --git a/src/common/hierarchical/orchestrator.h b/src/common/hierarchical/orchestrator.h
index b6880d3c1..a156a9000 100644
--- a/src/common/hierarchical/orchestrator.h
+++ b/src/common/hierarchical/orchestrator.h
@@ -92,18 +92,19 @@ class Orchestrator {
     void copy_to(int worker_id, uint64_t dst, uint64_t src, size_t size);
     void copy_from(int worker_id, uint64_t dst, uint64_t src, size_t size);
 
-    // Submit a NEXT_LEVEL task. `callable` is the chip callable buffer pointer
-    // (uint64_t handle from Python — typically ChipCallable.buffer_ptr()).
-    // Tags inside `args` drive dependency inference; OUTPUT tensors with null
-    // data are auto-allocated from the HeapRing.
+    // Submit a NEXT_LEVEL task. `callable_id` is a cid registered via
+    // Worker.register() (Stage 3, callable.md): the chip child looks it up
+    // in its COW-inherited Python registry to get the actual ChipCallable.
+    // Tags inside `args` drive dependency inference; OUTPUT tensors with
+    // null data are auto-allocated from the HeapRing.
     // `worker`: logical worker id for affinity (-1 = unconstrained).
     SubmitResult
-    submit_next_level(uint64_t callable, const TaskArgs &args, const CallConfig &config, int8_t worker = -1);
+    submit_next_level(int32_t callable_id, const TaskArgs &args, const CallConfig &config, int8_t worker = -1);
 
     // Submit a group of NEXT_LEVEL tasks: N args -> N workers, 1 DAG node.
     // `workers`: per-args affinity (empty = all unconstrained).
     SubmitResult submit_next_level_group(
-        uint64_t callable, const std::vector<TaskArgs> &args_list, const CallConfig &config,
+        int32_t callable_id, const std::vector<TaskArgs> &args_list, const CallConfig &config,
         const std::vector<int8_t> &workers = {}
     );
 
@@ -178,8 +179,8 @@ class Orchestrator {
     // Shared submit machinery. Takes `args_list` by value so the Orchestrator
     // can patch `tensor.data` on OUTPUT tensors flagged for auto-allocation.
     SubmitResult submit_impl(
-        WorkerType worker_type, uint64_t callable_ptr, int32_t callable_id, const CallConfig &config,
-        std::vector<TaskArgs> args_list, std::vector<int8_t> affinities = {}
+        WorkerType worker_type, int32_t callable_id, const CallConfig &config, std::vector<TaskArgs> args_list,
+        std::vector<int8_t> affinities = {}
     );
 
     // Size, in aligned bytes, an OUTPUT tensor should occupy in the HeapRing.
diff --git a/src/common/hierarchical/types.cpp b/src/common/hierarchical/types.cpp
index e04f883f9..882a630c6 100644
--- a/src/common/hierarchical/types.cpp
+++ b/src/common/hierarchical/types.cpp
@@ -28,7 +28,6 @@ void TaskSlotState::reset() {
     output_keys.clear();
     fanin_producers.clear();
     worker_type = WorkerType::NEXT_LEVEL;
-    callable = 0;
     callable_id = -1;
     config = CallConfig{};
     task_args.clear();
diff --git a/src/common/hierarchical/types.h b/src/common/hierarchical/types.h
index dbd91659e..33d24cedc 100644
--- a/src/common/hierarchical/types.h
+++ b/src/common/hierarchical/types.h
@@ -145,9 +145,12 @@ struct TaskSlotState {
 
     // --- Task data (stored on parent heap, lives until slot CONSUMED) ---
     WorkerType worker_type{WorkerType::NEXT_LEVEL};
-    uint64_t callable{0};    // NEXT_LEVEL: ChipCallable buffer ptr; SUB: unused
-    int32_t callable_id{-1}; // SUB: registered callable id
-    CallConfig config{};     // NEXT_LEVEL config (block_dim, aicpu_thread_num, diagnostics sub-features)
+    // Unified callable id: NEXT_LEVEL chip callables and SUB fns share the
+    // same Worker.register() id space (Stage 3, callable.md). The mailbox
+    // wire format writes this as a uint64 with the cid in the low 32 bits;
+    // dispatch_process reads it identically for both worker types.
+    int32_t callable_id{-1};
+    CallConfig config{}; // NEXT_LEVEL config (block_dim, aicpu_thread_num, diagnostics sub-features)
 
     // Unified task-args storage: `task_args` is the single-task builder;
     // when `is_group_` is true, `task_args_list` carries one TaskArgs per
diff --git a/src/common/hierarchical/worker_manager.cpp b/src/common/hierarchical/worker_manager.cpp
index cb2f31b6e..2d0c40017 100644
--- a/src/common/hierarchical/worker_manager.cpp
+++ b/src/common/hierarchical/worker_manager.cpp
@@ -139,7 +139,7 @@ void WorkerThread::loop() {
 }
 
 void WorkerThread::dispatch_process(TaskSlotState &s, int32_t group_index) {
-    uint64_t callable = (s.worker_type == WorkerType::SUB) ? static_cast<uint64_t>(s.callable_id) : s.callable;
+    uint64_t callable = static_cast<uint64_t>(static_cast<uint32_t>(s.callable_id));
     TaskArgsView view = s.args_view(group_index);
 
     // Hold mailbox_mu_ for the entire round trip (write payload + state +
diff --git a/tests/ut/cpp/hierarchical/test_orchestrator.cpp b/tests/ut/cpp/hierarchical/test_orchestrator.cpp
index 7c0d45978..59371c6da 100644
--- a/tests/ut/cpp/hierarchical/test_orchestrator.cpp
+++ b/tests/ut/cpp/hierarchical/test_orchestrator.cpp
@@ -70,7 +70,7 @@ struct OrchestratorFixture : public ::testing::Test {
 
 TEST_F(OrchestratorFixture, IndependentTaskIsImmediatelyReady) {
     auto a = single_tensor_args(0xCAFE, TensorArgType::OUTPUT);
-    auto res = orch.submit_next_level(/*callable=*/0xDEAD, a, cfg);
+    auto res = orch.submit_next_level(/*callable_id=*/42, a, cfg);
     EXPECT_NE(res.task_slot, INVALID_SLOT);
 
     TaskSlot slot;
@@ -82,13 +82,13 @@ TEST_F(OrchestratorFixture, IndependentTaskIsImmediatelyReady) {
 TEST_F(OrchestratorFixture, DependentTaskIsPending) {
     // Task A produces an OUTPUT at key 0xBEEF
     auto args_a = single_tensor_args(0xBEEF, TensorArgType::OUTPUT);
-    auto a = orch.submit_next_level(0xDEAD, args_a, cfg);
+    auto a = orch.submit_next_level(42, args_a, cfg);
     TaskSlot a_slot;
     rq.try_pop(a_slot);
 
     // Task B reads INPUT at the same key -- depends on A
     auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT);
-    auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
+    auto b = orch.submit_next_level(42, args_b, cfg);
 
     EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING);
     EXPECT_EQ(S(b.task_slot).fanin_count, 1);
@@ -98,7 +98,7 @@ TEST_F(OrchestratorFixture, DependentTaskIsPending) {
 
 TEST_F(OrchestratorFixture, TensorMapTracksProducer) {
     auto args_a = single_tensor_args(0x1234, TensorArgType::OUTPUT);
-    auto a = orch.submit_next_level(0xDEAD, args_a, cfg);
+    auto a = orch.submit_next_level(42, args_a, cfg);
     TaskSlot drain_slot;
     rq.try_pop(drain_slot);
 
@@ -107,7 +107,7 @@ TEST_F(OrchestratorFixture, TensorMapTracksProducer) {
 
 TEST_F(OrchestratorFixture, OnConsumedCleansUpTensorMap) {
     auto args_a = single_tensor_args(0x42, TensorArgType::OUTPUT);
-    auto a = orch.submit_next_level(0xDEAD, args_a, cfg);
+    auto a = orch.submit_next_level(42, args_a, cfg);
 
     TaskSlot slot;
     rq.try_pop(slot);
@@ -123,7 +123,7 @@ TEST_F(OrchestratorFixture, OnConsumedCleansUpTensorMap) {
 TEST_F(OrchestratorFixture, ScopeRegistersAndReleasesRef) {
     orch.scope_begin();
     auto args_a = single_tensor_args(0x77, TensorArgType::OUTPUT);
-    auto a = orch.submit_next_level(0xDEAD, args_a, cfg);
+    auto a = orch.submit_next_level(42, args_a, cfg);
 
     TaskSlot slot;
     rq.try_pop(slot);
@@ -147,13 +147,13 @@ TEST_F(OrchestratorFixture, ScopeRegistersAndReleasesRef) {
 TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) {
     // OUTPUT-tagged input registers a producer
     auto args_a = single_tensor_args(0xAAAA, TensorArgType::OUTPUT);
-    auto a = orch.submit_next_level(0xDEAD, args_a, cfg);
+    auto a = orch.submit_next_level(42, args_a, cfg);
     TaskSlot drain_slot;
     rq.try_pop(drain_slot);
 
     // Second task references same key but tagged NO_DEP -- should be independent
     auto args_b = single_tensor_args(0xAAAA, TensorArgType::NO_DEP);
-    auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
+    auto b = orch.submit_next_level(42, args_b, cfg);
     EXPECT_EQ(S(b.task_slot).state.load(), TaskState::READY);
     EXPECT_EQ(S(b.task_slot).fanin_count, 0);
 }
@@ -161,7 +161,7 @@ TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) {
 TEST_F(OrchestratorFixture, GroupTaskStoresArgsListPerMember) {
     TaskArgs a0 = single_tensor_args(0xA0, TensorArgType::OUTPUT);
     TaskArgs a1 = single_tensor_args(0xA1, TensorArgType::OUTPUT);
-    auto res = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg);
+    auto res = orch.submit_next_level_group(42, {a0, a1}, cfg);
     EXPECT_NE(res.task_slot, INVALID_SLOT);
 
     EXPECT_TRUE(S(res.task_slot).is_group());
@@ -179,7 +179,7 @@ TEST_F(OrchestratorFixture, GroupTaskStoresArgsListPerMember) {
 
 TEST_F(OrchestratorFixture, SingleTaskStoresTaskArgsDirectly) {
     TaskArgs a0 = single_tensor_args(0xC0, TensorArgType::OUTPUT);
-    auto res = orch.submit_next_level(0xDEAD, a0, cfg);
+    auto res = orch.submit_next_level(42, a0, cfg);
     ASSERT_NE(res.task_slot, INVALID_SLOT);
     EXPECT_FALSE(S(res.task_slot).is_group());
     EXPECT_EQ(S(res.task_slot).group_size(), 1);
@@ -200,7 +200,7 @@ TEST_F(OrchestratorFixture, OutputAutoAllocsFromHeapRing) {
     t.dtype = DataType::UINT8;
     args.add_tensor(t, TensorArgType::OUTPUT);
 
-    auto res = orch.submit_next_level(0xDEAD, args, cfg);
+    auto res = orch.submit_next_level(42, args, cfg);
     ASSERT_NE(res.task_slot, INVALID_SLOT);
 
     uint64_t data = S(res.task_slot).task_args.tensor(0).data;
@@ -220,7 +220,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) {
     // the alloc-slot (so its HeapRing slab stays live while they write)
     // must tag the buffer INOUT.
auto creator_args = single_tensor_args(0xFEED, TensorArgType::OUTPUT); - auto creator = orch.submit_next_level(0xDEAD, creator_args, cfg); + auto creator = orch.submit_next_level(42, creator_args, cfg); TaskSlot drain; rq.try_pop(drain); // Mark the creator COMPLETED so the new submit mimics the alloc-slot @@ -228,7 +228,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) { S(creator.task_slot).state.store(TaskState::COMPLETED, std::memory_order_relaxed); auto writer_args = single_tensor_args(0xFEED, TensorArgType::INOUT); - auto writer = orch.submit_next_level(0xDEAD, writer_args, cfg); + auto writer = orch.submit_next_level(42, writer_args, cfg); TaskSlot writer_slot; rq.try_pop(writer_slot); @@ -259,13 +259,13 @@ TEST_F(OrchestratorFixture, OutputAndOutputExistingAreInsertOnly) { }; for (Case c : {Case{0xABCD, TensorArgType::OUTPUT}, Case{0xBEEF, TensorArgType::OUTPUT_EXISTING}}) { auto prior_args = single_tensor_args(c.key, TensorArgType::OUTPUT); - auto prior = orch.submit_next_level(0xDEAD, prior_args, cfg); + auto prior = orch.submit_next_level(42, prior_args, cfg); TaskSlot drain; rq.try_pop(drain); S(prior.task_slot).state.store(TaskState::COMPLETED, std::memory_order_relaxed); auto writer_args = single_tensor_args(c.key, c.tag); - auto writer = orch.submit_next_level(0xDEAD, writer_args, cfg); + auto writer = orch.submit_next_level(42, writer_args, cfg); EXPECT_EQ(tm.lookup(TensorKey{c.key, -1}), writer.task_slot); EXPECT_EQ(S(writer.task_slot).fanin_count, 0); diff --git a/tests/ut/cpp/hierarchical/test_scheduler.cpp b/tests/ut/cpp/hierarchical/test_scheduler.cpp index a66dcfd27..2fc7ba8c1 100644 --- a/tests/ut/cpp/hierarchical/test_scheduler.cpp +++ b/tests/ut/cpp/hierarchical/test_scheduler.cpp @@ -267,13 +267,13 @@ struct SchedulerFixture : public ::testing::Test { TEST_F(SchedulerFixture, IndependentTaskDispatchedAndConsumed) { auto args_a = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); - auto res = orch.submit_next_level(0xDEAD, args_a, cfg); + auto res = orch.submit_next_level(42, args_a, cfg); TaskSlot slot = res.task_slot; mock_worker.wait_running(); ASSERT_GE(mock_worker.dispatched_count(), 1); EXPECT_EQ(mock_worker.dispatched[0].tensor_key, 0xCAFEu); - EXPECT_EQ(mock_worker.dispatched[0].callable, 0xDEADu); + EXPECT_EQ(mock_worker.dispatched[0].callable, 42u); mock_worker.complete(); wait_consumed(slot); @@ -281,14 +281,14 @@ TEST_F(SchedulerFixture, IndependentTaskDispatchedAndConsumed) { TEST_F(SchedulerFixture, DependentTaskDispatchedAfterProducerCompletes) { auto args_a = single_tensor_args(0xBEEF, TensorArgType::OUTPUT); - auto a = orch.submit_next_level(0xAA, args_a, cfg); + auto a = orch.submit_next_level(10, args_a, cfg); auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT); - auto b = orch.submit_next_level(0xBB, args_b, cfg); + auto b = orch.submit_next_level(11, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING); mock_worker.wait_running(); - EXPECT_EQ(mock_worker.dispatched[0].callable, 0xAAu); + EXPECT_EQ(mock_worker.dispatched[0].callable, 10u); mock_worker.complete(); // A done auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(300); @@ -296,7 +296,7 @@ TEST_F(SchedulerFixture, DependentTaskDispatchedAfterProducerCompletes) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } ASSERT_GE(mock_worker.dispatched_count(), 2); - EXPECT_EQ(mock_worker.dispatched[1].callable, 0xBBu); + EXPECT_EQ(mock_worker.dispatched[1].callable, 11u); mock_worker.complete(); // B done 
     wait_consumed(b.task_slot);
@@ -375,7 +375,7 @@ TEST_F(GroupSchedulerFixture, GroupDispatchesToNWorkers) {
     TaskArgs a0 = single_tensor_args(0xA0, TensorArgType::OUTPUT);
     TaskArgs a1 = single_tensor_args(0xA1, TensorArgType::OUTPUT);
 
-    auto res = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg);
+    auto res = orch.submit_next_level_group(42, {a0, a1}, cfg);
     TaskSlot slot = res.task_slot;
 
     worker_a.wait_running();
@@ -400,7 +400,7 @@ TEST_F(GroupSchedulerFixture, GroupDispatchesToNWorkers) {
 
 TEST_F(GroupSchedulerFixture, GroupCompletesOnlyWhenAllDone) {
     TaskArgs a0 = single_tensor_args(0xB0, TensorArgType::OUTPUT);
     TaskArgs a1 = single_tensor_args(0xB1, TensorArgType::OUTPUT);
-    auto res = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg);
+    auto res = orch.submit_next_level_group(42, {a0, a1}, cfg);
     TaskSlot slot = res.task_slot;
 
     worker_a.wait_running();
@@ -491,7 +491,7 @@ TEST_F(MixedTypeSchedulerFixture, SubTaskDispatchesWhileNextLevelPoolSaturated) {
     // Submit a next-level task; the only chip worker begins running it and
     // stays blocked until we call complete() on it.
     auto chip_args = single_tensor_args(0xAAA, TensorArgType::OUTPUT);
-    auto chip = orch.submit_next_level(0xCDCD, chip_args, cfg);
+    auto chip = orch.submit_next_level(20, chip_args, cfg);
 
     next_level_worker.wait_running();
     ASSERT_TRUE(next_level_worker.is_running.load());
@@ -522,10 +522,10 @@ TEST_F(GroupSchedulerFixture, GroupDependencyChain) {
     // Task B reads INPUT at the same key -- depends on group A.
     TaskArgs a0 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT);
     TaskArgs a1 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT);
-    auto a = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg);
+    auto a = orch.submit_next_level_group(42, {a0, a1}, cfg);
 
     auto args_b = single_tensor_args(0xCAFE, TensorArgType::INPUT);
-    auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
+    auto b = orch.submit_next_level(42, args_b, cfg);
     EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING);
 
     worker_a.wait_running();

From da1a0eb4c933a64dfeff66f1036d4e3ab8066fbc Mon Sep 17 00:00:00 2001
From: poursoul
Date: Wed, 6 May 2026 11:26:02 +0800
Subject: [PATCH 06/28] =?UTF-8?q?feat(callable):=20Stage=204=20=E2=80=94?=
 =?UTF-8?q?=20unify=20L2=20API=20on=20register=20+=20run(cid)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Promote the Stage 3 cid contract to the L2 entry point so every level
of the hierarchy speaks the same dispatch surface.

Worker (level=2):
- register() now also accepts ChipCallable; returns a cid from the
  unified id space (callable.md §3.4). May be called either before or
  after init() — L2 has no fork/COW constraint. Pre-init registrations
  are batched and prepared at the end of init(); post-init
  registrations prepare on the device immediately.
- run(cid, args, cfg) routes through _chip_worker.run_prepared.
- _l2_use_prepared probe: when the bound runtime variant lacks
  prepare_callable support (the host_build_graph / aicpu_build_graph
  stubs return -1 — see Stage 2), the first prepare attempt flips the
  flag and every subsequent run() silently falls back to the legacy
  _chip_worker.run lower-level binding.

Rollback knob:
- PTO2_DISABLE_PREPARED_CALLABLE=1 forces L2 onto the legacy
  lower-level binding (skips prepare at init, resolves cid back to its
  ChipCallable at run time).

L3+ paths are unaffected — the cid mailbox protocol has no legacy
fallback.
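As a sketch, the resulting L2 call pattern (variable names are
illustrative; the calls mirror the scene_test.py and later ST
migrations in this series):

    worker = Worker(level=2, platform=platform, runtime=RUNTIME, device_id=0)
    cid = worker.register(chip_callable)  # ChipCallable now accepted at L2
    worker.init()                         # pre-warms cid unless disabled
    worker.run(cid, chip_args, config)    # -> _chip_worker.run_prepared
    worker.close()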
scene_test.py: - _run_and_validate_l2 now register()s the compiled ChipCallable once per class (cached via _st_l2_cid) and calls Worker.run(cid, …). Verified: Python UT 80/80 (15 chip + 65 worker), a2a3sim L2 host_build_graph 4/4 (auto fallback), aicpu_build_graph 3/3, trb spmd_sync_start (with and without PTO2_DISABLE_PREPARED_CALLABLE=1), prepared_callable e2e 1/1. --- python/simpler/worker.py | 115 +++++++++++++++++++++++++++++++----- simpler_setup/scene_test.py | 12 +++- 2 files changed, 111 insertions(+), 16 deletions(-) diff --git a/python/simpler/worker.py b/python/simpler/worker.py index af1243414..c794d9ad9 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -87,6 +87,17 @@ def my_l4_orch(orch, args, config): _BOOTSTRAP_WAIT_TIMEOUT_S = 120.0 _BOOTSTRAP_POLL_INTERVAL_S = 0.001 +# Stage 4 (callable.md): rollback knob. When set to "1" the L2 path skips +# `prepare_callable` at init() and `Worker.run` falls back to the legacy +# `_chip_worker.run(callable, args, cfg)` lower-level binding. L3+ paths +# are unaffected — the cid mailbox protocol does not have a legacy fallback. +_PREPARED_CALLABLE_DISABLED_ENV = "PTO2_DISABLE_PREPARED_CALLABLE" + + +def _prepared_callable_disabled() -> bool: + return os.environ.get(_PREPARED_CALLABLE_DISABLED_ENV, "") == "1" + + # --------------------------------------------------------------------------- # Unified mailbox layout (must match worker_manager.h MAILBOX_OFF_*) # --------------------------------------------------------------------------- @@ -616,6 +627,12 @@ def __init__( # Level-2 internals self._chip_worker: Optional[ChipWorker] = None + # Stage 4 (callable.md): when the bound runtime variant supports + # prepare_callable, L2 routes Worker.run(cid) through run_prepared; + # otherwise the cid is resolved to its ChipCallable and the legacy + # lower-level binding handles dispatch. Decided lazily in + # `_init_level2` (and toggled by `PTO2_DISABLE_PREPARED_CALLABLE`). + self._l2_use_prepared = False # Level-3+ internals self._worker: Optional[_Worker] = None @@ -651,24 +668,61 @@ def __init__( # Callable registration (before init) # ------------------------------------------------------------------ + def _l2_prepare(self, cid: int, target) -> bool: + """Try to pre-warm a ChipCallable on the L2 device. + + Returns True on success. When the bound runtime variant lacks + prepare_callable support (e.g. host_build_graph / aicpu_build_graph, + which return -1 from the C ABI stub), flips ``_l2_use_prepared`` + off and returns False so the caller stops trying further cids and + Worker.run falls back to the legacy lower-level binding. + """ + assert self._chip_worker is not None + try: + self._chip_worker.prepare_callable(cid, target) + return True + except RuntimeError: + self._l2_use_prepared = False + return False + def register(self, target) -> int: - """Register a callable. Must be called before init(). + """Register a callable. Returns the cid passed to ``run`` / ``submit_*``. - Stage 3 (callable.md): a unified id space serves both Python - functions (sub fn / orch fn) and ``ChipCallable`` instances. - Returns the cid the orch function must pass to + Stage 4 (callable.md): a unified id space serves Python functions + (sub fn / orch fn) and ``ChipCallable`` instances at every level. + L2 returns a cid the user passes to ``Worker.run(cid, args, cfg)``; + L3+ returns a cid the orch function passes to ``orch.submit_next_level(cid, …)`` / ``orch.submit_sub(cid, …)``. 
- ChipCallables are pre-warmed on every chip child during ``init()`` - via the ``_CTRL_PREPARE`` mailbox command so the first - ``submit_next_level`` does not pay the H2D upload cost. + Timing constraints: + - L3+: must be called **before** ``init()`` so the COW-inherited + registry is visible to forked chip / sub children. ChipCallables + are pre-warmed by pushing ``_CTRL_PREPARE`` to every chip child + during ``init()``. + - L2: may be called either before or after ``init()`` (no fork, + no COW constraint). When called post-init, ChipCallables are + prepared on the device immediately; pre-init registrations are + batched and prepared at the end of ``init()``. + + Both pre-warm steps are skipped under + ``PTO2_DISABLE_PREPARED_CALLABLE=1``; ``Worker.run`` then + falls back to the legacy lower-level ``_chip_worker.run`` binding. """ - if self.level < 3: - raise RuntimeError("Worker.register() is only available at level 3+") - if self._initialized: - raise RuntimeError("Worker.register() must be called before init()") + if self.level >= 3 and self._initialized: + raise RuntimeError( + "Worker.register() at level >= 3 must be called before init() " + "(forked children inherit the registry via COW)" + ) cid = len(self._callable_registry) self._callable_registry[cid] = target + + # L2 post-init: pre-warm immediately so the very first + # `Worker.run(cid, …)` is a clean cache hit. When the runtime + # does not support prepare_callable (Stage 2 stub variants), + # `_l2_prepare` flips `_l2_use_prepared` off and `Worker.run` + # silently falls back to the legacy binding. + if self.level == 2 and self._initialized and isinstance(target, ChipCallable) and self._l2_use_prepared: + self._l2_prepare(cid, target) return cid def add_worker(self, worker: "Worker") -> None: @@ -725,6 +779,20 @@ def _init_level2(self) -> None: self._chip_worker.init(binaries) self._chip_worker.set_device(device_id) + # Stage 4 (callable.md): pre-warm any registered ChipCallable so the + # first run(cid, …) does not pay the H2D upload cost. Skipped under + # the rollback env var so the legacy `_chip_worker.run(callable, …)` + # path stays viable for emergency triage. The flag also flips when + # the bound runtime variant lacks prepare_callable support + # (host_build_graph / aicpu_build_graph still return -1 — see Stage 2): + # the first prepare attempt sets `_l2_use_prepared = False` and every + # subsequent run() goes through the legacy lower-level binding. + self._l2_use_prepared = not _prepared_callable_disabled() + if self._l2_use_prepared: + for cid, target in self._callable_registry.items(): + if isinstance(target, ChipCallable) and not self._l2_prepare(cid, target): + break + def _init_hierarchical(self) -> None: device_ids = self._config.get("device_ids", []) n_sub = self._config.get("num_sub_workers", 0) @@ -1115,16 +1183,33 @@ def copy_from(self, dst: int, src: int, size: int, worker_id: int = 0) -> None: def run(self, callable, args=None, config=None) -> None: """Execute one task (L2) or one DAG (L3+) synchronously. - callable: ChipCallable (L2) or Python orch fn (L3+) - args: TaskArgs (optional) - config: CallConfig (optional, default-constructed if None) + Stage 4 (callable.md): + - L2: ``callable`` is a cid returned by ``Worker.register(chip_callable)``. + Routes to ``_chip_worker.run_prepared(cid, args, cfg)``. 
+ Under ``PTO2_DISABLE_PREPARED_CALLABLE=1`` the cid is resolved + back to the registered ``ChipCallable`` and the legacy + ``_chip_worker.run(callable, args, cfg)`` lower-level binding + is invoked instead. + - L3+: ``callable`` is a Python orch fn invoked with the + ``Orchestrator`` handle. + + ``args`` : TaskArgs (optional) + ``config``: CallConfig (optional, default-constructed if None) """ assert self._initialized, "Worker not initialized; call init() first" cfg = config if config is not None else CallConfig() if self.level == 2: assert self._chip_worker is not None - self._chip_worker.run(callable, args, cfg) + if not self._l2_use_prepared: + # Rollback / unsupported-runtime path: resolve cid back to + # its ChipCallable and call the legacy lower-level binding. + target = self._callable_registry.get(int(callable)) + if target is None: + raise KeyError(f"Worker.run: cid {int(callable)} not found in registry") + self._chip_worker.run(target, args, cfg) + else: + self._chip_worker.run_prepared(int(callable), args, cfg) else: self._start_hierarchical() assert self._orch is not None diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index 1a7ab3870..401d32dfe 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -918,6 +918,16 @@ def _run_and_validate_l2( config_dict = case.get("config", {}) orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + # Stage 4 (callable.md): the L2 entry point is now + # `Worker.run(cid, args, cfg)`. Reuse the cid registered by the + # st_worker fixture / standalone path. For first-time callers + # (worker reused across rounds), `_st_l2_cid` caches the cid so + # subsequent runs skip re-registration. + cid = getattr(type(self), "_st_l2_cid", None) + if cid is None: + cid = worker.register(callable_obj) + type(self)._st_l2_cid = cid + # Build args test_args = self.generate_args(params) chip_args, output_names = _build_chip_task_args(test_args, orch_sig) @@ -949,7 +959,7 @@ def _run_and_validate_l2( ) with _temporary_env(self._resolve_env()): - worker.run(callable_obj, chip_args, config=config) + worker.run(cid, chip_args, config=config) if not skip_golden: _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) From 67e12e7c898352e67b81abd1933e73adf8276e08 Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 6 May 2026 14:36:23 +0800 Subject: [PATCH 07/28] Add: aicpu_dlopen_count getter for callable registration verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose a monotonic counter of distinct callable_ids the AICPU has been asked to dlopen for, so tests can assert per-cid registration eliminates redundant dlopens across repeated runs (callable.md §7 verification). - DeviceRunner (a2a3 onboard + sim): track aicpu_dlopen_total_, bumped on first-sighting bind; not decremented by unregister so case D (unregister + re-prepare) reports +2 - C ABI: get_aicpu_dlopen_count exported by all 4 a2a3/a5 variants; a5 + non-trb a2a3 return 0 (no per-cid registration there) - ChipWorker / nanobind / Python wrappers: aicpu_dlopen_count property on _ChipWorker, ChipWorker, and Worker (L2-only; non-L2 returns 0) - tests/st prepared_callable: 4 new test methods asserting counter delta for same-cid repeat (1), two-cid interleaving (2), double prepare (RuntimeError), and unregister + re-prepare (2). Each test snapshots baseline on entry and unregisters on exit so the shared st_worker fixture stays clean between cases. 
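A sketch of the delta-assertion pattern the new tests follow (trimmed
from test_prepared_callable.py; the baseline snapshot matters because
the L2 st_worker fixture is shared across tests):

    baseline = worker.aicpu_dlopen_count
    worker.prepare_callable(0, chip_callable)
    for _ in range(5):
        worker.run_prepared(0, chip_args, config=config)
    # five runs of one cid must cost exactly one AICPU dlopen
    assert worker.aicpu_dlopen_count - baseline == 1
    worker.unregister_callable(0)  # frees the cid; the counter stays at +1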
--- python/bindings/task_interface.cpp | 7 ++ python/simpler/task_interface.py | 5 + python/simpler/worker.py | 12 ++ .../platform/onboard/host/device_runner.cpp | 4 + .../platform/onboard/host/device_runner.h | 17 +++ .../onboard/host/pto_runtime_c_api.cpp | 9 ++ src/a2a3/platform/sim/host/device_runner.cpp | 4 + src/a2a3/platform/sim/host/device_runner.h | 4 + .../platform/sim/host/pto_runtime_c_api.cpp | 9 ++ .../onboard/host/pto_runtime_c_api.cpp | 2 + .../platform/sim/host/pto_runtime_c_api.cpp | 2 + src/common/worker/chip_worker.cpp | 9 ++ src/common/worker/chip_worker.h | 9 ++ src/common/worker/pto_runtime_c_api.h | 8 ++ .../test_prepared_callable.py | 108 ++++++++++++++++++ 15 files changed, 209 insertions(+) diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 7355b6493..3c01dabcf 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -687,6 +687,13 @@ NB_MODULE(_task_interface, m) { .def_prop_ro("device_id", &ChipWorker::device_id) .def_prop_ro("initialized", &ChipWorker::initialized) .def_prop_ro("device_set", &ChipWorker::device_set) + .def_prop_ro( + "aicpu_dlopen_count", &ChipWorker::aicpu_dlopen_count, + "Number of distinct callable_ids the AICPU has dlopened for on the " + "bound device. Equals 0 when no device is set or the runtime " + "variant lacks per-cid registration. Tests assert this to verify " + "prepare_callable + repeated run_prepared do not redundantly dlopen." + ) .def("malloc", &ChipWorker::malloc, nb::arg("size")) .def("free", &ChipWorker::free, nb::arg("ptr")) .def("copy_to", &ChipWorker::copy_to, nb::arg("dst"), nb::arg("src"), nb::arg("size")) diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index 5e9830329..5124b4390 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -357,6 +357,11 @@ def unregister_callable(self, callable_id): """Drop prepared state for ``callable_id`` and release its orch SO share.""" self._impl.unregister_callable(int(callable_id)) + @property + def aicpu_dlopen_count(self): + """Number of distinct callable_ids the AICPU has dlopened for.""" + return self._impl.aicpu_dlopen_count + def malloc(self, size): """Allocate memory. Returns a pointer (uint64).""" return int(self._impl.malloc(int(size))) diff --git a/python/simpler/worker.py b/python/simpler/worker.py index c794d9ad9..4e13bf83d 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -1261,6 +1261,18 @@ def unregister_callable(self, callable_id: int) -> None: assert self._chip_worker is not None self._chip_worker.unregister_callable(callable_id) + @property + def aicpu_dlopen_count(self) -> int: + """L2 only: number of distinct callable_ids the AICPU has dlopened for. + + Used by tests to assert that ``register`` + repeated ``run(cid)`` calls + do not retrigger the AICPU dlopen for an already-seen cid. Returns 0 + on non-L2 workers (no per-cid registration there). + """ + if self.level != 2 or self._chip_worker is None: + return 0 + return self._chip_worker.aicpu_dlopen_count + def _run_as_child(self, cid: int, args, config) -> None: """Called from C++ _Worker::run when this Worker is a THREAD-mode child. 
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index a8e0a68a8..662f8d2b0 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -760,6 +760,9 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
     }
     const auto &state = it->second;
     const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second;
+    if (first_sighting) {
+        ++aicpu_dlopen_total_;
+    }
     runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size, first_sighting);
     // The c_api caller passed is_new=false; refresh with the authoritative
     // first_sighting flag before AICPU consumes register_new_callable_id_.
@@ -1014,6 +1017,7 @@ int DeviceRunner::finalize() {
     orch_so_dedup_.clear();
     prepared_callables_.clear();
     aicpu_seen_callable_ids_.clear();
+    aicpu_dlopen_total_ = 0;
 #endif // RUNTIME_HAS_CALLABLE_ID
 
     // Cleanup performance profiling
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index b592bbb7e..79f95d404 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -479,6 +479,18 @@ class DeviceRunner {
      * @return 0 on success, -1 if the cid is not registered.
      */
     int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id);
+
+    /**
+     * Number of distinct callable_ids the AICPU has been asked to dlopen for.
+     * Monotonically increases on every first-sighting bind; `unregister_callable`
+     * does NOT decrement it. So a `prepare → run → unregister → re-prepare → run`
+     * sequence reports 2 (each AICPU dlopen counted once), even though only one
+     * cid is currently registered. Tests assert this to verify per-cid
+     * registration eliminates duplicate dlopens across repeated runs.
+     */
+    size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; }
+#else // RUNTIME_HAS_CALLABLE_ID
+    size_t aicpu_dlopen_count() const { return 0; }
 #endif // RUNTIME_HAS_CALLABLE_ID
 
 private:
@@ -538,6 +550,11 @@ class DeviceRunner {
     std::unordered_map prepared_callables_;
     std::unordered_map orch_so_dedup_;
     std::unordered_set<int32_t> aicpu_seen_callable_ids_;
+    // Monotonic count of AICPU dlopens triggered (incremented on each
+    // first-sighting bind; never decremented). Diverges from
+    // aicpu_seen_callable_ids_.size() once any cid is unregistered and
+    // re-prepared. Exposed via aicpu_dlopen_count() for tests.
+    size_t aicpu_dlopen_total_{0};
     // Sticky flag: prepare_callable was called at least once. Distinguishes
     // legacy-path "kernel still cached at finalize" leaks from prepared-path
     // kernels that legitimately live until finalize.
diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
index 75c467f54..ce7709655 100644
--- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
@@ -472,4 +472,13 @@ int run_prepared(
 int unregister_callable(DeviceContextHandle, int32_t) { return 0; }
 #endif // RUNTIME_HAS_CALLABLE_ID
 
+size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) {
+    if (ctx == NULL) return 0;
+    try {
+        return static_cast<DeviceRunner *>(ctx)->aicpu_dlopen_count();
+    } catch (...)
{
+        return 0;
+    }
+}
+
 } // extern "C"
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index d1b11a527..1d07b5cf3 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -686,6 +686,9 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
     }
     const auto &state = it->second;
     const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second;
+    if (first_sighting) {
+        ++aicpu_dlopen_total_;
+    }
     runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size, first_sighting);
     runtime.set_active_callable_id(cid, first_sighting);
     runtime.pending_orch_so_data_ = nullptr;
@@ -917,6 +920,7 @@ int DeviceRunner::finalize() {
     orch_so_dedup_.clear();
     prepared_callables_.clear();
     aicpu_seen_callable_ids_.clear();
+    aicpu_dlopen_total_ = 0;
 #endif // RUNTIME_HAS_CALLABLE_ID
 
     // Close executor .so files (typically already closed by run(), this is a safety net)
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index 2981e3b89..b115485b8 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -220,6 +220,9 @@ class DeviceRunner {
     int unregister_prepared_callable(int32_t callable_id);
     bool has_prepared_callable(int32_t callable_id) const;
     int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id);
+    size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; }
+#else // RUNTIME_HAS_CALLABLE_ID
+    size_t aicpu_dlopen_count() const { return 0; }
 #endif // RUNTIME_HAS_CALLABLE_ID
 
 private:
@@ -262,6 +265,7 @@ class DeviceRunner {
     std::unordered_map prepared_callables_;
     std::unordered_map orch_so_dedup_;
     std::unordered_set<int32_t> aicpu_seen_callable_ids_;
+    size_t aicpu_dlopen_total_{0};
     // Sticky flag: prepare_callable was called at least once in this
     // DeviceRunner's lifetime. unregister_prepared_callable clears the maps
     // above, so we cannot use them at finalize() time to decide whether a
diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
index 06b3fda71..188edf5b5 100644
--- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
@@ -444,4 +444,13 @@ int run_prepared(
 int unregister_callable(DeviceContextHandle, int32_t) { return 0; }
 #endif // RUNTIME_HAS_CALLABLE_ID
 
+size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) {
+    if (ctx == NULL) return 0;
+    try {
+        return static_cast<DeviceRunner *>(ctx)->aicpu_dlopen_count();
+    } catch (...)
{
+        return 0;
+    }
+}
+
 } // extern "C"
diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index a89dab14a..12e768ce2 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -354,4 +354,6 @@ int run_prepared(
 }
 int unregister_callable(DeviceContextHandle, int32_t) { return 0; }
 
+size_t get_aicpu_dlopen_count(DeviceContextHandle) { return 0; }
+
 } // extern "C"
diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
index 1693783d8..bb3183a0d 100644
--- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
@@ -302,4 +302,6 @@ int run_prepared(DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, int, int, const char *) {
 }
 int unregister_callable(DeviceContextHandle, int32_t) { return 0; }
 
+size_t get_aicpu_dlopen_count(DeviceContextHandle) { return 0; }
+
 } // extern "C"
diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp
index 360b81d67..1ed20eb4c 100644
--- a/src/common/worker/chip_worker.cpp
+++ b/src/common/worker/chip_worker.cpp
@@ -153,6 +153,7 @@ void ChipWorker::init(
     prepare_callable_fn_ = load_symbol<PrepareCallableFn>(handle, "prepare_callable");
     run_prepared_fn_ = load_symbol<RunPreparedFn>(handle, "run_prepared");
     unregister_callable_fn_ = load_symbol<UnregisterCallableFn>(handle, "unregister_callable");
+    get_aicpu_dlopen_count_fn_ = load_symbol<GetAicpuDlopenCountFn>(handle, "get_aicpu_dlopen_count");
     finalize_device_fn_ = load_symbol<FinalizeDeviceFn>(handle, "finalize_device");
     // ACL lifecycle + comm_* are part of the uniform host_runtime.so ABI.
     // Every platform runtime exports all of them — runtimes that do not
@@ -249,6 +250,7 @@ void ChipWorker::finalize() {
     prepare_callable_fn_ = nullptr;
     run_prepared_fn_ = nullptr;
     unregister_callable_fn_ = nullptr;
+    get_aicpu_dlopen_count_fn_ = nullptr;
     finalize_device_fn_ = nullptr;
     ensure_acl_ready_fn_ = nullptr;
     create_comm_stream_fn_ = nullptr;
@@ -342,6 +344,13 @@ void ChipWorker::unregister_callable(int32_t callable_id) {
     }
 }
 
+size_t ChipWorker::aicpu_dlopen_count() const {
+    if (!device_set_) {
+        return 0;
+    }
+    return get_aicpu_dlopen_count_fn_(device_ctx_);
+}
+
 uint64_t ChipWorker::malloc(size_t size) {
     if (!device_set_) {
         throw std::runtime_error("ChipWorker device not set; call set_device() first");
diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h
index 72d8cd492..4e67c0a60 100644
--- a/src/common/worker/chip_worker.h
+++ b/src/common/worker/chip_worker.h
@@ -70,6 +70,13 @@ class ChipWorker : public IWorker {
     void run_prepared(int32_t callable_id, const void *args, const CallConfig &config);
     void unregister_callable(int32_t callable_id);
 
+    /// Number of distinct callable_ids the AICPU has been asked to dlopen for
+    /// on the bound device. Returns 0 when no device is set or the runtime
+    /// variant has no per-cid registration support. Used by tests to assert
+    /// that prepare_callable + repeated run_prepared do not trigger redundant
+    /// AICPU dlopens.
+    size_t aicpu_dlopen_count() const;
+
     uint64_t malloc(size_t size);
     void free(uint64_t ptr);
     void copy_to(uint64_t dst, uint64_t src, size_t size);
@@ -119,6 +126,7 @@ class ChipWorker : public IWorker {
     using RunPreparedFn =
         int (*)(void *, void *, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, int, int, const char *);
     using UnregisterCallableFn = int (*)(void *, int32_t);
+    using GetAicpuDlopenCountFn = size_t (*)(void *);
     using FinalizeDeviceFn = int (*)(void *);
     using EnsureAclReadyFn = int (*)(void *, int);
     using CreateCommStreamFn = void *(*)(void *);
@@ -144,6 +152,7 @@ class ChipWorker : public IWorker {
     PrepareCallableFn prepare_callable_fn_ = nullptr;
     RunPreparedFn run_prepared_fn_ = nullptr;
     UnregisterCallableFn unregister_callable_fn_ = nullptr;
+    GetAicpuDlopenCountFn get_aicpu_dlopen_count_fn_ = nullptr;
     FinalizeDeviceFn finalize_device_fn_ = nullptr;
     EnsureAclReadyFn ensure_acl_ready_fn_ = nullptr;
     CreateCommStreamFn create_comm_stream_fn_ = nullptr;
diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h
index 88c797ba1..8890dfeb1 100644
--- a/src/common/worker/pto_runtime_c_api.h
+++ b/src/common/worker/pto_runtime_c_api.h
@@ -188,6 +188,14 @@ int run_prepared(
  */
 int unregister_callable(DeviceContextHandle ctx, int32_t callable_id);
 
+/**
+ * Number of distinct callable_ids the AICPU has been asked to dlopen for on
+ * the device bound to `ctx`. Returns 0 on runtime variants without per-cid
+ * registration support. Used by tests to assert that `prepare_callable` +
+ * repeated `run_prepared` calls do not trigger redundant AICPU dlopens.
+ */
+size_t get_aicpu_dlopen_count(DeviceContextHandle ctx);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py
index 14bb8a792..27335a6c4 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py
@@ -15,8 +15,11 @@
    - Two distinct callable_ids sharing the same orch SO binary: verifies
      both produce correct output independently.
    - unregister_callable after runs complete: should not raise.
+   - aicpu_dlopen_count assertions covering: same-cid repeat, multi-cid
+     interleaving, double-prepare rejection, and unregister + re-prepare.
 """
 
+import pytest
 import torch
 
 from simpler.task_interface import ArgDirection as D
@@ -127,6 +130,111 @@ def _run_and_validate_l2(
         worker.unregister_callable(0)
         worker.unregister_callable(1)
 
+    # ------------------------------------------------------------------
+    # aicpu_dlopen_count assertions (callable.md §7 verification).
+    #
+    # The L2 worker fixture is shared across tests in this class, so the
+    # counter can be non-zero on entry from prior tests' leftover prepared
+    # callables (or from this test class's own test_run). Each test below
+    # snapshots the counter on entry, asserts the *delta* introduced by the
+    # scenario, then unregisters everything it staged so the next test can
+    # re-prepare the same cids (unregister_callable frees the cid for
+    # re-prepare; the dlopen counter itself is monotonic and never
+    # decrements, so each test measures only its own delta).
+ # ------------------------------------------------------------------ + + def _setup_dlopen_count_test(self, st_worker, st_platform): + """Common fixture: build callable + config, return (callable, config, case).""" + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + """Case A: prepare(0) + run(0) × 5 → dlopen_count delta == 1.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(0, callable_obj) + for _ in range(5): + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1, ( + f"expected exactly 1 new dlopen for 5 runs of cid=0, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(0) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + """Case B: prepare(0)+prepare(1) + (run(0),run(1)) × 5 → delta == 2.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(1, callable_obj) + for _ in range(5): + self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, 1, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"expected exactly 2 new dlopens for cids {{0,1}} interleaved, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(0) + st_worker.unregister_callable(1) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + """Case C: prepare(0) + prepare(0) → second call raises RuntimeError.""" + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(0, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(0, callable_obj) + finally: + st_worker.unregister_callable(0) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + """Case D: prepare(0)+run(0)+unregister(0)+prepare(0)+run(0) → delta == 2. + + unregister erases the cid from aicpu_seen_callable_ids_, so the second + prepare/run pair sets register_new_callable_id_ again and the AICPU + does a fresh dlopen. The counter is monotonic (does NOT decrement on + unregister), so the delta after the second cycle is 2. 
+ """ + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + registered = False + try: + st_worker.prepare_callable(0, callable_obj) + registered = True + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1 + st_worker.unregister_callable(0) + registered = False + after_unreg = st_worker.aicpu_dlopen_count + assert after_unreg - baseline == 1, ( + f"unregister must NOT decrement the dlopen counter; baseline={baseline}, after_unreg={after_unreg}" + ) + st_worker.prepare_callable(0, callable_obj) + registered = True + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"after re-prepare expected counter +2 (two distinct AICPU dlopens), " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + if registered: + st_worker.unregister_callable(0) + if __name__ == "__main__": SceneTestCase.run_module(__name__) From ee90d6afdcc7290749480c815a6cb1f65cde7515 Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 6 May 2026 16:09:36 +0800 Subject: [PATCH 08/28] fix(pr): address review and CI issues for #710 - Apply clang-format on src/a5/platform/sim/host/pto_runtime_c_api.cpp and src/common/worker/chip_worker.h (pre-commit fix). - register_prepared_callable: enforce callable_id in [0, 64) in both a2a3 onboard and sim DeviceRunner so an out-of-range id fails fast on host instead of OOB-indexing the AICPU orch_so_table_ later. - aicpu_executor: reject negative callable_id values other than the legacy -1 sentinel (mirrors the upper-bound guard). - tests/st/explicit_fatal: migrate to Stage 4 register + run(cid) API so the negative ST works under the unified run(cid) entry point. --- src/a2a3/platform/onboard/host/device_runner.cpp | 8 ++++++-- src/a2a3/platform/sim/host/device_runner.cpp | 8 ++++++-- .../aicpu/aicpu_executor.cpp | 8 +++++--- src/a5/platform/sim/host/pto_runtime_c_api.cpp | 5 ++++- src/common/worker/chip_worker.h | 12 ++++++++---- tests/st/explicit_fatal/test_explicit_fatal.py | 5 +++-- 6 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 662f8d2b0..e77be663a 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -840,8 +840,12 @@ int DeviceRunner::register_prepared_callable( int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, std::vector> kernel_addrs ) { - if (callable_id < 0) { - LOG_ERROR("register_prepared_callable: negative callable_id=%d", callable_id); + // Bounds match the AICPU executor's `orch_so_table_[64]` (see + // MAX_REGISTERED_CALLABLE_IDS in aicpu_executor.cpp). An out-of-range id + // would succeed here but blow up later as an OOB access on the AICPU. 
+ constexpr int32_t kMaxCallableId = 64; + if (callable_id < 0 || callable_id >= kMaxCallableId) { + LOG_ERROR("register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, kMaxCallableId); return -1; } if (orch_so_data == nullptr || orch_so_size == 0) { diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 1d07b5cf3..4e0fec515 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -753,8 +753,12 @@ int DeviceRunner::register_prepared_callable( int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, std::vector> kernel_addrs ) { - if (callable_id < 0) { - LOG_ERROR("register_prepared_callable: negative callable_id=%d", callable_id); + // Bounds match the AICPU executor's `orch_so_table_[64]` (see + // MAX_REGISTERED_CALLABLE_IDS in aicpu_executor.cpp). An out-of-range id + // would succeed here but blow up later as an OOB access on the AICPU. + constexpr int32_t kMaxCallableId = 64; + if (callable_id < 0 || callable_id >= kMaxCallableId) { + LOG_ERROR("register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, kMaxCallableId); return -1; } if (orch_so_data == nullptr || orch_so_size == 0) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index e7164b78e..21df90c30 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -235,10 +235,12 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // the rest of this branch ignore the choice. const int32_t callable_id = runtime->get_active_callable_id(); const bool use_table = (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS); - if (callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + // -1 is the legacy sentinel that intentionally falls back to the + // single-slot orch SO cache. Any other negative value is a + // protocol violation. 
+ if (callable_id >= MAX_REGISTERED_CALLABLE_IDS || (callable_id < 0 && callable_id != -1)) { DEV_ERROR( - "Thread %d: callable_id %d exceeds MAX_REGISTERED_CALLABLE_IDS=%d", thread_idx, callable_id, - MAX_REGISTERED_CALLABLE_IDS + "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS ); runtime_init_ready_.store(true, std::memory_order_release); return -1; diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index bb3183a0d..dee315ec8 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -296,7 +296,10 @@ int prepare_callable( LOG_ERROR("prepare_callable not supported by this runtime variant"); return -1; } -int run_prepared(DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, int, int, const char *) { +int run_prepared( + DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, + size_t, int, int, int, const char * +) { LOG_ERROR("run_prepared not supported by this runtime variant"); return -1; } diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index 4e67c0a60..9c2e43af8 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -118,13 +118,17 @@ class ChipWorker : public IWorker { using CopyToDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using CopyFromDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using GetRuntimeSizeFn = size_t (*)(); - using RunRuntimeFn = - int (*)(void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, int, int, const char *); + using RunRuntimeFn = int (*)( + void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, + int, int, int, const char * + ); using SimplerInitFn = void (*)(void *, int, int); using PrepareCallableFn = int (*)(void *, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t); - using RunPreparedFn = - int (*)(void *, void *, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, int, int, const char *); + using RunPreparedFn = int (*)( + void *, void *, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, + int, int, const char * + ); using UnregisterCallableFn = int (*)(void *, int32_t); using GetAicpuDlopenCountFn = size_t (*)(void *); using FinalizeDeviceFn = int (*)(void *); diff --git a/tests/st/explicit_fatal/test_explicit_fatal.py b/tests/st/explicit_fatal/test_explicit_fatal.py index 8a88f0f41..f6c8a34c5 100644 --- a/tests/st/explicit_fatal/test_explicit_fatal.py +++ b/tests/st/explicit_fatal/test_explicit_fatal.py @@ -42,12 +42,13 @@ def test_explicit_fatal_reports(st_platform, st_device_ids): chip_callable = _build_chip_callable(st_platform) worker = Worker(level=2, platform=st_platform, runtime=RUNTIME, device_id=int(st_device_ids[0])) + cid = worker.register(chip_callable) worker.init() try: config = CallConfig() config.block_dim = 24 config.aicpu_thread_num = 4 - with pytest.raises(RuntimeError, match=r"run_runtime failed with code -9"): - worker.run(chip_callable, ChipStorageTaskArgs(), config) + with pytest.raises(RuntimeError, match=r"(run_runtime|run_prepared) failed with code -9"): + worker.run(cid, ChipStorageTaskArgs(), config) finally: 
            worker.close()

From 9a80d7198b3c78fea78e62d35b67e17dd1c5f631 Mon Sep 17 00:00:00 2001
From: poursoul
Date: Wed, 6 May 2026 16:53:17 +0800
Subject: [PATCH 09/28] refactor(callable): unify MAX_REGISTERED_CALLABLE_IDS
 source of truth
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the upper bound was hard-coded as `64` in three independent
places (a2a3 onboard/sim DeviceRunner host bounds checks and the AICPU
executor's `orch_so_table_[]` declaration), with two different spellings
(`kMaxCallableId` vs `MAX_REGISTERED_CALLABLE_IDS`). They are the same
protocol constant — diverging would silently break the host↔AICPU
contract.

Move the constant into a new `src/common/task_interface/callable_protocol.h`
header (cstdint-only so the AICPU side can include it without dragging in
the heavier standard-library headers that `callable.h` pulls in) and have
all three call sites reference it.
---
 .../platform/onboard/host/device_runner.cpp   | 15 +++++----
 src/a2a3/platform/sim/host/device_runner.cpp  | 15 +++++----
 .../aicpu/aicpu_executor.cpp                  |  8 +++--
 src/common/task_interface/callable_protocol.h | 31 +++++++++++++++++++
 4 files changed, 54 insertions(+), 15 deletions(-)
 create mode 100644 src/common/task_interface/callable_protocol.h

diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index e77be663a..c3296962a 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -28,6 +28,7 @@
 // Include HAL constants from CANN (header only, library loaded dynamically)
 #include "ascend_hal.h"
 #include "callable.h"
+#include "callable_protocol.h"
 #include "utils/elf_build_id.h"
 #include "host/host_regs.h"  // Register address retrieval
 #include "host/raii_scope_guard.h"
@@ -840,12 +841,14 @@ int DeviceRunner::register_prepared_callable(
     int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name,
     std::vector<std::pair<int, uint64_t>> kernel_addrs
 ) {
-    // Bounds match the AICPU executor's `orch_so_table_[64]` (see
-    // MAX_REGISTERED_CALLABLE_IDS in aicpu_executor.cpp). An out-of-range id
-    // would succeed here but blow up later as an OOB access on the AICPU.
-    constexpr int32_t kMaxCallableId = 64;
-    if (callable_id < 0 || callable_id >= kMaxCallableId) {
-        LOG_ERROR("register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, kMaxCallableId);
+    // The AICPU executor reserves `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]`
+    // (declared in src/common/task_interface/callable_protocol.h) and indexes it by
+    // callable_id; rejecting an out-of-range id here keeps the host and
+    // AICPU sides in sync and avoids an OOB access at run time.
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS
+        );
         return -1;
     }
     if (orch_so_data == nullptr || orch_so_size == 0) {
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 4e0fec515..8245b9509 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -36,6 +36,7 @@
 #include "aicpu/platform_aicpu_affinity.h"
 #include "callable.h"
+#include "callable_protocol.h"
 #include "utils/elf_build_id.h"
 #include "cpu_sim_context.h"
 #include "host/raii_scope_guard.h"
@@ -753,12 +754,14 @@ int DeviceRunner::register_prepared_callable(
     int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name,
     std::vector<std::pair<int, uint64_t>> kernel_addrs
 ) {
-    // Bounds match the AICPU executor's `orch_so_table_[64]` (see
-    // MAX_REGISTERED_CALLABLE_IDS in aicpu_executor.cpp). An out-of-range id
-    // would succeed here but blow up later as an OOB access on the AICPU.
-    constexpr int32_t kMaxCallableId = 64;
-    if (callable_id < 0 || callable_id >= kMaxCallableId) {
-        LOG_ERROR("register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, kMaxCallableId);
+    // The AICPU executor reserves `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]`
+    // (declared in src/common/task_interface/callable_protocol.h) and indexes it by
+    // callable_id; rejecting an out-of-range id here keeps the host and
+    // AICPU sides in sync and avoids an OOB access at run time.
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS
+        );
         return -1;
     }
     if (orch_so_data == nullptr || orch_so_size == 0) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 21df90c30..40ebc92cf 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -24,6 +24,7 @@
 #include "aicpu/device_time.h"
 #include "aicpu/orch_so_file.h"
+#include "callable_protocol.h"
 #include "pto2_dispatch_payload.h"
 #include "runtime.h"
 #include "spin_hint.h"
@@ -91,12 +92,13 @@ static PTO2Runtime *rt{nullptr};

 // Per-callable_id orchestration SO table. AICPU side of the callable.md
 // design: when `runtime->active_callable_id_ >= 0` the executor dispatches
-// to `orch_so_table_[callable_id]` (created on first sighting of that
+// to `orch_so_table_[active_callable_id_]` (created on first sighting of that
 // callable_id, kept warm across runs); when `active_callable_id_ < 0` it
 // falls back to the legacy single slot governed by `has_new_orch_so_`.
 // MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values
-// (mailbox uint32 callable_id, register() returns small ints).
-static constexpr int32_t MAX_REGISTERED_CALLABLE_IDS = 64;
+// (mailbox uint32 callable_id, register() returns small ints) and is shared
+// with the host bounds check in DeviceRunner::register_prepared_callable —
+// see src/common/task_interface/callable_protocol.h.
 struct OrchSoEntry {
     bool in_use{false};
diff --git a/src/common/task_interface/callable_protocol.h b/src/common/task_interface/callable_protocol.h
new file mode 100644
index 000000000..108713910
--- /dev/null
+++ b/src/common/task_interface/callable_protocol.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Per-callable_id protocol constants (callable.md)
+ *
+ * Single source of truth for the host↔AICPU per-callable_id dispatch protocol.
+ * Kept separate from callable.h so the AICPU side can include it without
+ * pulling in callable.h's heavier standard-library includes.
+ *
+ * Both sides must agree on these bounds:
+ *  - Host: DeviceRunner::register_prepared_callable rejects out-of-range ids.
+ *  - AICPU: AicpuExecutor::run guards `orch_so_table_[callable_id]` access.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+// Hard cap on the number of distinct callable_ids that can be registered
+// via Worker.register / DeviceRunner::register_prepared_callable. The AICPU
+// executor reserves a fixed-size `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]`
+// keyed by callable_id, so this bound is part of the host↔AICPU protocol.
+constexpr int32_t MAX_REGISTERED_CALLABLE_IDS = 64;

From 8667074e1ccf35eb670c500464d611f8e81c2544 Mon Sep 17 00:00:00 2001
From: poursoul
Date: Wed, 6 May 2026 17:01:40 +0800
Subject: [PATCH 10/28] fix(callable): chip_process_loop falls back to legacy
 run when variant lacks prepare_callable

a5/onboard's pto_runtime_c_api stubs `prepare_callable`/`run_prepared` to
-1 (Stage 1 ABI port deferred the implementation), which hard-broke every
L3+ test on a5/onboard once Stage 3 made the chip_process_loop go through
`prepare_callable` + `run_prepared` unconditionally.

Detect the stub at the very first prepare attempt: if the call raises
RuntimeError, set `prepared_unsupported` and route every subsequent
TASK_READY through the legacy `cw.run(callable_obj, args, cfg)` path
(callable_obj resolved from the COW-inherited registry by cid). This keeps
the L3+ mailbox protocol cid-only as designed while letting variants that
have not yet picked up per-cid orch SO dispatch keep working in the
meantime. Once all variants implement the prepared path, the fallback shim
and the legacy ChipWorker.run binding can go.

Mirror the same fallback in `_chip_process_loop_with_bootstrap`
(distributed/HCCL chips).
---
 python/simpler/worker.py | 71 +++++++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 11 deletions(-)

diff --git a/python/simpler/worker.py b/python/simpler/worker.py
index 4e13bf83d..68e513cb5 100644
--- a/python/simpler/worker.py
+++ b/python/simpler/worker.py
@@ -321,15 +321,43 @@ def _chip_process_loop(
     # pre-warms via _CTRL_PREPARE, but TASK_READY also lazy-prepares as a
     # safety net (e.g. registrations that bypassed the prefetch path).
prepared: set[int] = set() - - def _ensure_prepared(cid: int) -> None: + # Some runtime variants (e.g. a5 onboard) ship `prepare_callable` / + # `run_prepared` as stubs that return -1. When the very first + # prepare_callable raises, flip this flag and fall back to the legacy + # `cw.run(callable_obj, args, cfg)` path for every subsequent task. + # The fallback is a transitional shim — once every variant implements + # the prepared path it can go away. + prepared_unsupported = False + + def _ensure_prepared(cid: int) -> bool: + """Return True iff `cid` is ready for `run_prepared`. + + Returns False (and sets `prepared_unsupported` in the enclosing + scope) when the runtime variant does not implement + `prepare_callable`, signalling the caller to take the legacy path. + """ + nonlocal prepared_unsupported + if prepared_unsupported: + return False if cid in prepared: - return + return True callable_obj = registry.get(cid) if callable_obj is None: raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry") - cw.prepare_callable(cid, callable_obj) + try: + cw.prepare_callable(cid, callable_obj) + except RuntimeError: + prepared_unsupported = True + return False prepared.add(cid) + return True + + def _run_legacy(cid: int, args, cfg) -> None: + """Legacy path: resolve cid back to its ChipCallable and run it.""" + callable_obj = registry.get(cid) + if callable_obj is None: + raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry") + cw.run(callable_obj, args, cfg) while True: state = _mailbox_load_i32(state_addr) @@ -340,9 +368,11 @@ def _ensure_prepared(cid: int) -> None: code = 0 msg = "" try: - _ensure_prepared(cid) args = _read_args_from_mailbox(buf) - cw.run_prepared(cid, args, cfg) + if _ensure_prepared(cid): + cw.run_prepared(cid, args, cfg) + else: + _run_legacy(cid, args, cfg) except Exception as e: # noqa: BLE001 code = 1 msg = _format_exc(f"chip_process dev={device_id}", e) @@ -445,15 +475,32 @@ def _chip_process_loop_with_bootstrap( # noqa: PLR0912 # Per-child set of cids already prepared on this device (Stage 3, # callable.md). Mirrors `_chip_process_loop`'s `prepared`. prepared: set[int] = set() + # Variants without per-cid prepare/run_prepared (e.g. a5 onboard) trip + # this on the very first prepare and the loop falls back to legacy run. 
+    prepared_unsupported = False

-    def _ensure_prepared(cid: int) -> None:
+    def _ensure_prepared(cid: int) -> bool:
+        nonlocal prepared_unsupported
+        if prepared_unsupported:
+            return False
         if cid in prepared:
-            return
+            return True
         callable_obj = registry.get(cid)
         if callable_obj is None:
             raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry")
-        cw._impl.prepare_callable(cid, callable_obj)
+        try:
+            cw._impl.prepare_callable(cid, callable_obj)
+        except RuntimeError:
+            prepared_unsupported = True
+            return False
         prepared.add(cid)
+        return True
+
+    def _run_legacy(cid: int, args, cfg) -> None:
+        callable_obj = registry.get(cid)
+        if callable_obj is None:
+            raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry")
+        cw._impl.run(callable_obj, args, cfg)

     try:
         while True:
@@ -465,9 +512,11 @@ def _ensure_prepared(cid: int) -> None:
                 code = 0
                 msg = ""
                 try:
-                    _ensure_prepared(cid)
                     args = _read_args_from_mailbox(buf)
-                    cw._impl.run_prepared(cid, args, cfg)
+                    if _ensure_prepared(cid):
+                        cw._impl.run_prepared(cid, args, cfg)
+                    else:
+                        _run_legacy(cid, args, cfg)
                 except Exception as e:  # noqa: BLE001
                     code = 1
                     msg = _format_exc(f"chip_process dev={device_id}", e)

From 642e95acdb9a02057b0b86c44a3b4695cb399d9a Mon Sep 17 00:00:00 2001
From: poursoul
Date: Wed, 6 May 2026 17:17:46 +0800
Subject: [PATCH 11/28] fix(callable): scope orch SO file name by callable_id
 (a2a3 507018)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The onboard `create_orch_so_file` named the staged SO
`libdevice_orch_<pid>.so` based on the assumption that "only one runtime
runs per device process, so pid uniqueness is sufficient" (in 7e071c18 /
before stage 4).

Stage 4 broke that assumption: per-callable_id dispatch keeps multiple orch
SO images resident in the same AICPU process at once, one per cid in
`orch_so_table_[]`. The reload branch first creates
`orch_so_table_[cid].handle` without unlinking any pre-existing on-disk
file (the unlink only fires when *that same slot's* handle is non-null), so
the second cid's `open(..., O_TRUNC)` silently truncated and rewrote cid=0's
file image. The kernel still mapped the old inode for cid=0's dlopen'd code;
the next launch on cid=0 jumped into bytes that now belonged to cid=1 and
SIGBUS'd inside AICPU. The host saw it as
`rtStreamSynchronize (AICPU) failed: 507018`.

Repro: examples/workers/l3/ffn_tp_parallel — two cids (ffn_local +
allreduce) on a2a3/onboard. multi_chip_dispatch passed because it only
register()'d a single ChipCallable.

Fix:
- create_orch_so_file gains a callable_id parameter. Onboard variants embed
  it in the file name (`libdevice_orch_<pid>_<cid>.so`) when cid >= 0; the
  legacy single-slot path (cid == -1) keeps pid-only naming so variants that
  never adopt per-cid dispatch see no change.
- Sim variants embed cid for log readability only — mkstemps already
  guarantees uniqueness — keeping the contract symmetrical across all four
  implementations.
- aicpu_executor.cpp at both a2a3 and a5 forwards the active cid (a5 passes
  -1 since it has no callable_id concept yet).

Regression test: tests/ut/cpp/common/test_orch_so_file.cpp asserts that
distinct cids produce distinct paths and the legacy sentinel preserves
pid-only naming. Compiles the a2a3 onboard implementation directly so the
ut catches the bug on no-hw runners too.
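For reviewers, a rough sketch of the naming contract this patch pins down
(illustrative only: the pid value and the write/dlopen steps are
placeholders, not shipped code; the real implementations are the four
orch_so_file.cpp variants below):

    // Onboard naming, assuming pid 4242:
    //   callable_id == -1  ->  /tmp/libdevice_orch_4242.so     (legacy, unchanged)
    //   callable_id ==  0  ->  /tmp/libdevice_orch_4242_0.so
    //   callable_id ==  1  ->  /tmp/libdevice_orch_4242_1.so   (no longer collides with cid=0)
    char so_path[256];
    int32_t fd = create_orch_so_file("/tmp", /*callable_id=*/1, so_path, sizeof(so_path));
    if (fd >= 0) {
        // write the orch SO bytes, close(fd), then dlopen(so_path) as before
    }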
--- .../platform/include/aicpu/orch_so_file.h | 7 +- .../platform/onboard/aicpu/orch_so_file.cpp | 18 +++- src/a2a3/platform/sim/aicpu/orch_so_file.cpp | 11 ++- .../aicpu/aicpu_executor.cpp | 2 +- src/a5/platform/include/aicpu/orch_so_file.h | 7 +- .../platform/onboard/aicpu/orch_so_file.cpp | 18 +++- src/a5/platform/sim/aicpu/orch_so_file.cpp | 11 ++- .../aicpu/aicpu_executor.cpp | 2 +- tests/ut/cpp/CMakeLists.txt | 21 +++++ tests/ut/cpp/common/test_orch_so_file.cpp | 93 +++++++++++++++++++ 10 files changed, 174 insertions(+), 16 deletions(-) create mode 100644 tests/ut/cpp/common/test_orch_so_file.cpp diff --git a/src/a2a3/platform/include/aicpu/orch_so_file.h b/src/a2a3/platform/include/aicpu/orch_so_file.h index a305ab8fa..29318f5ea 100644 --- a/src/a2a3/platform/include/aicpu/orch_so_file.h +++ b/src/a2a3/platform/include/aicpu/orch_so_file.h @@ -39,10 +39,15 @@ * Caller is expected to try the next candidate directory. * * @param dir Candidate directory (e.g. "/tmp") + * @param callable_id Per-callable_id table slot id (>= 0). Required for + * uniqueness on the onboard path so concurrently- + * resident orch SOs (one per cid) do not collide on + * the same on-disk file. Pass -1 for the legacy + * single-slot dispatch path. * @param out_path Buffer that receives the full file path on success * @param out_path_size Size of `out_path` in bytes * @return Open writable fd on success, -1 on failure */ -int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size); +int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size); #endif // PLATFORM_AICPU_ORCH_SO_FILE_H_ diff --git a/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp b/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp index 322cb7dcc..a1847adb9 100644 --- a/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp +++ b/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp @@ -15,10 +15,20 @@ #include -int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) { - // Pid-based naming: AICPU device libc may lack mkstemps, and only one - // runtime runs per device process, so pid uniqueness is sufficient. - int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid()); +int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) { + // Pid + callable_id naming: AICPU device libc may lack mkstemps. With + // Stage 4 per-callable_id dispatch, multiple orch SOs can be resident + // in the same device process at once (one per cid in `orch_so_table_`), + // so the on-disk file name must be unique per cid — otherwise the + // second cid's `O_TRUNC` would silently shred the first cid's already + // dlopen'd file image and the next launch on cid=0 would SIGBUS. + // callable_id < 0 is the legacy single-slot path: pid alone is fine. 
+    int32_t written;
+    if (callable_id >= 0) {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d_%d.so", dir, getpid(), callable_id);
+    } else {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+    }
     if (written < 0 || static_cast<size_t>(written) >= out_path_size) {
         return -1;
     }
diff --git a/src/a2a3/platform/sim/aicpu/orch_so_file.cpp b/src/a2a3/platform/sim/aicpu/orch_so_file.cpp
index 4da92d7de..114fe4826 100644
--- a/src/a2a3/platform/sim/aicpu/orch_so_file.cpp
+++ b/src/a2a3/platform/sim/aicpu/orch_so_file.cpp
@@ -24,10 +24,17 @@
 #include

-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) {
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) {
     // mkstemps: multiple sim workers can share a process, so names must be
     // unique per call. The "XXXXXX" template is replaced in-place.
-    int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir);
+    // callable_id is embedded purely for log readability (mkstemps already
+    // guarantees uniqueness regardless).
+    int32_t written;
+    if (callable_id >= 0) {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_cid%d_XXXXXX.so", dir, callable_id);
+    } else {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir);
+    }
     if (written < 0 || static_cast<size_t>(written) >= out_path_size) {
         return -1;
     }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 40ebc92cf..e4c08f2ac 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -291,7 +291,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);

     for (int32_t i = 0; i < num_candidates && !file_created; i++) {
-        int32_t fd = create_orch_so_file(candidate_dirs[i], so_path, sizeof(so_path));
+        int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path));
         if (fd < 0) {
             LOG_INFO_V0(
                 "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
diff --git a/src/a5/platform/include/aicpu/orch_so_file.h b/src/a5/platform/include/aicpu/orch_so_file.h
index 40bec7411..33862527e 100644
--- a/src/a5/platform/include/aicpu/orch_so_file.h
+++ b/src/a5/platform/include/aicpu/orch_so_file.h
@@ -39,10 +39,15 @@
 * Caller is expected to try the next candidate directory.
 *
 * @param dir            Candidate directory (e.g. "/tmp")
+ * @param callable_id    Per-callable_id table slot id (>= 0). Required for
+ *                       uniqueness on the onboard path so concurrently-
+ *                       resident orch SOs (one per cid) do not collide on
+ *                       the same on-disk file. Pass -1 for the legacy
+ *                       single-slot dispatch path.
 * @param out_path       Buffer that receives the full file path on success
 * @param out_path_size  Size of `out_path` in bytes
 * @return Open writable fd on success, -1 on failure
 */
-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size);
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size);

 #endif  // PLATFORM_AICPU_ORCH_SO_FILE_H_
diff --git a/src/a5/platform/onboard/aicpu/orch_so_file.cpp b/src/a5/platform/onboard/aicpu/orch_so_file.cpp
index 322cb7dcc..a1847adb9 100644
--- a/src/a5/platform/onboard/aicpu/orch_so_file.cpp
+++ b/src/a5/platform/onboard/aicpu/orch_so_file.cpp
@@ -15,10 +15,20 @@
 #include

-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) {
-    // Pid-based naming: AICPU device libc may lack mkstemps, and only one
-    // runtime runs per device process, so pid uniqueness is sufficient.
-    int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) {
+    // Pid + callable_id naming: AICPU device libc may lack mkstemps. With
+    // Stage 4 per-callable_id dispatch, multiple orch SOs can be resident
+    // in the same device process at once (one per cid in `orch_so_table_`),
+    // so the on-disk file name must be unique per cid — otherwise the
+    // second cid's `O_TRUNC` would silently shred the first cid's already
+    // dlopen'd file image and the next launch on cid=0 would SIGBUS.
+    // callable_id < 0 is the legacy single-slot path: pid alone is fine.
+    int32_t written;
+    if (callable_id >= 0) {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d_%d.so", dir, getpid(), callable_id);
+    } else {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+    }
     if (written < 0 || static_cast<size_t>(written) >= out_path_size) {
         return -1;
     }
diff --git a/src/a5/platform/sim/aicpu/orch_so_file.cpp b/src/a5/platform/sim/aicpu/orch_so_file.cpp
index 4da92d7de..114fe4826 100644
--- a/src/a5/platform/sim/aicpu/orch_so_file.cpp
+++ b/src/a5/platform/sim/aicpu/orch_so_file.cpp
@@ -24,10 +24,17 @@
 #include

-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) {
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) {
     // mkstemps: multiple sim workers can share a process, so names must be
     // unique per call. The "XXXXXX" template is replaced in-place.
-    int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir);
+    // callable_id is embedded purely for log readability (mkstemps already
+    // guarantees uniqueness regardless).
+    int32_t written;
+    if (callable_id >= 0) {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_cid%d_XXXXXX.so", dir, callable_id);
+    } else {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir);
+    }
     if (written < 0 || static_cast<size_t>(written) >= out_path_size) {
         return -1;
     }
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index f1936d467..ec7c6e4ee 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -242,7 +242,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);

     for (int32_t i = 0; i < num_candidates && !file_created; i++) {
-        int32_t fd = create_orch_so_file(candidate_dirs[i], so_path, sizeof(so_path));
+        int32_t fd = create_orch_so_file(candidate_dirs[i], /*callable_id=*/-1, so_path, sizeof(so_path));
         if (fd < 0) {
             LOG_INFO_V0(
                 "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index b3caacd97..6c525edd0 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -246,6 +246,27 @@ add_task_interface_test(test_child_memory types/test_child_memory.cpp)
 add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp)
 add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp)

+# Per-callable_id orch SO file naming regression (callable.md, see
+# rtStreamSynchronize 507018 root cause). Compiles the a2a3 onboard
+# `create_orch_so_file` against the test source so it runs on no-hw
+# runners too.
+add_executable(test_orch_so_file
+    common/test_orch_so_file.cpp
+    ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/onboard/aicpu/orch_so_file.cpp
+)
+target_include_directories(test_orch_so_file PRIVATE
+    ${GTEST_INCLUDE_DIRS}
+    ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include
+)
+target_compile_options(test_orch_so_file PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0)
+target_link_libraries(test_orch_so_file PRIVATE
+    ${GTEST_MAIN_LIB}
+    ${GTEST_LIB}
+    pthread
+)
+add_test(NAME test_orch_so_file COMMAND test_orch_so_file)
+set_tests_properties(test_orch_so_file PROPERTIES LABELS "no_hardware")
+
 # ---------------------------------------------------------------------------
 # A2A3 tests (src/a2a3/runtime/tensormap_and_ringbuffer/)
 # ---------------------------------------------------------------------------
diff --git a/tests/ut/cpp/common/test_orch_so_file.cpp b/tests/ut/cpp/common/test_orch_so_file.cpp
new file mode 100644
index 000000000..078e422b9
--- /dev/null
+++ b/tests/ut/cpp/common/test_orch_so_file.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Regression test for the per-callable_id orch SO file naming contract.
+//
+// The onboard variants of `create_orch_so_file` (src/{a2a3,a5}/platform/
+// onboard/aicpu/orch_so_file.cpp) historically used pid-only naming, which
+// silently broke once Stage 4 (callable.md) introduced multi-callable
+// dispatch on the same device process: the second cid's `O_TRUNC` open
+// shredded the first cid's already-dlopen'd SO image and the next launch
+// on cid=0 SIGBUS'd inside the AICPU executor (manifesting as
+// `rtStreamSynchronize (AICPU) failed: 507018` on the host).
+//
+// The fix is to embed `callable_id` in the file name when cid >= 0. This
+// test exercises the contract directly: distinct cids must produce distinct
+// paths, and the legacy cid=-1 path must remain pid-only (no behavioural
+// change for variants that never adopt per-cid dispatch).
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include <unistd.h>
+
+#include "aicpu/orch_so_file.h"
+
+namespace {
+
+std::string mkscratch_dir() {
+    char templ[] = "/tmp/orch_so_file_ut_XXXXXX";
+    const char *dir = mkdtemp(templ);
+    if (dir == nullptr) {
+        std::abort();
+    }
+    return std::string(dir);
+}
+
+void rmtree(const std::string &dir) {
+    std::string cmd = "rm -rf '" + dir + "'";
+    (void)std::system(cmd.c_str());
+}
+
+}  // namespace
+
+TEST(OrchSoFile, DistinctCallableIdsProduceDistinctPaths) {
+    // Repro for the 507018 SIGBUS bug: with pid-only naming, cid=0 and
+    // cid=1 collide on `libdevice_orch_<pid>.so` and the second
+    // O_TRUNC open silently shreds the first cid's already-dlopen'd
+    // image. Embedding the cid restores per-callable file isolation.
+    const std::string dir = mkscratch_dir();
+    char path0[256] = {};
+    char path1[256] = {};
+
+    int32_t fd0 = create_orch_so_file(dir.c_str(), /*callable_id=*/0, path0, sizeof(path0));
+    ASSERT_GE(fd0, 0) << "create_orch_so_file(cid=0) failed";
+    close(fd0);
+
+    int32_t fd1 = create_orch_so_file(dir.c_str(), /*callable_id=*/1, path1, sizeof(path1));
+    ASSERT_GE(fd1, 0) << "create_orch_so_file(cid=1) failed";
+    close(fd1);
+
+    EXPECT_STRNE(path0, path1) << "Distinct cids must yield distinct file paths "
+                                  "(otherwise O_TRUNC would corrupt the first SO).";
+
+    rmtree(dir);
+}
+
+TEST(OrchSoFile, LegacySentinelKeepsPidOnlyNaming) {
+    // Variants that never adopt per-cid dispatch pass cid=-1; the file
+    // name must remain pid-only so existing callers see no change.
+    const std::string dir = mkscratch_dir();
+    char path[256] = {};
+
+    int32_t fd = create_orch_so_file(dir.c_str(), /*callable_id=*/-1, path, sizeof(path));
+    ASSERT_GE(fd, 0);
+    close(fd);
+
+    char expected[256];
+    std::snprintf(expected, sizeof(expected), "%s/libdevice_orch_%d.so", dir.c_str(), getpid());
+    EXPECT_STREQ(path, expected) << "Legacy (cid=-1) path must remain pid-only";
+
+    rmtree(dir);
+}

From 5f264ef25582bcd0c0b80fee5d3b0c53df512189 Mon Sep 17 00:00:00 2001
From: poursoul
Date: Thu, 7 May 2026 09:30:59 +0800
Subject: [PATCH 12/28] fix(pr): resolve CI failures for #710

- python/bindings: add TaskArgs overload for ChipWorker.run() so chip child
  loops on variants without prepare_callable can dispatch via the legacy
  TaskArgs path (fixes a5 multi_chip_dispatch failures).
- a2a3 sim/onboard device_runner: in upload_kernel_binary, hash the incoming
  bytes and re-upload when a cached func_id entry holds a different binary.
  Stage 4 wires multiple ChipCallables onto the same ChipWorker (and
  DeviceRunner) via prepare_callable, so different callables register
  distinct kernels under overlapping func_ids; the prior unconditional cache
  hit handed the AICore the previous callable's kernel and segfaulted (sim)
  or hung the AICPU dispatch spin-wait (onboard) on the next run.
- a2a3 sim device_runner: initialize Worker.l2_perf_records_addr in the
  per-core init loop (matches onboard); uninitialized garbage was being
  treated as a valid pointer when the L2 swimlane bit happened to be set in
  enable_profiling_flag, causing AICore segfaults.
- a2a3 onboard host_regs: restore placeholder-address fallback for
  AicoreRegKind::Ctrl on halMemCtl failure (the dispatch path does not
  dereference these); Pmu kind continues to propagate failure so the caller
  can disable PMU collection cleanly.
- a2a3 runtime aicpu_executor: replace stray DEV_ERROR (undefined in this
  branch's logging surface) with LOG_ERROR, and drop the spurious leading 0
  argument on a LOG_INFO_V0 call (V0 is the verbosity-0 form, not
  LOG_INFO_V).
- a2a3 l2_perf_collector.h: drop unused #include "runtime.h" so clang-tidy
  can lint the header without per-runtime include paths.
---
 python/bindings/task_interface.cpp            | 10 +++++++
 .../platform/onboard/host/device_runner.cpp   | 22 ++++++++++++++---
 .../platform/onboard/host/device_runner.h     |  4 ++++
 src/a2a3/platform/onboard/host/host_regs.cpp  | 22 +++++++++++++----
 src/a2a3/platform/sim/host/device_runner.cpp  | 21 +++++++++++++---
 .../aicpu/aicpu_executor.cpp                  |  5 ++---
 6 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index 3c01dabcf..55f0b7dfc 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -629,6 +629,16 @@ NB_MODULE(_task_interface, m) {
             },
             nb::arg("callable"), nb::arg("args"), nb::arg("config")
         )
+        .def(
+            "run",
+            [](ChipWorker &self, const PyChipCallable &callable, TaskArgs &args, const CallConfig &config) {
+                TaskArgsView view = make_view(args);
+                self.run(reinterpret_cast<const ChipCallable *>(callable.buffer_.data()), view, config);
+            },
+            nb::arg("callable"), nb::arg("args"), nb::arg("config"),
+            "Launch a callable from a TaskArgs (used by chip child loops on "
+            "variants without prepare_callable support)."
+        )
         .def(
             "run_raw",
             [](ChipWorker &self, uint64_t callable, uint64_t args, const CallConfig &config) {
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index c3296962a..c8a7cb8d1 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -1000,6 +1000,7 @@ int DeviceRunner::finalize() {
         }
     }
     func_id_to_addr_.clear();
+    func_id_to_hash_.clear();
     binaries_loaded_ = false;

     // Release the cached orchestration SO buffer.
@@ -1191,11 +1192,24 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data
         return 0;
     }

-    // Return cached callable address if already uploaded
+    // Return cached callable address if already uploaded *and* the new bytes
+    // match. With the callable.md prepared-callable path, multiple ChipCallables
+    // share a single ChipWorker (and DeviceRunner) and can pick distinct kernel
+    // binaries for the same func_id. Naively reusing the cached entry hands the
+    // AICore the previous callable's kernel: dispatch never completes the new
+    // task and the AICPU spins forever.
+    const uint64_t new_hash = simpler::common::utils::elf_build_id_64(bin_data, bin_size);
     auto it = func_id_to_addr_.find(func_id);
     if (it != func_id_to_addr_.end()) {
-        LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id);
-        return it->second;
+        auto hash_it = func_id_to_hash_.find(func_id);
+        if (hash_it != func_id_to_hash_.end() && hash_it->second == new_hash) {
+            LOG_INFO_V0("Kernel func_id=%d already uploaded (matching hash), returning cached address", func_id);
+            return it->second;
+        }
+        LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id);
+        mem_alloc_.free(reinterpret_cast<void *>(it->second));
+        func_id_to_addr_.erase(it);
+        func_id_to_hash_.erase(func_id);
     }

     LOG_DEBUG("Uploading kernel binary: func_id=%d, size=%zu bytes", func_id, bin_size);
@@ -1225,6 +1239,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data
     }

     func_id_to_addr_[func_id] = callable_addr;
+    func_id_to_hash_[func_id] = new_hash;

     LOG_DEBUG("  func_id=%d -> callable_addr=0x%lx, binary_code_addr=0x%lx", func_id, callable_addr, binary_code_addr);

@@ -1242,6 +1257,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) {

     mem_alloc_.free(gm_addr);
     func_id_to_addr_.erase(it);
+    func_id_to_hash_.erase(func_id);

     LOG_DEBUG("Removed kernel binary: func_id=%d, addr=0x%lx", func_id, function_bin_addr);
 }
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index 79f95d404..a9efab30e 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -514,6 +514,10 @@ class DeviceRunner {
     // Kernel binary management
     bool binaries_loaded_{false};  // true after AICPU SO loaded
     std::map<int, uint64_t> func_id_to_addr_;  // func_id -> function_bin_addr (device GM)
+    // Parallel hash map for upload_kernel_binary() to detect when the same
+    // func_id is re-uploaded with different binary bytes (different ChipCallable
+    // sharing the same func_id under callable.md / Stage 4).
+    std::map<int, uint64_t> func_id_to_hash_;

     // Orchestration SO cache. `cached_orch_so_hash_ == 0` means "no cache".
     // The device buffer grows monotonically — cache miss with a larger SO
diff --git a/src/a2a3/platform/onboard/host/host_regs.cpp b/src/a2a3/platform/onboard/host/host_regs.cpp
index 0a90e4b07..f519392e1 100644
--- a/src/a2a3/platform/onboard/host/host_regs.cpp
+++ b/src/a2a3/platform/onboard/host/host_regs.cpp
@@ -135,8 +135,11 @@ get_aicore_reg_info(std::vector<uint64_t> &aic, std::vector<uint64_t> &aiv, const
 /**
  * Get one flat AIC-then-AIV address array for the requested register kind.
- * Returns a negative code on HAL failure; does NOT generate placeholder
- * addresses (callers must treat failure as fatal for that kind).
+ * For Ctrl kind, falls back to placeholder addresses on HAL failure to
+ * preserve historical behavior on hardware where halMemCtl rejects
+ * ADDR_MAP_TYPE_REG_AIC_CTRL queries (the dispatch path does not actually
+ * dereference these addresses). For Pmu kind, propagates the HAL error so
+ * the caller can disable PMU collection cleanly.
 */
static int get_aicore_regs(std::vector<uint64_t> &regs, uint64_t device_id, AicoreRegKind kind) {
    std::vector<uint64_t> aic;
    std::vector<uint64_t> aiv;
@@ -144,8 +147,19 @@
     int rc = get_aicore_reg_info(aic, aiv, kind_to_addr_type(kind), device_id);
     if (rc != 0) {
-        LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d", kind_to_name(kind), rc);
-        return rc;
+        if (kind == AicoreRegKind::Ctrl) {
+            LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d, using placeholder addresses", kind_to_name(kind), rc);
+            aic.clear();
+            aiv.clear();
+            for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) {
+                aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000));
+                aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000);
+                aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000);
+            }
+        } else {
+            LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d", kind_to_name(kind), rc);
+            return rc;
+        }
     }

     // AIC cores first, then AIV cores
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 8245b9509..f07d32358 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -359,6 +359,8 @@ int DeviceRunner::run(
         runtime.workers[i].task = 0;
         // First 1/3 are AIC, remaining 2/3 are AIV
         runtime.workers[i].core_type = (i < num_aic) ? CoreType::AIC : CoreType::AIV;
+        runtime.workers[i].enable_profiling_flag = enable_profiling_flag;
+        runtime.workers[i].l2_perf_records_addr = static_cast<uint64_t>(0);
     }

     // Set function_bin_addr for each task: func_id_to_addr_[] stores CoreCallable
@@ -955,11 +957,24 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data
         return 0;
     }

-    // Return cached callable address if already uploaded
+    // Return cached callable address if already uploaded *and* the new bytes
+    // match. With the callable.md prepared-callable path, multiple ChipCallables
+    // share a single ChipWorker (and hence DeviceRunner) and can pick distinct
+    // kernel binaries for the same func_id. Naively reusing the cached entry
+    // hands the AICore the previous callable's kernel and segfaults at dispatch.
     auto it = func_id_to_addr_.find(func_id);
     if (it != func_id_to_addr_.end()) {
-        LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id);
-        return reinterpret_cast<uint64_t>(it->second.callable_buf);
+        const auto &cached_callable = *reinterpret_cast<const CoreCallable *>(it->second.callable_buf);
+        const auto *new_callable = reinterpret_cast<const CoreCallable *>(bin_data);
+        if (cached_callable.binary_size() == new_callable->binary_size() &&
+            std::memcmp(cached_callable.binary_data(), new_callable->binary_data(), new_callable->binary_size()) == 0) {
+            LOG_INFO_V0("Kernel func_id=%d already uploaded (matching bytes), returning cached address", func_id);
+            return reinterpret_cast<uint64_t>(it->second.callable_buf);
+        }
+        LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id);
+        if (it->second.dl_handle != nullptr) dlclose(it->second.dl_handle);
+        delete[] it->second.callable_buf;
+        func_id_to_addr_.erase(it);
     }

     // Extract binary from CoreCallable envelope
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index e4c08f2ac..a93f4884e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -241,7 +241,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     // single-slot orch SO cache. Any other negative value is a
     // protocol violation.
     if (callable_id >= MAX_REGISTERED_CALLABLE_IDS || (callable_id < 0 && callable_id != -1)) {
-        DEV_ERROR(
+        LOG_ERROR(
             "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS
         );
         runtime_init_ready_.store(true, std::memory_order_release);
@@ -390,8 +390,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             if (use_table) orch_so_table_[callable_id].in_use = true;
         } else {
             LOG_INFO_V0(
-                0, "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle,
-                callable_id
+                "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id
             );
             if (*p_handle == nullptr || *p_func == nullptr) {
                 LOG_ERROR(

From 80f2be5bb5057a7886aa6ccf8dd70b843d570a75 Mon Sep 17 00:00:00 2001
From: poursoul
Date: Thu, 7 May 2026 16:15:45 +0800
Subject: [PATCH 13/28] =?UTF-8?q?feat(callable):=20Phase=200=20=E2=80=94?=
 =?UTF-8?q?=20add=20active=5Fcallable=5Fid=5F=20to=20all=20runtimes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `active_callable_id_` and `register_new_callable_id_` fields plus their
setter/getter to the three runtime variants that lack them
(a2a3/host_build_graph, a5/tensormap_and_ringbuffer, a5/host_build_graph).
After this commit every runtime variant exposes the same per-callable_id
state shape that a2a3/tensormap_and_ringbuffer already has — Phase 1+ wire
AICPU and platform layers to read it.

Also gate a5/tensormap_and_ringbuffer with `#define RUNTIME_HAS_CALLABLE_ID 1`
so the shared a5 platform layer recognises the protocol when compiled against
this runtime; the macro is removed once every variant implements the
prepare/run_prepared path.

Behaviour is unchanged: the new fields are written but no caller reads them
yet. All four sim variants (a2a3sim/{trb,hbg}, a5sim/{trb,hbg}) compile
cleanly.
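Roughly, the stamping sequence the later phases build on (a sketch against
the setters this commit adds; the cid value and call sites are illustrative,
not code from this patch):

    Runtime runtime;                                       // active_callable_id_ == -1: legacy single-slot path
    runtime.set_active_callable_id(3, /*is_new=*/true);    // first run of cid=3: AICPU loads its orch SO slot
    // runtime.get_active_callable_id() == 3
    // runtime.register_new_callable_id() == true
    runtime.set_active_callable_id(3, /*is_new=*/false);   // warm rerun: reuse the already-loaded slot
    runtime.set_active_callable_id(-1, /*is_new=*/false);  // opt back out to the has_new_orch_so_ path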
--- .../host_build_graph/runtime/runtime.h | 12 ++++++++++ .../host_build_graph/runtime/runtime.h | 11 ++++++++++ .../runtime/runtime.h | 22 +++++++++++++++++++ .../runtime/shared/runtime.cpp | 11 ++++++++++ 4 files changed, 56 insertions(+) diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 46b673878..ce9d1f5fd 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -460,6 +460,12 @@ class Runtime { uint64_t dev_orch_so_size_{0}; bool has_new_orch_so_{false}; + // Per-callable_id dispatch (callable.md). hbg orch runs on host, so AICPU + // never reads `active_callable_id_`; the field exists for parity with the + // shared platform layer (DeviceRunner stamps it on every run). + int32_t active_callable_id_{-1}; + bool register_new_callable_id_{false}; + // Host-only staging fields (mirror tensormap_and_ringbuffer variant). const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; @@ -469,6 +475,12 @@ class Runtime { dev_orch_so_size_ = size; has_new_orch_so_ = is_new; } + void set_active_callable_id(int32_t callable_id, bool is_new) { + active_callable_id_ = callable_id; + register_new_callable_id_ = is_new; + } + int32_t get_active_callable_id() const { return active_callable_id_; } + bool register_new_callable_id() const { return register_new_callable_id_; } }; #endif // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index 607783733..704cf6477 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -469,6 +469,11 @@ class Runtime { uint64_t dev_orch_so_addr_{0}; uint64_t dev_orch_so_size_{0}; bool has_new_orch_so_{false}; + // Per-callable_id dispatch (callable.md). hbg orch runs on host, so AICPU + // never reads `active_callable_id_`; the field exists for parity with the + // shared platform layer (DeviceRunner stamps it on every run). + int32_t active_callable_id_{-1}; + bool register_new_callable_id_{false}; const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; @@ -477,6 +482,12 @@ class Runtime { dev_orch_so_size_ = size; has_new_orch_so_ = is_new; } + void set_active_callable_id(int32_t callable_id, bool is_new) { + active_callable_id_ = callable_id; + register_new_callable_id_ = is_new; + } + int32_t get_active_callable_id() const { return active_callable_id_; } + bool register_new_callable_id() const { return register_new_callable_id_; } }; #endif // SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index e8bd2ff85..7279ee5e9 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -29,6 +29,13 @@ #ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ #define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ +// This variant supports the per-callable_id dispatch protocol (callable.md). +// DeviceRunner and pto_runtime_c_api.cpp check this at compile time to guard +// callable_id-specific code paths so the same sources compile cleanly against +// variants that lack the protocol (host_build_graph). This guard is removed +// in a later phase once every variant implements the protocol. 
+#define RUNTIME_HAS_CALLABLE_ID 1 + #include #include #include // for fprintf, printf @@ -209,6 +216,15 @@ class Runtime { uint64_t dev_orch_so_addr_; uint64_t dev_orch_so_size_; bool has_new_orch_so_; + // Per-callable_id dispatch (callable.md). When `active_callable_id_ >= 0`, + // AICPU dispatches via `orch_so_table_[active_callable_id_]` instead of + // the legacy single-slot cache; `register_new_callable_id_` then signals + // whether the host is delivering a freshly-registered callable_id + // (write+dlopen) or reusing an already-loaded one. `active_callable_id_ + // == -1` keeps the legacy fast path (run_runtime() compatibility shim) — + // has_new_orch_so_ governs reload. + int32_t active_callable_id_; + bool register_new_callable_id_; char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; @@ -265,6 +281,12 @@ class Runtime { uint64_t get_dev_orch_so_addr() const; uint64_t get_dev_orch_so_size() const; bool has_new_orch_so() const; + // Per-callable_id dispatch (callable.md). callable_id < 0 disables and + // falls back to the legacy single-slot orch SO cache governed by + // has_new_orch_so_. + void set_active_callable_id(int32_t callable_id, bool is_new); + int32_t get_active_callable_id() const; + bool register_new_callable_id() const; void set_device_orch_func_name(const char *name); const char *get_device_orch_func_name() const; void set_device_orch_config_name(const char *name); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 8f595e1a3..80ae1b8b2 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -55,6 +55,8 @@ Runtime::Runtime() { dev_orch_so_addr_ = 0; dev_orch_so_size_ = 0; has_new_orch_so_ = false; + active_callable_id_ = -1; + register_new_callable_id_ = false; device_orch_func_name_[0] = '\0'; device_orch_config_name_[0] = '\0'; @@ -117,6 +119,15 @@ uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; } bool Runtime::has_new_orch_so() const { return has_new_orch_so_; } +void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) { + active_callable_id_ = callable_id; + register_new_callable_id_ = is_new; +} + +int32_t Runtime::get_active_callable_id() const { return active_callable_id_; } + +bool Runtime::register_new_callable_id() const { return register_new_callable_id_; } + void Runtime::set_device_orch_func_name(const char *name) { if (name == nullptr) { device_orch_func_name_[0] = '\0'; From 22ea75ddacc6254979304174d05e3c583a650ea8 Mon Sep 17 00:00:00 2001 From: poursoul Date: Thu, 7 May 2026 17:28:46 +0800 Subject: [PATCH 14/28] =?UTF-8?q?feat(callable):=20Phase=201=20=E2=80=94?= =?UTF-8?q?=20port=20a5/trb=20to=20per-cid=20orch=20SO=20table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the a2a3/tensormap_and_ringbuffer prepared_callable implementation onto a5: AICPU executor gains a per-cid orch_so_table_, host device runner gains register/unregister/has/bind methods + a hash-keyed orch SO buffer dedup, and runtime_maker.cpp is split into prepare_callable_impl + bind_prepared_to_runtime_impl with init_runtime_impl as a shim. 
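In effect (sketch only; error handling abbreviated, the real signatures live
in runtime_maker.cpp and the c_api extern declarations below):

    // Legacy entry point, kept for run_runtime() callers: one-shot
    // prepare + bind against the same Runtime.
    int init_runtime_impl(Runtime *rt, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) {
        int rc = prepare_callable_impl(rt, callable);         // upload kernels, stage orch SO state
        if (rc != 0) {
            return rc;
        }
        return bind_prepared_to_runtime_impl(rt, orch_args);  // replay staged state onto the Runtime
    }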
The a5 platform layer (onboard + sim) is shared between trb and hbg, so
callable-specific implementations are guarded by RUNTIME_HAS_CALLABLE_ID to
keep hbg compiling until Phase 2 lands its prepare/bind impls.
---
 .../platform/onboard/host/device_runner.cpp  | 192 +++++++++++++++++-
 src/a5/platform/onboard/host/device_runner.h |  73 +++++++
 .../onboard/host/pto_runtime_c_api.cpp       | 175 +++++++++++++++-
 src/a5/platform/sim/host/device_runner.cpp   | 165 ++++++++++++++-
 src/a5/platform/sim/host/device_runner.h     |  43 ++++
 .../platform/sim/host/pto_runtime_c_api.cpp  | 159 ++++++++++++++-
 .../aicpu/aicpu_executor.cpp                 | 146 +++++++++----
 .../host/runtime_maker.cpp                   |  77 ++++---
 8 files changed, 940 insertions(+), 90 deletions(-)

diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index 149feb7da..4765c5624 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -24,6 +24,7 @@
 #include

 #include "callable.h"
+#include "callable_protocol.h"
 #include "utils/elf_build_id.h"
 #include "host/host_regs.h"  // Register address retrieval
 #include "host/raii_scope_guard.h"
@@ -599,6 +600,39 @@ void DeviceRunner::print_handshake_results() {
 }

 int DeviceRunner::prepare_orch_so(Runtime &runtime) {
+#ifdef RUNTIME_HAS_CALLABLE_ID
+    // Per-callable_id path (callable.md): when run_prepared bound a known
+    // callable_id, the SO bytes were already H2D'd at prepare_callable time.
+    // We just stamp dev_orch_so on the runtime, plus mark `is_new` based on
+    // whether the AICPU has seen this id since registration.
+    const int32_t cid = runtime.get_active_callable_id();
+    if (cid >= 0) {
+        auto it = prepared_callables_.find(cid);
+        if (it == prepared_callables_.end()) {
+            LOG_ERROR("prepare_orch_so: callable_id=%d not registered", cid);
+            return -1;
+        }
+        const auto &state = it->second;
+        const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second;
+        if (first_sighting) {
+            ++aicpu_dlopen_total_;
+        }
+        runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size, first_sighting);
+        // The c_api caller passed is_new=false; refresh with the authoritative
+        // first_sighting flag before AICPU consumes register_new_callable_id_.
+        runtime.set_active_callable_id(cid, first_sighting);
+        // Pending fields must be empty in the prepared path — runtime_maker's
+        // bind_prepared_to_runtime_impl never stages them. Defensive clear:
+        runtime.pending_orch_so_data_ = nullptr;
+        runtime.pending_orch_so_size_ = 0;
+        LOG_INFO_V0(
+            "Orch SO prepared cid=%d hash=0x%lx %zu bytes (is_new=%d)", cid, state.hash, state.dev_orch_so_size,
+            first_sighting ? 1 : 0
+        );
+        return 0;
+    }
+#endif  // RUNTIME_HAS_CALLABLE_ID
+
     const void *host_so_data = runtime.pending_orch_so_data_;
     const size_t host_so_size = runtime.pending_orch_so_size_;
     runtime.pending_orch_so_data_ = nullptr;
@@ -650,6 +684,128 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
     return 0;
 }

+#ifdef RUNTIME_HAS_CALLABLE_ID
+int DeviceRunner::register_prepared_callable(
+    int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name,
+    std::vector<std::pair<int, uint64_t>> kernel_addrs
+) {
+    // The AICPU executor reserves `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]`
+    // (declared in src/common/task_interface/callable_protocol.h) and indexes
+    // it by callable_id; rejecting an out-of-range id here keeps host and AICPU
+    // in sync and avoids an OOB access at run time.
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS
+        );
+        return -1;
+    }
+    if (orch_so_data == nullptr || orch_so_size == 0) {
+        LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id);
+        return -1;
+    }
+    if (prepared_callables_.count(callable_id) != 0) {
+        LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id);
+        return -1;
+    }
+
+    const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size);
+
+    // Hash dedup: share device buffer across callable_ids that carry the same
+    // SO bytes. Refcount drops in unregister_prepared_callable; we only free
+    // when the count hits zero.
+    auto buf_it = orch_so_dedup_.find(hash);
+    uint64_t dev_addr = 0;
+    if (buf_it == orch_so_dedup_.end()) {
+        void *buf = mem_alloc_.alloc(orch_so_size);
+        if (buf == nullptr) {
+            LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size);
+            return -1;
+        }
+        int rc = rtMemcpy(buf, orch_so_size, orch_so_data, orch_so_size, RT_MEMCPY_HOST_TO_DEVICE);
+        if (rc != 0) {
+            LOG_ERROR("register_prepared_callable: rtMemcpy failed: %d", rc);
+            mem_alloc_.free(buf);
+            return rc;
+        }
+        OrchSoBuffer entry;
+        entry.dev_addr = buf;
+        entry.capacity = orch_so_size;
+        entry.refcount = 1;
+        orch_so_dedup_.emplace(hash, entry);
+        dev_addr = reinterpret_cast<uint64_t>(buf);
+        LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size);
+    } else {
+        buf_it->second.refcount++;
+        dev_addr = reinterpret_cast<uint64_t>(buf_it->second.dev_addr);
+        LOG_INFO_V0(
+            "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount
+        );
+    }
+
+    PreparedCallableState state;
+    state.hash = hash;
+    state.dev_orch_so_addr = dev_addr;
+    state.dev_orch_so_size = orch_so_size;
+    state.func_name = (func_name != nullptr) ? func_name : "";
+    state.config_name = (config_name != nullptr) ? config_name : "";
+    state.kernel_addrs = std::move(kernel_addrs);
+    prepared_callables_.emplace(callable_id, std::move(state));
+    prepared_callable_path_used_ = true;
+    return 0;
+}
+
+int DeviceRunner::unregister_prepared_callable(int32_t callable_id) {
+    auto it = prepared_callables_.find(callable_id);
+    if (it == prepared_callables_.end()) {
+        return 0;
+    }
+    const uint64_t hash = it->second.hash;
+    prepared_callables_.erase(it);
+    aicpu_seen_callable_ids_.erase(callable_id);
+
+    auto buf_it = orch_so_dedup_.find(hash);
+    if (buf_it != orch_so_dedup_.end()) {
+        if (--buf_it->second.refcount <= 0) {
+            mem_alloc_.free(buf_it->second.dev_addr);
+            orch_so_dedup_.erase(buf_it);
+        }
+    }
+    return 0;
+}
+
+bool DeviceRunner::has_prepared_callable(int32_t callable_id) const {
+    return prepared_callables_.count(callable_id) != 0;
+}
+
+int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) {
+    auto it = prepared_callables_.find(callable_id);
+    if (it == prepared_callables_.end()) {
+        LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id);
+        return -1;
+    }
+    const auto &state = it->second;
+
+    // Replay kernel addresses directly into runtime.func_id_to_addr_ without
+    // going through set_function_bin_addr — the latter would record func_ids
+    // in registered_kernel_func_ids_, which validate_runtime_impl iterates to
+    // free kernel binaries. Prepared kernels must survive across runs and only
+    // be freed by finalize().
+    for (const auto &kv : state.kernel_addrs) {
+        if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) {
+            LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first);
+            return -1;
+        }
+        runtime.func_id_to_addr_[kv.first] = kv.second;
+    }
+    runtime.set_device_orch_func_name(state.func_name.c_str());
+    runtime.set_device_orch_config_name(state.config_name.c_str());
+    // Stamp callable_id with is_new=false; prepare_orch_so refreshes the flag
+    // with the authoritative first_sighting answer right before launch.
+    runtime.set_active_callable_id(callable_id, /*is_new=*/false);
+    return 0;
+}
+#endif  // RUNTIME_HAS_CALLABLE_ID
+
 int DeviceRunner::finalize() {
     if (device_id_ == -1) {
         return 0;
@@ -669,14 +825,27 @@ int DeviceRunner::finalize() {
     // Cleanup AICPU SO
     so_info_.finalize();

-    // Kernel binaries should have been removed by validate_runtime_impl()
+    // Kernel binaries are normally released by validate_runtime_impl on the
+    // legacy run() path. The callable.md prepared-callable path intentionally
+    // leaves them resident across runs (shared by func_id) and relies on
+    // finalize() to reclaim them; that is not a leak. Emit at DEBUG so the
+    // legacy regression signal is preserved for callers that never went
+    // through prepare_callable.
     if (!func_id_to_addr_.empty()) {
-        LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size());
-        // Cleanup leaked binaries to prevent memory leaks
+#ifdef RUNTIME_HAS_CALLABLE_ID
+        const bool prepared_path_used = prepared_callable_path_used_;
+#else
+        const bool prepared_path_used = false;
+#endif
+        if (prepared_path_used) {
+            LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size());
+        } else {
+            LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size());
+        }
         for (const auto &pair : func_id_to_addr_) {
             void *gm_addr = reinterpret_cast<void *>(pair.second);
             mem_alloc_.free(gm_addr);
-            LOG_DEBUG("Freed leaked kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second);
+            LOG_DEBUG("Freed kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second);
         }
     }
     func_id_to_addr_.clear();
@@ -691,6 +860,21 @@ int DeviceRunner::finalize() {
     host_orch_so_copy_.clear();
     host_orch_so_copy_.shrink_to_fit();

+    // Release any prepared-callable orch SO buffers that callers forgot to
+    // unregister. Refcounts no longer matter at this point — the device is
+    // about to be reset.
+#ifdef RUNTIME_HAS_CALLABLE_ID + for (auto &kv : orch_so_dedup_) { + if (kv.second.dev_addr != nullptr) { + mem_alloc_.free(kv.second.dev_addr); + } + } + orch_so_dedup_.clear(); + prepared_callables_.clear(); + aicpu_seen_callable_ids_.clear(); + aicpu_dlopen_total_ = 0; +#endif // RUNTIME_HAS_CALLABLE_ID + // Cleanup performance profiling (frees L2PerfSetupHeader + all per-core/per-thread buffers) if (l2_perf_collector_.is_initialized()) { auto free_cb = [](void *dev_ptr) -> int { diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 4c5fab748..55f76e52e 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include "common/kernel_args.h" @@ -370,6 +372,51 @@ class DeviceRunner { */ void release_run_context(); + /** + * Stage a per-callable_id orchestration SO into device memory and remember + * the supporting metadata (entry/config symbol names, kernel func_id ↔ + * dev_addr table). Identical SO bytes across two callable_ids share one + * device buffer (refcounted by hash) so the worst case for an N-cid pool + * is N distinct device buffers, not N copies of the same SO. + * + * @param callable_id Caller-stable id, must be in [0, MAX_REGISTERED_CALLABLE_IDS). + * @param orch_so_data Host pointer to orchestration SO bytes (owned by caller). + * @param orch_so_size Size of orchestration SO in bytes. + * @param func_name Entry symbol name (copied). + * @param config_name Config symbol name (copied). + * @param kernel_addrs func_id ↔ dev_addr pairs already uploaded by the + * caller. Stored verbatim so run_prepared can replay + * them onto a fresh Runtime without re-uploading. + * @return 0 on success, negative on failure. + */ + int register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, + const char *config_name, std::vector> kernel_addrs + ); + + /** + * Drop the prepared state for `callable_id` and decrement the SO buffer's + * hash-keyed refcount; frees the device buffer when the count hits zero. + * Kernel binaries are shared across callables and only released by + * finalize(). + */ + int unregister_prepared_callable(int32_t callable_id); + + /** True iff `callable_id` has prepared state staged. */ + bool has_prepared_callable(int32_t callable_id) const; + + /** + * Replay the prepared state for `callable_id` onto a freshly-constructed + * Runtime. See a2a3 onboard documentation for full contract. + */ + int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); + + /** + * Number of distinct callable_ids the AICPU has been asked to dlopen for. + * Monotonically increases on first-sighting bind; never decremented. + */ + size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + private: // Internal state int device_id_{-1}; @@ -398,6 +445,32 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; + // Per-callable_id prepared state (callable.md design). See a2a3 onboard + // device_runner.h for the full design narrative; mirrored here so a5 + // shares the same dispatch surface. 
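+    // Shape of the bookkeeping, with illustrative values for two callable_ids
+    // registered from identical SO bytes (hash H):
+    //   prepared_callables_ : {0 -> state{hash: H, ...}, 1 -> state{hash: H, ...}}
+    //   orch_so_dedup_      : {H -> buffer{dev_addr, capacity, refcount: 2}}
+    //   aicpu_seen_callable_ids_ : callable_ids the AICPU has already dlopen'd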
+    struct PreparedCallableState {
+        uint64_t hash{0};
+        uint64_t dev_orch_so_addr{0};
+        size_t dev_orch_so_size{0};
+        std::string func_name;
+        std::string config_name;
+        std::vector<std::pair<int, uint64_t>> kernel_addrs;
+    };
+    struct OrchSoBuffer {
+        void *dev_addr{nullptr};
+        size_t capacity{0};
+        int refcount{0};
+    };
+    std::unordered_map<int32_t, PreparedCallableState> prepared_callables_;
+    std::unordered_map<uint64_t, OrchSoBuffer> orch_so_dedup_;
+    std::unordered_set<int32_t> aicpu_seen_callable_ids_;
+    // Monotonic AICPU dlopen counter (first-sighting bind only; never decremented).
+    size_t aicpu_dlopen_total_{0};
+    // Sticky flag: prepare_callable was called at least once. Lets finalize()
+    // distinguish legacy-path leaks from prepared-path kernels that legitimately
+    // live until finalize.
+    bool prepared_callable_path_used_{false};
+
     // Performance profiling
     L2PerfCollector l2_perf_collector_;

diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index 12e768ce2..8110263fc 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -40,6 +40,10 @@ extern "C" {
 * Runtime Implementation Functions (defined in runtime_maker.cpp)
 * =========================================================================== */
 int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args);
+#ifdef RUNTIME_HAS_CALLABLE_ID
+int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable);
+int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args);
+#endif
 int validate_runtime_impl(Runtime *runtime);

 /* ===========================================================================
@@ -331,14 +335,165 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { runner->set_log_info_v(log_info_v); }
 /* ===========================================================================
- * Prepared-callable ABI stubs.
+ * Per-callable_id preparation (callable.md design)
  *
- * a5 runtimes do not yet implement the per-callable_id orchestration SO
- * dispatch path described in docs/callable.md (only a2a3/tensormap_and_ringbuffer
- * does). ChipWorker dlsym's these symbols unconditionally, so we expose stubs
- * that fail loudly at call time rather than failing to load the library.
+ * Variants that define RUNTIME_HAS_CALLABLE_ID get the real prepare/run_prepared
+ * implementation; others fall back to dlsym-resolvable stubs that fail loudly
+ * at call time so ChipWorker's unconditional symbol resolution still succeeds.
  * =========================================================================== */
+#ifdef RUNTIME_HAS_CALLABLE_ID
+int prepare_callable(
+    DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary,
+    size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size
+) {
+    if (ctx == NULL || callable == NULL) return -1;
+    DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
+
+    // AICPU/AICore executor binaries are only consumed by run()/run_prepared();
+    // prepare_callable just uploads kernel + orch SO state.
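+    // (Callers pass the executor binaries again to run_prepared, which is
+    // where they are actually consumed.)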
+    (void)aicpu_binary;
+    (void)aicpu_size;
+    (void)aicore_binary;
+    (void)aicore_size;
+
+    pthread_once(&g_runner_key_once, create_runner_key);
+    pthread_setspecific(g_runner_key, ctx);
+    auto tsd_guard = RAIIScopeGuard([]() {
+        pthread_setspecific(g_runner_key, nullptr);
+    });
+
+    try {
+        int rc = runner->prepare_run_context(device_id);
+        if (rc != 0) return rc;
+        auto run_context_guard = RAIIScopeGuard([runner]() {
+            runner->release_run_context();
+        });
+
+        // Temp Runtime so prepare_callable_impl can upload kernels via host_api.
+        alignas(Runtime) uint8_t rt_buf[sizeof(Runtime)];
+        Runtime *r = new (rt_buf) Runtime();
+        r->host_api.device_malloc = device_malloc;
+        r->host_api.device_free = device_free;
+        r->host_api.copy_to_device = copy_to_device;
+        r->host_api.copy_from_device = copy_from_device;
+        r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
+        r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;

+        rc = prepare_callable_impl(r, reinterpret_cast<const ChipCallable *>(callable));
+        if (rc != 0) {
+            r->~Runtime();
+            return rc;
+        }
+
+        // Extract kernel func_id ↔ dev_addr pairs uploaded by prepare_callable_impl.
+        std::vector<std::pair<int, uint64_t>> kernel_addrs;
+        int kcount = r->get_registered_kernel_count();
+        kernel_addrs.reserve(kcount);
+        for (int i = 0; i < kcount; i++) {
+            int fid = r->get_registered_kernel_func_id(i);
+            kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid));
+        }
+        // Clear registered kernels so the Runtime destructor (or any accidental
+        // validate call) does NOT free the kernel binaries we just uploaded —
+        // they belong to the prepared state now.
+        r->clear_registered_kernels();
+
+        rc = runner->register_prepared_callable(
+            callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(),
+            r->get_device_orch_config_name(), std::move(kernel_addrs)
+        );
+        r->~Runtime();
+        return rc;
+    } catch (...)
{ + return -1; + } +} + +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix +) { + if (ctx == NULL || runtime == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + if (!runner->has_prepared_callable(callable_id)) { + LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id); + return -1; + } + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + auto tsd_guard = RAIIScopeGuard([]() { + pthread_setspecific(g_runner_key, nullptr); + }); + + try { + int rc = runner->prepare_run_context(device_id); + if (rc != 0) return rc; + auto run_context_guard = RAIIScopeGuard([runner]() { + runner->release_run_context(); + }); + + Runtime *r = new (runtime) Runtime(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + // Restore kernel addrs + orch symbol names + active_callable_id + rc = runner->bind_prepared_callable_to_runtime(*r, callable_id); + if (rc != 0) { + r->~Runtime(); + return rc; + } + + // Per-run binding (tensor args, GM heap, SM alloc) + rc = bind_prepared_to_runtime_impl(r, reinterpret_cast(args)); + if (rc != 0) { + r->set_gm_sm_ptr(nullptr); + validate_runtime_impl(r); + r->~Runtime(); + return rc; + } + + runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); + runner->set_dump_tensor_enabled(enable_dump_tensor != 0); + runner->set_pmu_enabled(enable_pmu); + runner->set_output_prefix(output_prefix); + + std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); + std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + if (rc != 0) { + validate_runtime_impl(r); + r->~Runtime(); + return rc; + } + + rc = validate_runtime_impl(r); + r->~Runtime(); + return rc; + } catch (...) { + return -1; + } +} + +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { + if (ctx == NULL) return -1; + try { + return static_cast(ctx)->unregister_prepared_callable(callable_id); + } catch (...) { + return -1; + } +} +#else // RUNTIME_HAS_CALLABLE_ID +// Stubs so the dlsym surface is uniform across runtime variants. ChipWorker +// resolves these unconditionally; variants that lack callable.md support +// reject the calls at runtime instead of failing to load the library. int prepare_callable( DeviceContextHandle, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t ) { @@ -353,7 +508,15 @@ int run_prepared( return -1; } int unregister_callable(DeviceContextHandle, int32_t) { return 0; } +#endif // RUNTIME_HAS_CALLABLE_ID -size_t get_aicpu_dlopen_count(DeviceContextHandle) { return 0; } +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->aicpu_dlopen_count(); + } catch (...) 
{ + return 0; + } +} } // extern "C" diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 015419665..0fd9278a5 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -36,6 +36,7 @@ #include "aicpu/platform_aicpu_affinity.h" #include "callable.h" +#include "callable_protocol.h" #include "utils/elf_build_id.h" #include "cpu_sim_context.h" #include "host/raii_scope_guard.h" @@ -653,6 +654,34 @@ void DeviceRunner::unload_executor_binaries() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { +#ifdef RUNTIME_HAS_CALLABLE_ID + // Per-callable_id path (callable.md): mirror onboard. Bytes were staged + // at register_prepared_callable time; here we only stamp metadata onto + // the runtime and resolve `register_new_callable_id_` from first sighting. + const int32_t cid = runtime.get_active_callable_id(); + if (cid >= 0) { + auto it = prepared_callables_.find(cid); + if (it == prepared_callables_.end()) { + LOG_ERROR("prepare_orch_so: callable_id=%d not registered", cid); + return -1; + } + const auto &state = it->second; + const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; + if (first_sighting) { + ++aicpu_dlopen_total_; + } + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size, first_sighting); + runtime.set_active_callable_id(cid, first_sighting); + runtime.pending_orch_so_data_ = nullptr; + runtime.pending_orch_so_size_ = 0; + LOG_INFO_V0( + "Orch SO prepared cid=%d hash=0x%lx %zu bytes (is_new=%d)", cid, state.hash, state.dev_orch_so_size, + first_sighting ? 1 : 0 + ); + return 0; + } +#endif // RUNTIME_HAS_CALLABLE_ID + const void *host_so_data = runtime.pending_orch_so_data_; const size_t host_so_size = runtime.pending_orch_so_size_; runtime.pending_orch_so_data_ = nullptr; @@ -700,6 +729,110 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return 0; } +#ifdef RUNTIME_HAS_CALLABLE_ID +int DeviceRunner::register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, + std::vector> kernel_addrs +) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (orch_so_data == nullptr || orch_so_size == 0) { + LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id); + return -1; + } + + const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size); + + auto buf_it = orch_so_dedup_.find(hash); + uint64_t dev_addr = 0; + if (buf_it == orch_so_dedup_.end()) { + void *buf = mem_alloc_.alloc(orch_so_size); + if (buf == nullptr) { + LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size); + return -1; + } + // Sim shares an address space with the simulated AICPU thread, so a + // plain memcpy is the moral equivalent of rtMemcpy on hardware. 
+ std::memcpy(buf, orch_so_data, orch_so_size); + OrchSoBuffer entry; + entry.dev_addr = buf; + entry.capacity = orch_so_size; + entry.refcount = 1; + orch_so_dedup_.emplace(hash, entry); + dev_addr = reinterpret_cast(buf); + LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size); + } else { + buf_it->second.refcount++; + dev_addr = reinterpret_cast(buf_it->second.dev_addr); + LOG_INFO_V0( + "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount + ); + } + + PreparedCallableState state; + state.hash = hash; + state.dev_orch_so_addr = dev_addr; + state.dev_orch_so_size = orch_so_size; + state.func_name = (func_name != nullptr) ? func_name : ""; + state.config_name = (config_name != nullptr) ? config_name : ""; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + return 0; +} + +int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + return 0; + } + const uint64_t hash = it->second.hash; + prepared_callables_.erase(it); + aicpu_seen_callable_ids_.erase(callable_id); + + auto buf_it = orch_so_dedup_.find(hash); + if (buf_it != orch_so_dedup_.end()) { + if (--buf_it->second.refcount <= 0) { + mem_alloc_.free(buf_it->second.dev_addr); + orch_so_dedup_.erase(buf_it); + } + } + return 0; +} + +bool DeviceRunner::has_prepared_callable(int32_t callable_id) const { + return prepared_callables_.count(callable_id) != 0; +} + +int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id); + return -1; + } + const auto &state = it->second; + for (const auto &kv : state.kernel_addrs) { + if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); + return -1; + } + runtime.func_id_to_addr_[kv.first] = kv.second; + } + runtime.set_device_orch_func_name(state.func_name.c_str()); + runtime.set_device_orch_config_name(state.config_name.c_str()); + runtime.set_active_callable_id(callable_id, /*is_new=*/false); + return 0; +} +#endif // RUNTIME_HAS_CALLABLE_ID + int DeviceRunner::finalize() { // Skip if already finalized if (device_id_ == -1 && aicpu_so_handle_ == nullptr && aicore_so_handle_ == nullptr) { @@ -736,15 +869,26 @@ int DeviceRunner::finalize() { pmu_collector_.finalize(nullptr, free_cb, nullptr); } - // Kernel binaries should have been removed by validate_runtime_impl() + // Kernel binaries are normally released by validate_runtime_impl on the + // legacy run() path. The callable.md prepared-callable path intentionally + // leaves them resident across runs and relies on finalize() to reclaim + // them; that is not a leak. 
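+    // (Same prepared_callable_path_used_ gate as the onboard variant, so
+    // log-based leak checks behave identically on sim and hardware.)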
if (!func_id_to_addr_.empty()) { - LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); - // Cleanup leaked handles and host copies +#ifdef RUNTIME_HAS_CALLABLE_ID + const bool prepared_path_used = prepared_callable_path_used_; +#else + const bool prepared_path_used = false; +#endif + if (prepared_path_used) { + LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); + } else { + LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); + } for (auto &pair : func_id_to_addr_) { MappedKernel &kernel = pair.second; if (kernel.dl_handle != nullptr) { dlclose(kernel.dl_handle); - LOG_DEBUG("Closed leaked kernel: func_id=%d", pair.first); + LOG_DEBUG("Closed kernel: func_id=%d", pair.first); } delete[] kernel.callable_buf; } @@ -761,6 +905,19 @@ int DeviceRunner::finalize() { host_orch_so_copy_.clear(); host_orch_so_copy_.shrink_to_fit(); + // Release any prepared-callable orch SO buffers callers forgot to drop. +#ifdef RUNTIME_HAS_CALLABLE_ID + for (auto &kv : orch_so_dedup_) { + if (kv.second.dev_addr != nullptr) { + mem_alloc_.free(kv.second.dev_addr); + } + } + orch_so_dedup_.clear(); + prepared_callables_.clear(); + aicpu_seen_callable_ids_.clear(); + aicpu_dlopen_total_ = 0; +#endif // RUNTIME_HAS_CALLABLE_ID + // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 636149f18..3a39a31df 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include "common/core_type.h" @@ -208,6 +210,27 @@ class DeviceRunner { */ void remove_kernel_binary(int func_id); + /** + * Stage a per-callable_id orchestration SO and its supporting metadata. + * See a5 onboard or a2a3 device_runner.h for full contract. + */ + int register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, + const char *config_name, std::vector> kernel_addrs + ); + + /** Drop prepared state for `callable_id`; refcounts the dedup'd SO buffer. */ + int unregister_prepared_callable(int32_t callable_id); + + /** True iff `callable_id` has prepared state staged. */ + bool has_prepared_callable(int32_t callable_id) const; + + /** Replay prepared state onto a freshly-constructed Runtime. */ + int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); + + /** Monotonic AICPU dlopen counter (first-sighting only; never decremented). */ + size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + private: // Configuration int device_id_{-1}; @@ -230,6 +253,26 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; + // Per-callable_id prepared state (callable.md design). Mirrors onboard. 
+    struct PreparedCallableState {
+        uint64_t hash{0};
+        uint64_t dev_orch_so_addr{0};
+        size_t dev_orch_so_size{0};
+        std::string func_name;
+        std::string config_name;
+        std::vector<std::pair<int, uint64_t>> kernel_addrs;
+    };
+    struct OrchSoBuffer {
+        void *dev_addr{nullptr};
+        size_t capacity{0};
+        int refcount{0};
+    };
+    std::unordered_map<int32_t, PreparedCallableState> prepared_callables_;
+    std::unordered_map<uint64_t, OrchSoBuffer> orch_so_dedup_;
+    std::unordered_set<int32_t> aicpu_seen_callable_ids_;
+    size_t aicpu_dlopen_total_{0};
+    bool prepared_callable_path_used_{false};
+
     // Runtime pointer for print_handshake_results
     Runtime *last_runtime_{nullptr};

diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
index dee315ec8..00aa79028 100644
--- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
@@ -36,6 +36,10 @@ extern "C" {
 * Runtime Implementation Functions (defined in runtime_maker.cpp)
 * =========================================================================== */
 int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args);
+#ifdef RUNTIME_HAS_CALLABLE_ID
+int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable);
+int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args);
+#endif
 int validate_runtime_impl(Runtime *runtime);

 /* ===========================================================================
@@ -282,14 +286,147 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { runner->set_log_info_v(log_info_v); }
 /* ===========================================================================
- * Prepared-callable ABI stubs.
- *
- * a5 runtimes do not yet implement the per-callable_id orchestration SO
- * dispatch path described in docs/callable.md (only a2a3/tensormap_and_ringbuffer
- * does). ChipWorker dlsym's these symbols unconditionally, so we expose stubs
- * that fail loudly at call time rather than failing to load the library.
+ * Per-callable_id preparation (callable.md design)
  * =========================================================================== */
+#ifdef RUNTIME_HAS_CALLABLE_ID
+int prepare_callable(
+    DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary,
+    size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size
+) {
+    if (ctx == NULL || callable == NULL) return -1;
+    DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
+
+    (void)aicpu_binary;
+    (void)aicpu_size;
+    (void)aicore_binary;
+    (void)aicore_size;
+    (void)device_id;
+
+    pthread_once(&g_runner_key_once, create_runner_key);
+    pthread_setspecific(g_runner_key, ctx);
+
+    try {
+        // Temp Runtime so prepare_callable_impl can upload kernels via host_api.
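+        // (Stack placement suffices while only the trb runtimes define
+        // RUNTIME_HAS_CALLABLE_ID; a later patch in this series moves this to a
+        // heap-allocated std::unique_ptr once the hbg variants, whose Runtime
+        // embeds 131072 Tasks, join the prepared-callable path.)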
+ alignas(Runtime) uint8_t rt_buf[sizeof(Runtime)]; + Runtime *r = new (rt_buf) Runtime(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + int rc = prepare_callable_impl(r, reinterpret_cast(callable)); + if (rc != 0) { + r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + std::vector> kernel_addrs; + int kcount = r->get_registered_kernel_count(); + kernel_addrs.reserve(kcount); + for (int i = 0; i < kcount; i++) { + int fid = r->get_registered_kernel_func_id(i); + kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid)); + } + r->clear_registered_kernels(); + + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); + return rc; + } catch (...) { + pthread_setspecific(g_runner_key, nullptr); + return -1; + } +} + +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix +) { + if (ctx == NULL || runtime == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + if (!runner->has_prepared_callable(callable_id)) { + LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id); + return -1; + } + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + + try { + Runtime *r = new (runtime) Runtime(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + int rc = runner->bind_prepared_callable_to_runtime(*r, callable_id); + if (rc != 0) { + r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + rc = bind_prepared_to_runtime_impl(r, reinterpret_cast(args)); + if (rc != 0) { + r->set_gm_sm_ptr(nullptr); + validate_runtime_impl(r); + r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); + runner->set_dump_tensor_enabled(enable_dump_tensor != 0); + runner->set_pmu_enabled(enable_pmu); + runner->set_output_prefix(output_prefix); + + std::vector aicpu_vec; + std::vector aicore_vec; + if (aicpu_binary != NULL && aicpu_size > 0) { + aicpu_vec.assign(aicpu_binary, aicpu_binary + aicpu_size); + } + if (aicore_binary != NULL && aicore_size > 0) { + aicore_vec.assign(aicore_binary, aicore_binary + aicore_size); + } + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + if (rc != 0) { + validate_runtime_impl(r); + r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + rc = validate_runtime_impl(r); + r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); + return rc; + } catch (...) 
{ + pthread_setspecific(g_runner_key, nullptr); + return -1; + } +} + +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { + if (ctx == NULL) return -1; + try { + return static_cast(ctx)->unregister_prepared_callable(callable_id); + } catch (...) { + return -1; + } +} +#else // RUNTIME_HAS_CALLABLE_ID int prepare_callable( DeviceContextHandle, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t ) { @@ -304,7 +441,15 @@ int run_prepared( return -1; } int unregister_callable(DeviceContextHandle, int32_t) { return 0; } +#endif // RUNTIME_HAS_CALLABLE_ID -size_t get_aicpu_dlopen_count(DeviceContextHandle) { return 0; } +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->aicpu_dlopen_count(); + } catch (...) { + return 0; + } +} } // extern "C" diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index ec7c6e4ee..b10724738 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -24,6 +24,7 @@ #include "aicpu/device_time.h" #include "aicpu/orch_so_file.h" +#include "callable_protocol.h" #include "pto2_dispatch_payload.h" #include "runtime.h" #include "spin_hint.h" @@ -89,6 +90,25 @@ static int32_t read_runtime_status(Runtime *runtime) { static PTO2Runtime *rt{nullptr}; +// Per-callable_id orchestration SO table. AICPU side of the callable.md +// design: when `runtime->active_callable_id_ >= 0` the executor dispatches +// `orch_so_table_[active_callable_id_]` (created on first sighting of that +// callable_id, kept warm across runs); when `active_callable_id_ < 0` it +// falls back to the legacy single slot governed by `has_new_orch_so_`. +// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values +// (mailbox uint32 callable_id, register() returns small ints) and is shared +// with the host bounds check in DeviceRunner::register_prepared_callable — +// see src/common/task_interface/callable_protocol.h. + +struct OrchSoEntry { + bool in_use{false}; + void *handle{nullptr}; + char path[256]{}; + DeviceOrchestrationFunc func{nullptr}; + DeviceOrchestrationBindRuntimeFunc bind{nullptr}; + DeviceOrchestrationConfigFunc config_func{nullptr}; +}; + struct AicpuExecutor { int32_t sched_thread_num_; bool orch_to_sched_{false}; @@ -107,9 +127,9 @@ struct AicpuExecutor { std::atomic finished_count_{0}; std::atomic runtime_init_ready_{false}; - // Orchestration SO handle - defer dlclose until all tasks complete + // Legacy single-slot orch SO cache (active_callable_id_ == -1 path). void *orch_so_handle_{nullptr}; - char orch_so_path_[256]{}; // Path to orchestration SO file for cleanup + char orch_so_path_[256]{}; // Shared orchestration function pointer (loaded by first orch thread, used by all) DeviceOrchestrationFunc orch_func_{nullptr}; @@ -117,6 +137,11 @@ struct AicpuExecutor { DeviceOrchestrationConfigFunc orch_config_func_{nullptr}; const ChipStorageTaskArgs *orch_args_cached_{nullptr}; + // Per-callable_id table (active_callable_id_ >= 0 path). Single orch thread today, so + // first-write/read race is not possible; if multiple orch threads are + // ever introduced, guard the in_use=false→true transition with a mutex. 
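+    // A sketch of such a guard, should that day come (`table_mu_` is
+    // hypothetical, not a member today):
+    //   std::lock_guard<std::mutex> lk(table_mu_);
+    //   if (!orch_so_table_[callable_id].in_use) { /* load SO, publish, set in_use */ }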
+ OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]; + // ===== Scheduler context (owns all dispatch/completion/drain state) ===== SchedulerContext sched_ctx_; @@ -126,8 +151,9 @@ struct AicpuExecutor { void deinit(Runtime *runtime); ~AicpuExecutor() { - // Process-wide teardown (the single static instance dies here). The - // handle is otherwise kept alive across runs for cache-hit reuse. + // Process-wide teardown (the single static instance dies here). Both + // the legacy slot and every in-use callable_id slot are dlclose()'d here; + // each is otherwise kept alive across runs for cache-hit reuse. if (orch_so_handle_ != nullptr) { dlclose(orch_so_handle_); orch_so_handle_ = nullptr; @@ -136,6 +162,12 @@ struct AicpuExecutor { unlink(orch_so_path_); orch_so_path_[0] = '\0'; } + for (auto &e : orch_so_table_) { + if (!e.in_use) continue; + if (e.handle != nullptr) dlclose(e.handle); + if (e.path[0] != '\0') unlink(e.path); + e = OrchSoEntry{}; + } } }; @@ -197,29 +229,46 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (runtime->get_orch_built_on_host()) { LOG_INFO_V0("Thread %d: Host orchestration mode, no-op", thread_idx); } else { - // Two paths: - // 1) has_new_orch_so == true → host believes the SO identity - // changed, so we drop the cached handle (if any), write the - // new bytes to disk, and dlopen + dlsym a fresh handle. - // 2) has_new_orch_so == false → host detected a cache hit, so - // we reuse `orch_so_handle_` / `orch_func_` / `orch_bind_runtime_` - // from the previous run untouched. sm_handle / rt below are - // always recreated because they bind this run's memory. - const bool reload_so = runtime->has_new_orch_so(); + // Per-callable_id dispatch (callable.md): when active_callable_id_ >= 0 the orch + // SO state lives in `orch_so_table_[callable_id]` keyed by registration + // order; reload is governed by `register_new_callable_id_`. When + // active_callable_id_ < 0 we fall back to the legacy single-slot cache + // governed by `has_new_orch_so_`. The local pointers below let + // the rest of this branch ignore the choice. + const int32_t callable_id = runtime->get_active_callable_id(); + const bool use_table = (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS); + // -1 is the legacy sentinel that intentionally falls back to the + // single-slot orch SO cache. Any other negative value is a + // protocol violation. + if (callable_id >= MAX_REGISTERED_CALLABLE_IDS || (callable_id < 0 && callable_id != -1)) { + LOG_ERROR( + "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS + ); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + void **p_handle = use_table ? &orch_so_table_[callable_id].handle : &orch_so_handle_; + char *p_path = use_table ? orch_so_table_[callable_id].path : orch_so_path_; + DeviceOrchestrationFunc *p_func = use_table ? &orch_so_table_[callable_id].func : &orch_func_; + DeviceOrchestrationBindRuntimeFunc *p_bind = + use_table ? &orch_so_table_[callable_id].bind : &orch_bind_runtime_; + DeviceOrchestrationConfigFunc *p_config_func = + use_table ? &orch_so_table_[callable_id].config_func : &orch_config_func_; + const bool reload_so = use_table ? 
runtime->register_new_callable_id() : runtime->has_new_orch_so(); if (reload_so) { - LOG_INFO_V0("Thread %d: New orch SO detected, (re)loading", thread_idx); - if (orch_so_handle_ != nullptr) { - dlclose(orch_so_handle_); - orch_so_handle_ = nullptr; - orch_func_ = nullptr; - orch_bind_runtime_ = nullptr; - if (orch_so_path_[0] != '\0') { + LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id); + if (*p_handle != nullptr) { + dlclose(*p_handle); + *p_handle = nullptr; + *p_func = nullptr; + *p_bind = nullptr; + if (p_path[0] != '\0') { // Unlink the old file so the new open() lands on a // fresh inode — protects against SIGBUS / ETXTBSY when // the kernel still has the old mapping pinned. - unlink(orch_so_path_); - orch_so_path_[0] = '\0'; + unlink(p_path); + p_path[0] = '\0'; } } @@ -242,7 +291,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); for (int32_t i = 0; i < num_candidates && !file_created; i++) { - int32_t fd = create_orch_so_file(candidate_dirs[i], /*callable_id=*/-1, so_path, sizeof(so_path)); + int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path)); if (fd < 0) { LOG_INFO_V0( "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno @@ -333,15 +382,21 @@ int32_t AicpuExecutor::run(Runtime *runtime) { bind_runtime_func = nullptr; } - orch_so_handle_ = handle; - orch_func_ = orch_func; - orch_bind_runtime_ = bind_runtime_func; - orch_config_func_ = config_func; - snprintf(orch_so_path_, sizeof(orch_so_path_), "%s", so_path); + *p_handle = handle; + *p_func = orch_func; + *p_bind = bind_runtime_func; + *p_config_func = config_func; + snprintf(p_path, 256, "%s", so_path); + if (use_table) orch_so_table_[callable_id].in_use = true; } else { - LOG_INFO_V0("Thread %d: Reusing cached orch SO handle=%p", thread_idx, orch_so_handle_); - if (orch_so_handle_ == nullptr || orch_func_ == nullptr) { - LOG_ERROR("Thread %d: has_new_orch_so=false but no cached SO handle/func", thread_idx); + LOG_INFO_V0( + "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id + ); + if (*p_handle == nullptr || *p_func == nullptr) { + LOG_ERROR( + "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx, + callable_id + ); // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -349,8 +404,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } // Validate arg count on every run (reload or cache hit). - if (orch_config_func_ != nullptr) { - PTO2OrchestrationConfig cfg = orch_config_func_(runtime->get_orch_args()); + if (*p_config_func != nullptr) { + PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args()); LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count); if (cfg.expected_arg_count > 0) { const ChipStorageTaskArgs &args_validate = runtime->get_orch_args(); @@ -361,17 +416,18 @@ int32_t AicpuExecutor::run(Runtime *runtime) { cfg.expected_arg_count ); // Clean up cached state so a subsequent run does a full reload. 
- if (orch_so_handle_ != nullptr) { - dlclose(orch_so_handle_); - orch_so_handle_ = nullptr; + if (*p_handle != nullptr) { + dlclose(*p_handle); + *p_handle = nullptr; } - if (orch_so_path_[0] != '\0') { - unlink(orch_so_path_); - orch_so_path_[0] = '\0'; + if (p_path[0] != '\0') { + unlink(p_path); + p_path[0] = '\0'; } - orch_func_ = nullptr; - orch_bind_runtime_ = nullptr; - orch_config_func_ = nullptr; + *p_func = nullptr; + *p_bind = nullptr; + *p_config_func = nullptr; + if (use_table) orch_so_table_[callable_id].in_use = false; // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -473,11 +529,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) { orch_cycle_start = get_sys_cnt_aicpu(); #endif framework_bind_runtime(rt); - if (orch_bind_runtime_ != nullptr) { - orch_bind_runtime_(rt); + if (*p_bind != nullptr) { + (*p_bind)(rt); } rt_scope_begin(rt); - orch_func_(*orch_args_cached_); + (*p_func)(*orch_args_cached_); rt_scope_end(rt); #if PTO2_PROFILING uint64_t orch_cycle_end = get_sys_cnt_aicpu(); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 4c4e8dd9c..ccd03d898 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -92,31 +92,29 @@ static int32_t read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *hos } /** - * Initialize a pre-allocated runtime for device orchestration. + * Stage the per-callable resources (kernel binaries + orchestration SO) into + * the supplied runtime so a subsequent bind_prepared_to_runtime_impl can use + * them. This is the cacheable half of init_runtime_impl: nothing here depends + * on per-run argument values, so callable.md's prepare_callable / run_prepared + * split lets us run this once per callable_id and amortize across runs. * - * For rt2 runtime, orchestration runs on AICPU thread 3 (device-side). 
- * This function: - * - Copies tensor metadata and replaces host pointers with device pointers - * - Copies all tensor data to device - * - Records all tensors for copy-back - * - Copies orchestration SO to device memory - * - Sets up runtime state for device orchestration - * - * @param runtime Pointer to pre-constructed Runtime - * @param callable ChipCallable containing orch binary, func_name, and child kernels - * @param orch_args Separated tensor/scalar arguments + * @param runtime Pointer to pre-constructed Runtime (host_api populated) + * @param callable ChipCallable carrying the orch SO + child kernel binaries * @return 0 on success, -1 on failure */ -extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - // Validate inputs +extern "C" int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) { if (runtime == nullptr) { LOG_ERROR("Runtime pointer is null"); return -1; } + if (callable == nullptr) { + LOG_ERROR("Callable pointer is null"); + return -1; + } // Register kernel binaries from ChipCallable children if (callable->child_count() > 0) { - LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count()); + LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count()); for (int32_t i = 0; i < callable->child_count(); i++) { int func_id = callable->child_func_id(i); if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { @@ -146,6 +144,32 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, return -1; } + // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume. + runtime->pending_orch_so_data_ = orch_so_binary; + runtime->pending_orch_so_size_ = orch_so_size; + LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size); + return 0; +} + +/** + * Per-run binding: build device-side argument storage (tensor copy-out, GM + * heap, PTO2 shared memory) and publish it to the runtime. Assumes the + * callable-side state (kernel binaries, orch SO bytes, func/config names) + * is already populated by prepare_callable_impl. + * + * Splitting this from prepare_callable_impl matches the callable.md design: + * register/run_prepared invokes this every call, while the prep half runs + * only once per callable_id. + * + * @param runtime Pointer to pre-constructed Runtime (host_api populated) + * @param orch_args Separated tensor/scalar arguments for this run + * @return 0 on success, -1 on failure + */ +extern "C" int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } if (orch_args == nullptr) { LOG_ERROR("orch_args pointer is null"); return -1; @@ -153,7 +177,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, int tensor_count = orch_args->tensor_count(); int scalar_count = orch_args->scalar_count(); - LOG_INFO_V0("RT2 init: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); + LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); int64_t t_total_start = _now_ms(); @@ -196,13 +220,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, } int64_t t_args_end = _now_ms(); - // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume. 
- int64_t t_so_start = _now_ms(); - runtime->pending_orch_so_data_ = orch_so_binary; - runtime->pending_orch_so_size_ = orch_so_size; - LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size); - int64_t t_so_end = _now_ms(); - // Read ready queue shard count from environment for AICPU scheduler { const char *env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS"); @@ -282,7 +299,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, int64_t t_total_end = _now_ms(); LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start); - LOG_INFO_V0("TIMING: orch_so_copy = %" PRId64 "ms", t_so_end - t_so_start); LOG_INFO_V0("TIMING: gm_heap_alloc(1GB) = %" PRId64 "ms", t_heap_end - t_heap_start); LOG_INFO_V0("TIMING: shared_mem_alloc = %" PRId64 "ms", t_sm_end - t_sm_start); LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); @@ -290,6 +306,19 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, return 0; } +/** + * Compatibility shim: `init_runtime_impl` is the legacy single-call path that + * still drives every `run_runtime` invocation today. The callable.md split + * keeps it as `prepare_callable_impl + bind_prepared_to_runtime_impl` so the + * legacy path stays one function to platform code, while `run_prepared` can + * skip the prepare half once a callable_id is staged. + */ +extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { + int rc = prepare_callable_impl(runtime, callable); + if (rc != 0) return rc; + return bind_prepared_to_runtime_impl(runtime, orch_args); +} + /** * Validate runtime results and cleanup. * From e5f6656ef8fefc16aea0159e98c15b889ea8952d Mon Sep 17 00:00:00 2001 From: poursoul Date: Thu, 7 May 2026 17:29:25 +0800 Subject: [PATCH 15/28] =?UTF-8?q?feat(callable):=20Phase=201=20=E2=80=94?= =?UTF-8?q?=20mirror=20prepared=5Fcallable=20ST=20test=20to=20a5/trb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit End-to-end coverage for prepare_callable / run_prepared / unregister_callable on a5/tensormap_and_ringbuffer, structurally identical to the a2a3 test: shared-orch double-cid run, same-cid repeat dlopen accounting, two-cid interleaved dlopen accounting, double-prepare rejection, and unregister + re-prepare counter monotonicity. Reuses the orch_so_cache single-task orchestration and mixed_example kernel_add_standalone so the test stays focused on the prepare/run ABI. --- .../test_prepared_callable.py | 216 ++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py new file mode 100644 index 000000000..ed1d8751c --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""End-to-end test for ChipWorker.prepare_callable / run_prepared / unregister_callable on a5/trb. + +Mirrors tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable. Uses the +single-task orchestration borrowed from `orch_so_cache` plus +`mixed_example/kernels/aiv/kernel_add_standalone.cpp` so the test stays +focused on the prepare/run_prepared ABI rather than orchestration richness. + +aicpu_dlopen_count assertions verify that the per-cid AICPU dispatch table +collapses repeated runs of the same callable_id into a single AICPU dlopen, +matching docs/callable.md §7. +""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +_MIXED_KERNELS = "../mixed_example/kernels" +_ORCH_SO_CACHE = "../orch_so_cache" + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestPreparedCallable(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable ABI on a5/trb.""" + + CALLABLE = { + "orchestration": { + "source": f"{_ORCH_SO_CACHE}/kernels/orchestration/example_orchestration.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{_MIXED_KERNELS}/aiv/kernel_add_standalone.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3} + _PLATFORMS = ["a5sim", "a5"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + # f = a + b (kernel_add_standalone) + args.f[:] = args.a + args.b + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + # 1) prepare two callable_ids with the SAME callable (shared orch SO) + worker.prepare_callable(0, callable_obj) + worker.prepare_callable(1, callable_obj) + + # 2) run_prepared cid=0 twice (second run proves dedup/cache hit) + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(0, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 3) run_prepared cid=1 — different slot, same 
SO, must also work + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(1, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 4) unregister both — should not raise + worker.unregister_callable(0) + worker.unregister_callable(1) + + # ------------------------------------------------------------------ + # aicpu_dlopen_count assertions (callable.md §7 verification). + # See a2a3 prepared_callable test for the contract notes. + # ------------------------------------------------------------------ + + def _setup_dlopen_count_test(self, st_worker, st_platform): + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + """Case A: prepare(0) + run(0) × 5 → dlopen_count delta == 1.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(0, callable_obj) + for _ in range(5): + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1, ( + f"expected exactly 1 new dlopen for 5 runs of cid=0, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(0) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + """Case B: prepare(0)+prepare(1) + (run(0),run(1)) × 5 → delta == 2.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(1, callable_obj) + for _ in range(5): + self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, 1, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"expected exactly 2 new dlopens for cids {{0,1}} interleaved, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(0) + st_worker.unregister_callable(1) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + """Case C: prepare(0) + prepare(0) → second call raises RuntimeError.""" + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(0, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(0, callable_obj) + finally: + st_worker.unregister_callable(0) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + """Case D: prepare+run+unregister+prepare+run → counter monotonic, delta == 2.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + registered = False + try: + 
st_worker.prepare_callable(0, callable_obj) + registered = True + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1 + st_worker.unregister_callable(0) + registered = False + after_unreg = st_worker.aicpu_dlopen_count + assert after_unreg - baseline == 1, ( + f"unregister must NOT decrement the dlopen counter; baseline={baseline}, after_unreg={after_unreg}" + ) + st_worker.prepare_callable(0, callable_obj) + registered = True + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"after re-prepare expected counter +2 (two distinct AICPU dlopens), " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + if registered: + st_worker.unregister_callable(0) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) From 3bedebc5b8d136b36586455c9f1562d012b2e2c0 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 8 May 2026 10:37:00 +0800 Subject: [PATCH 16/28] =?UTF-8?q?feat(callable):=20Phase=202=20=E2=80=94?= =?UTF-8?q?=20host=5Fbuild=5Fgraph=20prepare/run=5Fprepared=20with=20cache?= =?UTF-8?q?d=20host=20dlopen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 4 hbg runtime.h (a2a3+a5): add RUNTIME_HAS_CALLABLE_ID + RUNTIME_HOST_ORCH defines and pending_host_dlopen_handle_/pending_host_orch_func_ptr_ host staging fields. - 4 runtimes (trb+hbg): add replay_function_bin_addr(func_id, addr) — does not record into registered_kernel_func_ids_, lets platform replay prepared kernel bindings without triggering validate-time release. Unifies func_id_to_addr_ access via member function. - 2 hbg runtime_maker.cpp: split init_runtime_impl into prepare_callable_impl (dlopen+dlsym → staging fields) and bind_prepared_to_runtime_impl (read fn_ptr, call orch_func, build graph). Legacy init_runtime_impl is now a shim (dlclose at end). - 4 platform device_runner.{h,cpp} (a2a3/a5 × onboard/sim): PreparedCallableState extended with host_dlopen_handle/host_orch_func_ptr; new register_prepared_callable_host_orch + host_dlopen_count + host_dlopen_total_; unregister_prepared_callable branches on host_dlopen_handle (hbg → dlclose, trb → orch_so_dedup_ refcount); bind_prepared_callable_to_runtime uses replay_function_bin_addr; host orch fields restored under #ifdef RUNTIME_HOST_ORCH; prepare_orch_so early- returns for hbg (zeroes dev_orch_so to skip AICPU counting). - 4 pto_runtime_c_api.cpp: prepare_callable uses std::unique_ptr (hbg Runtime holds 131072 Tasks ≈ tens of MB, too large for stack); routes to register_prepared_callable_host_orch under #ifdef RUNTIME_HOST_ORCH; exports get_host_dlopen_count. - chip_worker.{h,cpp}: add host_dlopen_count() getter and dlsym binding. - bindings/task_interface.cpp + python/simpler/{task_interface,worker}.py: expose host_dlopen_count attribute. Verified: 4 sim binaries compile, 4 variants × 5 prepared_callable ST tests pass (20 total), tests/ut/py/test_chip_worker.py 15 pass, a2a3/hbg vector_example regression passes. 
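Sketch of the counter contract the new ST assertions lean on, written against
the C ABI (illustrative call shape on an hbg variant, arguments elided, not a
captured trace):

    prepare_callable(ctx, /*callable_id=*/0, callable, dev, ...);  // host dlopen #1
    run_prepared(ctx, rt, /*callable_id=*/0, args, ...);           // cached fn ptr, no dlopen
    run_prepared(ctx, rt, /*callable_id=*/0, args, ...);           // still no dlopen
    // get_host_dlopen_count(ctx)  == 1 on hbg variants
    // get_aicpu_dlopen_count(ctx) == 0 on hbg (orch never reaches the AICPU)

unregister_callable(ctx, 0) dlcloses the cached host handle but, like the
AICPU counter, never decrements host_dlopen_count.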
--- python/bindings/task_interface.cpp | 6 ++ python/simpler/task_interface.py | 5 + python/simpler/worker.py | 11 ++ .../platform/onboard/host/device_runner.cpp | 62 ++++++++++- .../platform/onboard/host/device_runner.h | 46 +++++++- .../onboard/host/pto_runtime_c_api.cpp | 32 +++++- src/a2a3/platform/sim/host/device_runner.cpp | 55 +++++++++- src/a2a3/platform/sim/host/device_runner.h | 15 +++ .../platform/sim/host/pto_runtime_c_api.cpp | 29 ++++- .../host_build_graph/host/runtime_maker.cpp | 102 ++++++++++++------ .../host_build_graph/runtime/runtime.h | 34 ++++++ .../runtime/runtime.h | 7 ++ .../runtime/shared/runtime.cpp | 8 ++ .../platform/onboard/host/device_runner.cpp | 57 +++++++++- src/a5/platform/onboard/host/device_runner.h | 31 +++++- .../onboard/host/pto_runtime_c_api.cpp | 28 ++++- src/a5/platform/sim/host/device_runner.cpp | 54 +++++++++- src/a5/platform/sim/host/device_runner.h | 17 ++- .../platform/sim/host/pto_runtime_c_api.cpp | 27 ++++- .../host_build_graph/host/runtime_maker.cpp | 102 ++++++++++++------ .../host_build_graph/runtime/runtime.h | 26 +++++ .../runtime/runtime.h | 7 ++ .../runtime/shared/runtime.cpp | 8 ++ src/common/worker/chip_worker.cpp | 9 ++ src/common/worker/chip_worker.h | 5 + src/common/worker/pto_runtime_c_api.h | 8 ++ 26 files changed, 688 insertions(+), 103 deletions(-) diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 55f0b7dfc..88862e66a 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -704,6 +704,12 @@ NB_MODULE(_task_interface, m) { "variant lacks per-cid registration. Tests assert this to verify " "prepare_callable + repeated run_prepared do not redundantly dlopen." ) + .def_prop_ro( + "host_dlopen_count", &ChipWorker::host_dlopen_count, + "Number of host-side dlopens triggered by prepare_callable on " + "host_build_graph variants. Mirrors aicpu_dlopen_count for the " + "host-orchestration path; 0 on device-orch variants." + ) .def("malloc", &ChipWorker::malloc, nb::arg("size")) .def("free", &ChipWorker::free, nb::arg("ptr")) .def("copy_to", &ChipWorker::copy_to, nb::arg("dst"), nb::arg("src"), nb::arg("size")) diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index 5124b4390..6362e90e3 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -362,6 +362,11 @@ def aicpu_dlopen_count(self): """Number of distinct callable_ids the AICPU has dlopened for.""" return self._impl.aicpu_dlopen_count + @property + def host_dlopen_count(self): + """Number of host-side orch SO dlopens (host_build_graph variants).""" + return self._impl.host_dlopen_count + def malloc(self, size): """Allocate memory. Returns a pointer (uint64).""" return int(self._impl.malloc(int(size))) diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 68e513cb5..e3be2d70b 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -1322,6 +1322,17 @@ def aicpu_dlopen_count(self) -> int: return 0 return self._chip_worker.aicpu_dlopen_count + @property + def host_dlopen_count(self) -> int: + """L2 only: number of host-side orch SO dlopens (hbg variants). + + Mirrors ``aicpu_dlopen_count`` for the host_build_graph path. Returns + 0 on non-L2 workers or device-orch variants (trb). 
+ """ + if self.level != 2 or self._chip_worker is None: + return 0 + return self._chip_worker.host_dlopen_count + def _run_as_child(self, cid: int, args, config) -> None: """Called from C++ _Worker::run when this Worker is a THREAD-mode child. diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index c8a7cb8d1..ef0556503 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -760,6 +760,14 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return -1; } const auto &state = it->second; + // hbg variant: orch SO never crosses the host/device boundary, so the + // AICPU does no per-cid dlopen. Skip the orch_so_table_ bookkeeping + // (and the AICPU dlopen counter) and clear the device-orch metadata. + if (state.host_dlopen_handle != nullptr) { + runtime.set_dev_orch_so(0, 0, false); + runtime.set_active_callable_id(cid, /*is_new=*/false); + return 0; + } const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; if (first_sighting) { ++aicpu_dlopen_total_; @@ -906,16 +914,53 @@ int DeviceRunner::register_prepared_callable( return 0; } +int DeviceRunner::register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs +) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id, + MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) { + LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id); + return -1; + } + + PreparedCallableState state; + state.host_dlopen_handle = host_dlopen_handle; + state.host_orch_func_ptr = host_orch_func_ptr; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + ++host_dlopen_total_; + LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_); + return 0; +} + int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { auto it = prepared_callables_.find(callable_id); if (it == prepared_callables_.end()) { return 0; } - const uint64_t hash = it->second.hash; + PreparedCallableState state = std::move(it->second); prepared_callables_.erase(it); aicpu_seen_callable_ids_.erase(callable_id); - auto buf_it = orch_so_dedup_.find(hash); + if (state.host_dlopen_handle != nullptr) { + // hbg path: no orch SO refcount, just dlclose the host handle. 
+ dlclose(state.host_dlopen_handle); + return 0; + } + + auto buf_it = orch_so_dedup_.find(state.hash); if (buf_it != orch_so_dedup_.end()) { if (--buf_it->second.refcount <= 0) { mem_alloc_.free(buf_it->second.dev_addr); @@ -946,10 +991,19 @@ int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t ca LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); return -1; } - runtime.func_id_to_addr_[kv.first] = kv.second; - } + runtime.replay_function_bin_addr(kv.first, kv.second); + } +#ifdef RUNTIME_HOST_ORCH + // hbg: replay the cached host dlopen so bind_prepared_to_runtime_impl can + // invoke orch_func without redoing dlopen+dlsym. Guarded by + // RUNTIME_HOST_ORCH because trb's Runtime has no host-orch staging fields. + runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; + runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; +#else + // trb: AICPU dlopens from device buffer using the entry-symbol names. runtime.set_device_orch_func_name(state.func_name.c_str()); runtime.set_device_orch_config_name(state.config_name.c_str()); +#endif // Stamp callable_id with is_new=false; prepare_orch_so refreshes the flag // with the authoritative first_sighting answer right before launch. runtime.set_active_callable_id(callable_id, /*is_new=*/false); diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index a9efab30e..5c6aa1715 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -446,12 +446,28 @@ class DeviceRunner { ); /** - * Drop the prepared state for `callable_id` and decrement the SO buffer's - * hash-keyed refcount; frees the device buffer when the count hits zero. - * Kernel binaries are shared across callables and only released by - * finalize(). + * Host-orchestration variant of register_prepared_callable: stores a + * dlopen handle + entry-symbol pointer that runtime_maker resolved on the + * host (host_build_graph variant). Mutually exclusive with the trb-shaped + * `register_prepared_callable` overload — exactly one is invoked for a + * given callable_id, picked by the C ABI based on which staging fields the + * runtime carries after prepare_callable_impl. dlopen handle is owned by + * DeviceRunner from this call onward and dlclose'd by + * unregister_prepared_callable. Increments `host_dlopen_count_`. + */ + int register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs + ); + + /** + * Drop the prepared state for `callable_id`. trb path: decrement the orch + * SO buffer's hash-keyed refcount and free when it hits zero. hbg path: + * dlclose the host dlopen handle. Kernel binaries are shared across + * callables and only released by finalize(). * - * @param callable_id Id previously passed to register_prepared_callable. + * @param callable_id Id previously passed to one of the + * register_prepared_callable* overloads. * @return 0 on success or if the id was not registered. */ int unregister_prepared_callable(int32_t callable_id); @@ -489,8 +505,19 @@ class DeviceRunner { * registration eliminates duplicate dlopens across repeated runs. */ size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + + /** + * Number of host-side dlopen() invocations triggered by + * `register_prepared_callable_host_orch`. 
Mirrors `aicpu_dlopen_count` but + * counts the host_build_graph variant's host-side dlopens; it never + * decrements (re-prepare after unregister still counts). Tests assert + * `host_dlopen_count == distinct_registered_cids` to verify the prepared + * path doesn't dlopen on every run. + */ + size_t host_dlopen_count() const { return host_dlopen_total_; } #else // RUNTIME_HAS_CALLABLE_ID size_t aicpu_dlopen_count() const { return 0; } + size_t host_dlopen_count() const { return 0; } #endif // RUNTIME_HAS_CALLABLE_ID private: @@ -539,12 +566,17 @@ class DeviceRunner { // register_new_callable_id_ correctly on first sighting. #ifdef RUNTIME_HAS_CALLABLE_ID struct PreparedCallableState { + // trb path (AICPU dlopens orch SO from device buffer) uint64_t hash{0}; uint64_t dev_orch_so_addr{0}; size_t dev_orch_so_size{0}; std::string func_name; std::string config_name; + // common std::vector> kernel_addrs; + // hbg path (host already dlopen'd the orch SO) + void *host_dlopen_handle{nullptr}; + void *host_orch_func_ptr{nullptr}; }; struct OrchSoBuffer { void *dev_addr{nullptr}; @@ -559,6 +591,10 @@ class DeviceRunner { // aicpu_seen_callable_ids_.size() once any cid is unregistered and // re-prepared. Exposed via aicpu_dlopen_count() for tests. size_t aicpu_dlopen_total_{0}; + // Monotonic count of host-side dlopens triggered (incremented on every + // register_prepared_callable_host_orch call; never decremented). Same + // re-prepare semantics as aicpu_dlopen_total_, but for hbg variants. + size_t host_dlopen_total_{0}; // Sticky flag: prepare_callable was called at least once. Distinguishes // legacy-path "kernel still cached at finalize" leaks from prepared-path // kernels that legitimately live until finalize. diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index ce7709655..c2c655184 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -21,6 +21,8 @@ #include "task_args.h" #include + +#include #include #include "common/unified_log.h" @@ -332,9 +334,10 @@ int prepare_callable( runner->release_run_context(); }); - // Temp Runtime so prepare_callable_impl can upload kernels via host_api. - alignas(Runtime) uint8_t rt_buf[sizeof(Runtime)]; - Runtime *r = new (rt_buf) Runtime(); + // Heap-allocate: hbg's Runtime carries 131072 Tasks → tens of MB, + // larger than the default thread stack. + std::unique_ptr r_owner = std::make_unique(); + Runtime *r = r_owner.get(); r->host_api.device_malloc = device_malloc; r->host_api.device_free = device_free; r->host_api.copy_to_device = copy_to_device; @@ -344,7 +347,6 @@ int prepare_callable( rc = prepare_callable_impl(r, reinterpret_cast(callable)); if (rc != 0) { - r->~Runtime(); return rc; } @@ -361,11 +363,22 @@ int prepare_callable( // they belong to the prepared state now. r->clear_registered_kernels(); +#ifdef RUNTIME_HOST_ORCH + // hbg: prepare_callable_impl already dlopen+dlsym'd on the host. Hand + // the handle/fn pointer over to DeviceRunner; it owns the dlopen + // lifetime from here until unregister_prepared_callable. + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + // Clear staging so the Runtime destructor does not see them again. 
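+        // From here DeviceRunner owns the dlopen lifetime; this temp Runtime
+        // only staged the pointers and must not dlclose them on teardown.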
+ r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; +#else rc = runner->register_prepared_callable( callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), r->get_device_orch_config_name(), std::move(kernel_addrs) ); - r->~Runtime(); +#endif return rc; } catch (...) { return -1; @@ -481,4 +494,13 @@ size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { } } +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; + } +} + } // extern "C" diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index f07d32358..20596afa5 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -688,6 +688,13 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return -1; } const auto &state = it->second; + // hbg: orch SO never crosses host/device — clear device-orch metadata + // and skip AICPU bookkeeping. See onboard/device_runner.cpp. + if (state.host_dlopen_handle != nullptr) { + runtime.set_dev_orch_so(0, 0, false); + runtime.set_active_callable_id(cid, /*is_new=*/false); + return 0; + } const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; if (first_sighting) { ++aicpu_dlopen_total_; @@ -815,16 +822,53 @@ int DeviceRunner::register_prepared_callable( return 0; } +int DeviceRunner::register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs +) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id, + MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) { + LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id); + return -1; + } + + PreparedCallableState state; + state.host_dlopen_handle = host_dlopen_handle; + state.host_orch_func_ptr = host_orch_func_ptr; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + ++host_dlopen_total_; + LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_); + return 0; +} + int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { auto it = prepared_callables_.find(callable_id); if (it == prepared_callables_.end()) { return 0; } - const uint64_t hash = it->second.hash; + PreparedCallableState state = std::move(it->second); prepared_callables_.erase(it); aicpu_seen_callable_ids_.erase(callable_id); - auto buf_it = orch_so_dedup_.find(hash); + if (state.host_dlopen_handle != nullptr) { + // hbg: dlclose the host handle; no orch SO refcount to decrement. 
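+        // The aicpu_seen_callable_ids_.erase() above is a no-op for hbg
+        // cids: prepare_orch_so early-returns before ever inserting them.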
+ dlclose(state.host_dlopen_handle); + return 0; + } + + auto buf_it = orch_so_dedup_.find(state.hash); if (buf_it != orch_so_dedup_.end()) { if (--buf_it->second.refcount <= 0) { mem_alloc_.free(buf_it->second.dev_addr); @@ -850,10 +894,15 @@ int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t ca LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); return -1; } - runtime.func_id_to_addr_[kv.first] = kv.second; + runtime.replay_function_bin_addr(kv.first, kv.second); } +#ifdef RUNTIME_HOST_ORCH + runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; + runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; +#else runtime.set_device_orch_func_name(state.func_name.c_str()); runtime.set_device_orch_config_name(state.config_name.c_str()); +#endif runtime.set_active_callable_id(callable_id, /*is_new=*/false); return 0; } diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index b115485b8..4a1759054 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -217,12 +217,21 @@ class DeviceRunner { int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, std::vector> kernel_addrs ); + // Host-orchestration sibling of register_prepared_callable; see + // src/a2a3/platform/onboard/host/device_runner.h for the contract. Sim + // shares the host-only dlopen path verbatim (no AICPU side effects). + int register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs + ); int unregister_prepared_callable(int32_t callable_id); bool has_prepared_callable(int32_t callable_id) const; int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + size_t host_dlopen_count() const { return host_dlopen_total_; } #else // RUNTIME_HAS_CALLABLE_ID size_t aicpu_dlopen_count() const { return 0; } + size_t host_dlopen_count() const { return 0; } #endif // RUNTIME_HAS_CALLABLE_ID private: @@ -250,12 +259,17 @@ class DeviceRunner { // Per-callable_id prepared state (callable.md design). Mirrors onboard. #ifdef RUNTIME_HAS_CALLABLE_ID struct PreparedCallableState { + // trb path uint64_t hash{0}; uint64_t dev_orch_so_addr{0}; size_t dev_orch_so_size{0}; std::string func_name; std::string config_name; + // common std::vector> kernel_addrs; + // hbg path + void *host_dlopen_handle{nullptr}; + void *host_orch_func_ptr{nullptr}; }; struct OrchSoBuffer { void *dev_addr{nullptr}; @@ -266,6 +280,7 @@ class DeviceRunner { std::unordered_map orch_so_dedup_; std::unordered_set aicpu_seen_callable_ids_; size_t aicpu_dlopen_total_{0}; + size_t host_dlopen_total_{0}; // Sticky flag: prepare_callable was called at least once in this // DeviceRunner's lifetime. 
unregister_prepared_callable clears the maps // above, so we cannot use them at finalize() time to decide whether a diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 188edf5b5..da71f750b 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -22,6 +22,8 @@ #include #include + +#include #include #include "common/unified_log.h" @@ -308,9 +310,11 @@ int prepare_callable( pthread_setspecific(g_runner_key, ctx); try { - // Temp Runtime so prepare_callable_impl can upload kernels via host_api. - alignas(Runtime) uint8_t rt_buf[sizeof(Runtime)]; - Runtime *r = new (rt_buf) Runtime(); + // Heap-allocate the temp Runtime — sizeof(Runtime) is in the tens of MB + // for hbg variants (RUNTIME_MAX_TASKS=131072), well past the stack + // budget. unique_ptr keeps the cleanup symmetric on every exit. + std::unique_ptr r_owner = std::make_unique(); + Runtime *r = r_owner.get(); r->host_api.device_malloc = device_malloc; r->host_api.device_free = device_free; r->host_api.copy_to_device = copy_to_device; @@ -320,7 +324,6 @@ int prepare_callable( int rc = prepare_callable_impl(r, reinterpret_cast(callable)); if (rc != 0) { - r->~Runtime(); pthread_setspecific(g_runner_key, nullptr); return rc; } @@ -334,11 +337,18 @@ int prepare_callable( } r->clear_registered_kernels(); +#ifdef RUNTIME_HOST_ORCH + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; +#else rc = runner->register_prepared_callable( callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), r->get_device_orch_config_name(), std::move(kernel_addrs) ); - r->~Runtime(); +#endif pthread_setspecific(g_runner_key, nullptr); return rc; } catch (...) { @@ -444,6 +454,15 @@ int run_prepared( int unregister_callable(DeviceContextHandle, int32_t) { return 0; } #endif // RUNTIME_HAS_CALLABLE_ID +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; + } +} + size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { if (ctx == NULL) return 0; try { diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index cf6618170..f75215b6e 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -276,31 +276,27 @@ extern "C" { #endif /** - * Initialize a pre-allocated runtime with dynamic orchestration. - * - * This function loads the orchestration SO from binary data via a temp file, - * resolves the orchestration function via dlsym, then calls it to build the - * task graph. 
The orchestration function is responsible for: - * - Allocating device memory via device_malloc() - * - Copying data to device via copy_to_device() - * - Building the task graph - * - Recording tensor pairs via record_tensor_pair() - * - * @param runtime Pointer to pre-constructed Runtime - * @param callable ChipCallable containing orch binary, func_name, and child kernels - * @param orch_args Separated tensor/scalar arguments - * @return 0 on success, -1 on failure + * Stage the per-callable resources for the host_build_graph variant: upload + * kernel binaries and dlopen the orchestration SO on the host. The dlopen + * handle and resolved entry-symbol pointer are parked on the runtime via + * `pending_host_dlopen_handle_` / `pending_host_orch_func_ptr_` so the + * platform layer can hoist them into PreparedCallableState. Splitting this + * out of init_runtime_impl is what callable.md's prepare_callable / run_prepared + * design rests on for hbg — the dlopen runs once per cid instead of every run. */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - // Validate inputs +int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) { if (runtime == nullptr) { LOG_ERROR("Runtime pointer is null"); return -1; } + if (callable == nullptr) { + LOG_ERROR("Callable pointer is null"); + return -1; + } // Register kernel binaries from ChipCallable children if (callable->child_count() > 0) { - LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count()); + LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count()); for (int32_t i = 0; i < callable->child_count(); i++) { int func_id = callable->child_func_id(i); if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { @@ -329,7 +325,9 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip return -1; } - // Load orchestration SO from binary data via temp file + // Load orchestration SO from binary data via temp file. Held open across + // the lifetime of the prepared callable; closed by + // DeviceRunner::unregister_prepared_callable. std::string fd_path; if (!create_temp_so_file(orch_so_binary, orch_so_size, &fd_path)) { LOG_ERROR("Failed to create temp SO file"); @@ -343,7 +341,7 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip return -1; } - dlerror(); // Clear any existing error + dlerror(); OrchestrationFunc orch_func = reinterpret_cast(dlsym(handle, orch_func_name)); const char *dlsym_error = dlerror(); if (dlsym_error != nullptr) { @@ -354,11 +352,42 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip LOG_INFO_V0("Loaded orchestration function: %s", orch_func_name); - // Clear any previous tensor pairs + runtime->pending_host_dlopen_handle_ = handle; + runtime->pending_host_orch_func_ptr_ = reinterpret_cast(orch_func); + // hbg never uploads orch SO bytes to the device; clear the trb staging + // fields so DeviceRunner::register_prepared_callable cannot mistake this + // for a trb-shaped registration. + runtime->pending_orch_so_data_ = nullptr; + runtime->pending_orch_so_size_ = 0; + return 0; +} + +/** + * Per-run binding for hbg: invoke the previously-resolved orchestration entry + * point against the supplied args, then upload tensor info / allocation + * storage. 
Assumes prepare_callable_impl populated + * `pending_host_orch_func_ptr_` (either freshly during prepare_callable, or + * via DeviceRunner::bind_prepared_callable_to_runtime when run_prepared + * replays a prepared cid onto a fresh Runtime). + */ +int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + if (orch_args == nullptr) { + LOG_ERROR("orch_args pointer is null"); + return -1; + } + OrchestrationFunc orch_func = reinterpret_cast(runtime->pending_host_orch_func_ptr_); + if (orch_func == nullptr) { + LOG_ERROR("bind_prepared_to_runtime_impl: host orch_func pointer is null"); + return -1; + } + runtime->clear_tensor_pairs(); LOG_INFO_V0("=== Calling Orchestration Function ==="); - LOG_DEBUG( "Args count: %d (%d tensors + %d scalars)", orch_args->tensor_count() + orch_args->scalar_count(), orch_args->tensor_count(), orch_args->scalar_count() @@ -370,13 +399,10 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip &k_orchestration_runtime_ops, runtime, &tensor_info_builder, &tensor_allocation_builder }; - // Call orchestration function to build task graph - // The orchestration function handles device memory allocation and copy-to-device int rc = orch_func(reinterpret_cast(&orchestration_runtime), *orch_args); if (rc != 0) { LOG_ERROR("Orchestration function failed with code %d", rc); runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } @@ -384,7 +410,6 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip if (rc != 0) { LOG_ERROR("Failed to upload tensor allocations: %d", rc); runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } @@ -396,17 +421,34 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip runtime->clear_tensor_allocation_storage(); } runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } LOG_INFO_V0("Runtime initialized. Ready for execution from Python."); + return 0; +} + +/** + * Compatibility shim: legacy single-call init_runtime_impl drives the existing + * run_runtime path. The callable.md split keeps it as + * prepare_callable_impl + bind_prepared_to_runtime_impl so legacy callers see + * one function while run_prepared reuses the prep half across runs. The shim + * dlcloses the orchestration SO immediately because legacy callers (no cid) + * never see register_prepared_callable. + */ +int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { + int rc = prepare_callable_impl(runtime, callable); + if (rc != 0) return rc; - // Host orchestration is complete once orch_func returns. The task graph now - // lives in Runtime, so the orchestration SO can be closed immediately. - dlclose(handle); + rc = bind_prepared_to_runtime_impl(runtime, orch_args); - return 0; + // Legacy path: orchestration SO is no longer needed once orch_func returned. 
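+    // This runs even when bind failed: prepare already dlopened, and the
+    // legacy path has no register_prepared_callable to take ownership.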
+ if (runtime->pending_host_dlopen_handle_ != nullptr) { + dlclose(runtime->pending_host_dlopen_handle_); + runtime->pending_host_dlopen_handle_ = nullptr; + runtime->pending_host_orch_func_ptr_ = nullptr; + } + return rc; } /** diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index ce9d1f5fd..e48d4500e 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -29,6 +29,17 @@ #ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ #define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ +// Tells the shared platform layer (device_runner.{h,cpp}, +// pto_runtime_c_api.cpp) that this runtime variant participates in callable.md +// prepare_callable / run_prepared. hbg dlopens the orchestration SO on the +// host, so the platform layer routes register/unregister through the +// host_dlopen_handle branch instead of the AICPU per-cid orch_so_table_. +#define RUNTIME_HAS_CALLABLE_ID 1 +// Marks this variant as host-orchestrated: the platform layer's +// register/bind/unregister logic uses runtime->pending_host_dlopen_handle_ and +// pending_host_orch_func_ptr_, which only exist on hbg-shaped Runtimes. +#define RUNTIME_HOST_ORCH 1 + #include #include #include // for fprintf, printf @@ -434,6 +445,19 @@ class Runtime { */ void set_function_bin_addr(int func_id, uint64_t addr); + /** + * Replay a previously-uploaded kernel address onto a fresh Runtime + * without recording it in registered_kernel_func_ids_. Used by + * DeviceRunner::bind_prepared_callable_to_runtime when restoring kernels + * across run_prepared invocations: the prepared callable owns the + * kernel binaries' device memory until unregister, so + * validate_runtime_impl must NOT free them. + */ + void replay_function_bin_addr(int func_id, uint64_t addr) { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return; + func_id_to_addr_[func_id] = addr; + } + int get_registered_kernel_count() const { return registered_kernel_count_; } int get_registered_kernel_func_id(int index) const { @@ -470,6 +494,16 @@ class Runtime { const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; + // Host-orchestration staging (callable.md hbg path). prepare_callable_impl + // dlopens the orch SO on the host and parks the handle + entry-symbol + // pointer here so DeviceRunner::register_prepared_callable_host_orch can + // claim them; bind_prepared_callable_to_runtime restores them onto a fresh + // Runtime so bind_prepared_to_runtime_impl can call orch_func without a + // second dlopen. Distinct from `pending_orch_so_data_` (which is unused on + // hbg — host orchestration never uploads the SO bytes to the device). 
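+    // Raw void* by design: ownership moves to DeviceRunner at registration,
+    // and the only dlclose of these fields outside DeviceRunner is the
+    // legacy init_runtime_impl shim in runtime_maker.cpp.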
+ void *pending_host_dlopen_handle_{nullptr}; + void *pending_host_orch_func_ptr_{nullptr}; + void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { dev_orch_so_addr_ = dev_addr; dev_orch_so_size_ = size; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 722231a8a..9bd847379 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -280,6 +280,13 @@ class Runtime { uint64_t get_function_bin_addr(int func_id) const; void set_function_bin_addr(int func_id, uint64_t addr); + /** + * Replay a previously-uploaded kernel address onto a fresh Runtime + * without recording it in registered_kernel_func_ids_. Used by + * DeviceRunner::bind_prepared_callable_to_runtime so prepared kernel + * binaries are not freed by validate_runtime_impl across runs. + */ + void replay_function_bin_addr(int func_id, uint64_t addr); int get_registered_kernel_count() const; int get_registered_kernel_func_id(int index) const; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index abfffd9aa..1f6375a6b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -171,6 +171,14 @@ void Runtime::set_function_bin_addr(int func_id, uint64_t addr) { func_id_to_addr_[func_id] = addr; } +void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID); + return; + } + func_id_to_addr_[func_id] = addr; +} + int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; } int Runtime::get_registered_kernel_func_id(int index) const { diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 4765c5624..2d318d7fe 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -17,6 +17,8 @@ #include "device_runner.h" +#include + #include #include #include @@ -613,6 +615,13 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return -1; } const auto &state = it->second; + // hbg variant: orch SO never crosses host/device, so AICPU does no + // per-cid dlopen. Skip orch_so_table_ bookkeeping and clear metadata. 
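+    // The zeroed dev_orch_so fields are the signal that stops the AICPU
+    // executor from dlopening (or counting) anything on hbg runs.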
+ if (state.host_dlopen_handle != nullptr) { + runtime.set_dev_orch_so(0, 0, false); + runtime.set_active_callable_id(cid, /*is_new=*/false); + return 0; + } const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; if (first_sighting) { ++aicpu_dlopen_total_; @@ -754,16 +763,53 @@ int DeviceRunner::register_prepared_callable( return 0; } +int DeviceRunner::register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs +) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id, + MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) { + LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id); + return -1; + } + + PreparedCallableState state; + state.host_dlopen_handle = host_dlopen_handle; + state.host_orch_func_ptr = host_orch_func_ptr; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + ++host_dlopen_total_; + LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_); + return 0; +} + int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { auto it = prepared_callables_.find(callable_id); if (it == prepared_callables_.end()) { return 0; } - const uint64_t hash = it->second.hash; + PreparedCallableState state = std::move(it->second); prepared_callables_.erase(it); aicpu_seen_callable_ids_.erase(callable_id); - auto buf_it = orch_so_dedup_.find(hash); + if (state.host_dlopen_handle != nullptr) { + // hbg path: dlclose the host handle; no orch SO refcount to decrement. + dlclose(state.host_dlopen_handle); + return 0; + } + + auto buf_it = orch_so_dedup_.find(state.hash); if (buf_it != orch_so_dedup_.end()) { if (--buf_it->second.refcount <= 0) { mem_alloc_.free(buf_it->second.dev_addr); @@ -795,10 +841,15 @@ int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t ca LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); return -1; } - runtime.func_id_to_addr_[kv.first] = kv.second; + runtime.replay_function_bin_addr(kv.first, kv.second); } +#ifdef RUNTIME_HOST_ORCH + runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; + runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; +#else runtime.set_device_orch_func_name(state.func_name.c_str()); runtime.set_device_orch_config_name(state.config_name.c_str()); +#endif // Stamp callable_id with is_new=false; prepare_orch_so refreshes the flag // with the authoritative first_sighting answer right before launch. runtime.set_active_callable_id(callable_id, /*is_new=*/false); diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 55f76e52e..a7d5c9fc2 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -395,10 +395,19 @@ class DeviceRunner { ); /** - * Drop the prepared state for `callable_id` and decrement the SO buffer's - * hash-keyed refcount; frees the device buffer when the count hits zero. 
- * Kernel binaries are shared across callables and only released by - * finalize(). + * Host-orchestration sibling for hbg variants. See a2a3 onboard + * device_runner.h for full contract. Mutually exclusive with the + * trb-shaped overload. + */ + int register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs + ); + + /** + * Drop the prepared state for `callable_id`. trb path: decrement orch SO + * refcount, free when zero. hbg path: dlclose the host handle. Kernel + * binaries are shared and only released by finalize(). */ int unregister_prepared_callable(int32_t callable_id); @@ -417,6 +426,13 @@ class DeviceRunner { */ size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + /** + * Number of host-side dlopens triggered by + * `register_prepared_callable_host_orch` (hbg variant). Mirrors + * `aicpu_dlopen_count` for the host-orchestration path. + */ + size_t host_dlopen_count() const { return host_dlopen_total_; } + private: // Internal state int device_id_{-1}; @@ -449,12 +465,17 @@ class DeviceRunner { // device_runner.h for the full design narrative; mirrored here so a5 // shares the same dispatch surface. struct PreparedCallableState { + // trb path uint64_t hash{0}; uint64_t dev_orch_so_addr{0}; size_t dev_orch_so_size{0}; std::string func_name; std::string config_name; + // common std::vector> kernel_addrs; + // hbg path + void *host_dlopen_handle{nullptr}; + void *host_orch_func_ptr{nullptr}; }; struct OrchSoBuffer { void *dev_addr{nullptr}; @@ -466,6 +487,8 @@ class DeviceRunner { std::unordered_set aicpu_seen_callable_ids_; // Monotonic AICPU dlopen counter (first-sighting bind only; never decremented). size_t aicpu_dlopen_total_{0}; + // Monotonic host-side dlopen counter for hbg variants. + size_t host_dlopen_total_{0}; // Sticky flag: prepare_callable was called at least once. Lets finalize() // distinguish legacy-path leaks from prepared-path kernels that legitimately // live until finalize. diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 8110263fc..2cd9f07be 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -21,6 +21,8 @@ #include "task_args.h" #include + +#include #include #include "common/unified_log.h" @@ -370,9 +372,10 @@ int prepare_callable( runner->release_run_context(); }); - // Temp Runtime so prepare_callable_impl can upload kernels via host_api. - alignas(Runtime) uint8_t rt_buf[sizeof(Runtime)]; - Runtime *r = new (rt_buf) Runtime(); + // Heap-allocate: hbg's Runtime carries 131072 Tasks → tens of MB, + // larger than the default thread stack. + std::unique_ptr r_owner = std::make_unique(); + Runtime *r = r_owner.get(); r->host_api.device_malloc = device_malloc; r->host_api.device_free = device_free; r->host_api.copy_to_device = copy_to_device; @@ -382,7 +385,6 @@ int prepare_callable( rc = prepare_callable_impl(r, reinterpret_cast(callable)); if (rc != 0) { - r->~Runtime(); return rc; } @@ -399,11 +401,18 @@ int prepare_callable( // they belong to the prepared state now. 
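+    // clear_registered_kernels() drops only the Runtime-side bookkeeping;
+    // the kernel device buffers stay alive under the prepared state and
+    // are released at finalize(), per the DeviceRunner contract.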
r->clear_registered_kernels(); +#ifdef RUNTIME_HOST_ORCH + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; +#else rc = runner->register_prepared_callable( callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), r->get_device_orch_config_name(), std::move(kernel_addrs) ); - r->~Runtime(); +#endif return rc; } catch (...) { return -1; @@ -510,6 +519,15 @@ int run_prepared( int unregister_callable(DeviceContextHandle, int32_t) { return 0; } #endif // RUNTIME_HAS_CALLABLE_ID +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; + } +} + size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { if (ctx == NULL) return 0; try { diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 0fd9278a5..c926f6b36 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -666,6 +666,12 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return -1; } const auto &state = it->second; + // hbg variant: orch SO never crosses host/device boundary. + if (state.host_dlopen_handle != nullptr) { + runtime.set_dev_orch_so(0, 0, false); + runtime.set_active_callable_id(cid, /*is_new=*/false); + return 0; + } const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; if (first_sighting) { ++aicpu_dlopen_total_; @@ -789,16 +795,53 @@ int DeviceRunner::register_prepared_callable( return 0; } +int DeviceRunner::register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs +) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id, + MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) { + LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id); + return -1; + } + + PreparedCallableState state; + state.host_dlopen_handle = host_dlopen_handle; + state.host_orch_func_ptr = host_orch_func_ptr; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + ++host_dlopen_total_; + LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_); + return 0; +} + int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { auto it = prepared_callables_.find(callable_id); if (it == prepared_callables_.end()) { return 0; } - const uint64_t hash = it->second.hash; + PreparedCallableState state = std::move(it->second); prepared_callables_.erase(it); aicpu_seen_callable_ids_.erase(callable_id); - auto buf_it = orch_so_dedup_.find(hash); + if (state.host_dlopen_handle != nullptr) { + // hbg path: dlclose host handle; no orch SO refcount. 
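+        // state.kernel_addrs vanishes with `state`; the kernel device
+        // buffers themselves are finalize()-owned, so nothing else to free.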
+ dlclose(state.host_dlopen_handle); + return 0; + } + + auto buf_it = orch_so_dedup_.find(state.hash); if (buf_it != orch_so_dedup_.end()) { if (--buf_it->second.refcount <= 0) { mem_alloc_.free(buf_it->second.dev_addr); @@ -824,10 +867,15 @@ int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t ca LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); return -1; } - runtime.func_id_to_addr_[kv.first] = kv.second; + runtime.replay_function_bin_addr(kv.first, kv.second); } +#ifdef RUNTIME_HOST_ORCH + runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; + runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; +#else runtime.set_device_orch_func_name(state.func_name.c_str()); runtime.set_device_orch_config_name(state.config_name.c_str()); +#endif runtime.set_active_callable_id(callable_id, /*is_new=*/false); return 0; } diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 3a39a31df..042121518 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -219,7 +219,13 @@ class DeviceRunner { const char *config_name, std::vector> kernel_addrs ); - /** Drop prepared state for `callable_id`; refcounts the dedup'd SO buffer. */ + /** Host-orchestration sibling for hbg variants. See a2a3 onboard. */ + int register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs + ); + + /** Drop prepared state for `callable_id`; trb refcounts SO, hbg dlcloses handle. */ int unregister_prepared_callable(int32_t callable_id); /** True iff `callable_id` has prepared state staged. */ @@ -231,6 +237,9 @@ class DeviceRunner { /** Monotonic AICPU dlopen counter (first-sighting only; never decremented). */ size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + /** Monotonic host-side dlopen counter for hbg variants. */ + size_t host_dlopen_count() const { return host_dlopen_total_; } + private: // Configuration int device_id_{-1}; @@ -255,12 +264,17 @@ class DeviceRunner { // Per-callable_id prepared state (callable.md design). Mirrors onboard. struct PreparedCallableState { + // trb path uint64_t hash{0}; uint64_t dev_orch_so_addr{0}; size_t dev_orch_so_size{0}; std::string func_name; std::string config_name; + // common std::vector> kernel_addrs; + // hbg path + void *host_dlopen_handle{nullptr}; + void *host_orch_func_ptr{nullptr}; }; struct OrchSoBuffer { void *dev_addr{nullptr}; @@ -271,6 +285,7 @@ class DeviceRunner { std::unordered_map orch_so_dedup_; std::unordered_set aicpu_seen_callable_ids_; size_t aicpu_dlopen_total_{0}; + size_t host_dlopen_total_{0}; bool prepared_callable_path_used_{false}; // Runtime pointer for print_handshake_results diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 00aa79028..40ab89134 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -22,6 +22,8 @@ #include #include + +#include #include #include "common/unified_log.h" @@ -307,9 +309,9 @@ int prepare_callable( pthread_setspecific(g_runner_key, ctx); try { - // Temp Runtime so prepare_callable_impl can upload kernels via host_api. - alignas(Runtime) uint8_t rt_buf[sizeof(Runtime)]; - Runtime *r = new (rt_buf) Runtime(); + // Heap-allocate: hbg's Runtime carries 131072 Tasks → tens of MB. 
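+        // Replaces the old placement-new + manual ~Runtime() pattern, where
+        // every early return had to invoke the destructor by hand.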
+ std::unique_ptr r_owner = std::make_unique(); + Runtime *r = r_owner.get(); r->host_api.device_malloc = device_malloc; r->host_api.device_free = device_free; r->host_api.copy_to_device = copy_to_device; @@ -319,7 +321,6 @@ int prepare_callable( int rc = prepare_callable_impl(r, reinterpret_cast(callable)); if (rc != 0) { - r->~Runtime(); pthread_setspecific(g_runner_key, nullptr); return rc; } @@ -333,11 +334,18 @@ int prepare_callable( } r->clear_registered_kernels(); +#ifdef RUNTIME_HOST_ORCH + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; +#else rc = runner->register_prepared_callable( callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), r->get_device_orch_config_name(), std::move(kernel_addrs) ); - r->~Runtime(); +#endif pthread_setspecific(g_runner_key, nullptr); return rc; } catch (...) { @@ -452,4 +460,13 @@ size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { } } +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; + } +} + } // extern "C" diff --git a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp index cf6618170..f75215b6e 100644 --- a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp @@ -276,31 +276,27 @@ extern "C" { #endif /** - * Initialize a pre-allocated runtime with dynamic orchestration. - * - * This function loads the orchestration SO from binary data via a temp file, - * resolves the orchestration function via dlsym, then calls it to build the - * task graph. The orchestration function is responsible for: - * - Allocating device memory via device_malloc() - * - Copying data to device via copy_to_device() - * - Building the task graph - * - Recording tensor pairs via record_tensor_pair() - * - * @param runtime Pointer to pre-constructed Runtime - * @param callable ChipCallable containing orch binary, func_name, and child kernels - * @param orch_args Separated tensor/scalar arguments - * @return 0 on success, -1 on failure + * Stage the per-callable resources for the host_build_graph variant: upload + * kernel binaries and dlopen the orchestration SO on the host. The dlopen + * handle and resolved entry-symbol pointer are parked on the runtime via + * `pending_host_dlopen_handle_` / `pending_host_orch_func_ptr_` so the + * platform layer can hoist them into PreparedCallableState. Splitting this + * out of init_runtime_impl is what callable.md's prepare_callable / run_prepared + * design rests on for hbg — the dlopen runs once per cid instead of every run. 
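+ *
+ * Intended split, sketched (on the run_prepared path the platform layer
+ * replays the staged fields onto a fresh Runtime between the two calls):
+ *
+ *   prepare_callable_impl(rt, callable);       // dlopen + dlsym, once per cid
+ *   bind_prepared_to_runtime_impl(rt, args);   // every run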
*/ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - // Validate inputs +int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) { if (runtime == nullptr) { LOG_ERROR("Runtime pointer is null"); return -1; } + if (callable == nullptr) { + LOG_ERROR("Callable pointer is null"); + return -1; + } // Register kernel binaries from ChipCallable children if (callable->child_count() > 0) { - LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count()); + LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count()); for (int32_t i = 0; i < callable->child_count(); i++) { int func_id = callable->child_func_id(i); if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { @@ -329,7 +325,9 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip return -1; } - // Load orchestration SO from binary data via temp file + // Load orchestration SO from binary data via temp file. Held open across + // the lifetime of the prepared callable; closed by + // DeviceRunner::unregister_prepared_callable. std::string fd_path; if (!create_temp_so_file(orch_so_binary, orch_so_size, &fd_path)) { LOG_ERROR("Failed to create temp SO file"); @@ -343,7 +341,7 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip return -1; } - dlerror(); // Clear any existing error + dlerror(); OrchestrationFunc orch_func = reinterpret_cast(dlsym(handle, orch_func_name)); const char *dlsym_error = dlerror(); if (dlsym_error != nullptr) { @@ -354,11 +352,42 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip LOG_INFO_V0("Loaded orchestration function: %s", orch_func_name); - // Clear any previous tensor pairs + runtime->pending_host_dlopen_handle_ = handle; + runtime->pending_host_orch_func_ptr_ = reinterpret_cast(orch_func); + // hbg never uploads orch SO bytes to the device; clear the trb staging + // fields so DeviceRunner::register_prepared_callable cannot mistake this + // for a trb-shaped registration. + runtime->pending_orch_so_data_ = nullptr; + runtime->pending_orch_so_size_ = 0; + return 0; +} + +/** + * Per-run binding for hbg: invoke the previously-resolved orchestration entry + * point against the supplied args, then upload tensor info / allocation + * storage. Assumes prepare_callable_impl populated + * `pending_host_orch_func_ptr_` (either freshly during prepare_callable, or + * via DeviceRunner::bind_prepared_callable_to_runtime when run_prepared + * replays a prepared cid onto a fresh Runtime). 
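+ * Each call begins with clear_tensor_pairs(), so repeated run_prepared on
+ * the same cid cannot accumulate stale tensor pairs.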
+ */ +int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + if (orch_args == nullptr) { + LOG_ERROR("orch_args pointer is null"); + return -1; + } + OrchestrationFunc orch_func = reinterpret_cast(runtime->pending_host_orch_func_ptr_); + if (orch_func == nullptr) { + LOG_ERROR("bind_prepared_to_runtime_impl: host orch_func pointer is null"); + return -1; + } + runtime->clear_tensor_pairs(); LOG_INFO_V0("=== Calling Orchestration Function ==="); - LOG_DEBUG( "Args count: %d (%d tensors + %d scalars)", orch_args->tensor_count() + orch_args->scalar_count(), orch_args->tensor_count(), orch_args->scalar_count() @@ -370,13 +399,10 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip &k_orchestration_runtime_ops, runtime, &tensor_info_builder, &tensor_allocation_builder }; - // Call orchestration function to build task graph - // The orchestration function handles device memory allocation and copy-to-device int rc = orch_func(reinterpret_cast(&orchestration_runtime), *orch_args); if (rc != 0) { LOG_ERROR("Orchestration function failed with code %d", rc); runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } @@ -384,7 +410,6 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip if (rc != 0) { LOG_ERROR("Failed to upload tensor allocations: %d", rc); runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } @@ -396,17 +421,34 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip runtime->clear_tensor_allocation_storage(); } runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } LOG_INFO_V0("Runtime initialized. Ready for execution from Python."); + return 0; +} + +/** + * Compatibility shim: legacy single-call init_runtime_impl drives the existing + * run_runtime path. The callable.md split keeps it as + * prepare_callable_impl + bind_prepared_to_runtime_impl so legacy callers see + * one function while run_prepared reuses the prep half across runs. The shim + * dlcloses the orchestration SO immediately because legacy callers (no cid) + * never see register_prepared_callable. + */ +int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { + int rc = prepare_callable_impl(runtime, callable); + if (rc != 0) return rc; - // Host orchestration is complete once orch_func returns. The task graph now - // lives in Runtime, so the orchestration SO can be closed immediately. - dlclose(handle); + rc = bind_prepared_to_runtime_impl(runtime, orch_args); - return 0; + // Legacy path: orchestration SO is no longer needed once orch_func returned. + if (runtime->pending_host_dlopen_handle_ != nullptr) { + dlclose(runtime->pending_host_dlopen_handle_); + runtime->pending_host_dlopen_handle_ = nullptr; + runtime->pending_host_orch_func_ptr_ = nullptr; + } + return rc; } /** diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index 704cf6477..9e6abcccf 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -29,6 +29,13 @@ #ifndef SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ #define SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ +// Mirrors a2a3/host_build_graph: tells the shared platform layer this variant +// implements callable.md, with the hbg branch (host dlopen of the orch SO). 
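+// Of the two, RUNTIME_HOST_ORCH is what flips the platform layer's #ifdef
+// branches between host staging fields and device orch-SO metadata.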
+#define RUNTIME_HAS_CALLABLE_ID 1 +// Marks this variant as host-orchestrated: the platform layer's +// register/bind/unregister logic uses host_dlopen_handle/host_orch_func_ptr. +#define RUNTIME_HOST_ORCH 1 + #include #include #include // for fprintf, printf @@ -448,6 +455,16 @@ class Runtime { */ void set_function_bin_addr(int func_id, uint64_t addr); + /** + * Replay a previously-uploaded kernel address onto a fresh Runtime + * without recording it in registered_kernel_func_ids_. See a2a3 hbg + * runtime.h for the full contract. + */ + void replay_function_bin_addr(int func_id, uint64_t addr) { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return; + func_id_to_addr_[func_id] = addr; + } + int get_registered_kernel_count() const { return registered_kernel_count_; } int get_registered_kernel_func_id(int index) const { @@ -477,6 +494,15 @@ class Runtime { const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; + // Host-orchestration staging (callable.md hbg path). prepare_callable_impl + // dlopens the orch SO on the host and parks the handle + entry-symbol + // pointer here so DeviceRunner::register_prepared_callable_host_orch can + // claim them; bind_prepared_callable_to_runtime restores them onto a fresh + // Runtime so bind_prepared_to_runtime_impl can call orch_func without a + // second dlopen. + void *pending_host_dlopen_handle_{nullptr}; + void *pending_host_orch_func_ptr_{nullptr}; + void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { dev_orch_so_addr_ = dev_addr; dev_orch_so_size_ = size; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 7279ee5e9..8b2ee4a73 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -294,6 +294,13 @@ class Runtime { uint64_t get_function_bin_addr(int func_id) const; void set_function_bin_addr(int func_id, uint64_t addr); + /** + * Replay a previously-uploaded kernel address onto a fresh Runtime + * without recording it in registered_kernel_func_ids_. Used by + * DeviceRunner::bind_prepared_callable_to_runtime so prepared kernel + * binaries are not freed by validate_runtime_impl across runs. 
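+ * Contrast with set_function_bin_addr(), which does record into
+ * registered_kernel_func_ids_ and thereby opts the binary into
+ * validate-time release.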
+ */ + void replay_function_bin_addr(int func_id, uint64_t addr); int get_registered_kernel_count() const; int get_registered_kernel_func_id(int index) const; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 80ae1b8b2..5fc34b7b2 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -173,6 +173,14 @@ void Runtime::set_function_bin_addr(int func_id, uint64_t addr) { func_id_to_addr_[func_id] = addr; } +void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID); + return; + } + func_id_to_addr_[func_id] = addr; +} + int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; } int Runtime::get_registered_kernel_func_id(int index) const { diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 1ed20eb4c..26874b163 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -154,6 +154,7 @@ void ChipWorker::init( run_prepared_fn_ = load_symbol(handle, "run_prepared"); unregister_callable_fn_ = load_symbol(handle, "unregister_callable"); get_aicpu_dlopen_count_fn_ = load_symbol(handle, "get_aicpu_dlopen_count"); + get_host_dlopen_count_fn_ = load_symbol(handle, "get_host_dlopen_count"); finalize_device_fn_ = load_symbol(handle, "finalize_device"); // ACL lifecycle + comm_* are part of the uniform host_runtime.so ABI. // Every platform runtime exports all of them — runtimes that do not @@ -251,6 +252,7 @@ void ChipWorker::finalize() { run_prepared_fn_ = nullptr; unregister_callable_fn_ = nullptr; get_aicpu_dlopen_count_fn_ = nullptr; + get_host_dlopen_count_fn_ = nullptr; finalize_device_fn_ = nullptr; ensure_acl_ready_fn_ = nullptr; create_comm_stream_fn_ = nullptr; @@ -351,6 +353,13 @@ size_t ChipWorker::aicpu_dlopen_count() const { return get_aicpu_dlopen_count_fn_(device_ctx_); } +size_t ChipWorker::host_dlopen_count() const { + if (!device_set_) { + return 0; + } + return get_host_dlopen_count_fn_(device_ctx_); +} + uint64_t ChipWorker::malloc(size_t size) { if (!device_set_) { throw std::runtime_error("ChipWorker device not set; call set_device() first"); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index 9c2e43af8..7b699fc60 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -77,6 +77,10 @@ class ChipWorker : public IWorker { /// AICPU dlopens. size_t aicpu_dlopen_count() const; + /// Number of host-side dlopens (host_build_graph variant). Mirrors + /// `aicpu_dlopen_count` for the trb path; returns 0 on device-orch variants. 
+ size_t host_dlopen_count() const; + uint64_t malloc(size_t size); void free(uint64_t ptr); void copy_to(uint64_t dst, uint64_t src, size_t size); @@ -157,6 +161,7 @@ class ChipWorker : public IWorker { RunPreparedFn run_prepared_fn_ = nullptr; UnregisterCallableFn unregister_callable_fn_ = nullptr; GetAicpuDlopenCountFn get_aicpu_dlopen_count_fn_ = nullptr; + GetAicpuDlopenCountFn get_host_dlopen_count_fn_ = nullptr; FinalizeDeviceFn finalize_device_fn_ = nullptr; EnsureAclReadyFn ensure_acl_ready_fn_ = nullptr; CreateCommStreamFn create_comm_stream_fn_ = nullptr; diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index 8890dfeb1..780b0b24c 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -196,6 +196,14 @@ int unregister_callable(DeviceContextHandle ctx, int32_t callable_id); */ size_t get_aicpu_dlopen_count(DeviceContextHandle ctx); +/** + * Number of host-side dlopens triggered by `prepare_callable` on the host + * orchestration variants (host_build_graph). Mirrors `get_aicpu_dlopen_count` + * for the trb path. Returns 0 on runtime variants whose orchestration runs on + * the device. + */ +size_t get_host_dlopen_count(DeviceContextHandle ctx); + #ifdef __cplusplus } #endif From b89c391d98f8d1a6868caf8f0c89c87a4895a864 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 8 May 2026 10:37:50 +0800 Subject: [PATCH 17/28] =?UTF-8?q?feat(callable):=20Phase=202=20=E2=80=94?= =?UTF-8?q?=20add=20prepared=5Fcallable=20ST=20tests=20for=20hbg=20variant?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror the trb prepared_callable ST suite to host_build_graph: - tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py reuses a2a3 vector_example kernel for the 5 prepared_callable scenarios (single-cid prepare→run, multi-cid alternation, repeated run, unregister, host_dlopen_count assertions). - tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py with self-contained dump_tensor-style kernels under kernels/{aiv, orchestration}/. Both assert host_dlopen_count == distinct_registered_cids and aicpu_dlopen_count == 0 (hbg path does not trigger AICPU dlopen). Verified: 5 tests pass on each variant under sim. --- .../test_prepared_callable.py | 232 ++++++++++++++++++ .../kernels/aiv/kernel_add.cpp | 63 +++++ .../kernels/aiv/kernel_add_scalar_inplace.cpp | 63 +++++ .../orchestration/dump_tensor_orch.cpp | 73 ++++++ .../test_prepared_callable.py | 193 +++++++++++++++ 5 files changed, 624 insertions(+) create mode 100644 tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py create mode 100644 tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add.cpp create mode 100644 tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add_scalar_inplace.cpp create mode 100644 tests/st/a5/host_build_graph/prepared_callable/kernels/orchestration/dump_tensor_orch.cpp create mode 100644 tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py new file mode 100644 index 000000000..64c58ed9c --- /dev/null +++ b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. 
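Both suites pin down the same counter contract, which is compact enough to state as executable pseudocode. The sketch below is illustrative only (FakeHbgWorker is an invented stand-in, not the real ChipWorker): prepare_callable is the sole operation allowed to bump host_dlopen_count, repeated run_prepared calls reuse the cached handle, unregister_callable never decrements, and aicpu_dlopen_count never moves on the hbg path.

    class FakeHbgWorker:
        """Stand-in modelling the hbg dlopen-count contract (not the real worker)."""

        def __init__(self):
            self.host_dlopen_count = 0   # monotonic: one bump per prepare_callable
            self.aicpu_dlopen_count = 0  # stays 0: AICPU never sees the orch SO on hbg
            self._registered = set()

        def prepare_callable(self, cid):
            if cid in self._registered:
                raise RuntimeError(f"callable_id {cid} already prepared")
            self._registered.add(cid)
            self.host_dlopen_count += 1

        def run_prepared(self, cid):
            assert cid in self._registered  # replays the cached handle, no dlopen

        def unregister_callable(self, cid):
            self._registered.discard(cid)   # counter is NOT decremented

    w = FakeHbgWorker()
    w.prepare_callable(0)
    for _ in range(5):
        w.run_prepared(0)
    assert w.host_dlopen_count == 1 and w.aicpu_dlopen_count == 0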
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""End-to-end test for ChipWorker.prepare_callable / run_prepared on host_build_graph. + +Mirrors tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable for the hbg +variant: instead of the AICPU dlopening the orch SO once per cid, hbg dlopens +on the host inside prepare_callable and replays the cached handle/fn pointer +on every run_prepared. The dlopen counter to assert is `host_dlopen_count`, +not `aicpu_dlopen_count` (which stays 0 — AICPU never sees the orch SO). +""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +_VECTOR_KERNELS = "../vector_example/kernels" + + +@scene_test(level=2, runtime="host_build_graph") +class TestPreparedCallableHbg(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable on hbg.""" + + CALLABLE = { + "orchestration": { + "source": f"{_VECTOR_KERNELS}/orchestration/example_orch.cpp", + "function_name": "build_example_graph", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 3, "block_dim": 3} + _PLATFORMS = ["a2a3sim", "a2a3"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + # vector_example orchestration computes (a + b + 1) * (a + b + 2) + a, b = args.a, args.b + args.f[:] = (a + b + 1) * (a + b + 2) + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + worker.prepare_callable(0, callable_obj) + worker.prepare_callable(1, callable_obj) + + for _ in range(2): + test_args = self.generate_args(params) + chip_args, 
output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(0, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(1, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + worker.unregister_callable(0) + worker.unregister_callable(1) + + # ------------------------------------------------------------------ + # host_dlopen_count assertions (callable.md §7 verification, hbg path). + # + # hbg increments host_dlopen_count on every register_prepared_callable_host_orch + # invocation (i.e. each `prepare_callable` call), independent of how many + # times run_prepared is invoked afterwards. AICPU never dlopens the orch + # SO on this variant, so aicpu_dlopen_count stays at 0. + # ------------------------------------------------------------------ + + def _setup_dlopen_count_test(self, st_worker, st_platform): + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + """prepare(0) + run(0) × 5 → host_dlopen delta == 1, aicpu == 0.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(0, callable_obj) + for _ in range(5): + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1, ( + f"expected exactly 1 new host dlopen for 5 runs of cid=0, " + f"got delta {st_worker.host_dlopen_count - baseline}" + ) + assert st_worker.aicpu_dlopen_count == baseline_aicpu, "hbg must not trigger any AICPU orch SO dlopens" + finally: + st_worker.unregister_callable(0) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + """prepare(0)+prepare(1) + (run(0),run(1)) × 5 → host_dlopen delta == 2.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(1, callable_obj) + for _ in range(5): + self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, 1, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2, ( + f"expected exactly 2 new host dlopens for cids {{0,1}} interleaved, " + f"got delta {st_worker.host_dlopen_count - baseline}" + ) + assert st_worker.aicpu_dlopen_count == baseline_aicpu + finally: + st_worker.unregister_callable(0) + st_worker.unregister_callable(1) + + def 
test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + """prepare(0) twice → second call raises RuntimeError.""" + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(0, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(0, callable_obj) + finally: + st_worker.unregister_callable(0) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + """prepare(0)+run(0)+unregister(0)+prepare(0)+run(0) → host_dlopen delta == 2. + + Counter is monotonic — re-prepare always counts a fresh dlopen. + """ + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + registered = False + try: + st_worker.prepare_callable(0, callable_obj) + registered = True + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1 + st_worker.unregister_callable(0) + registered = False + after_unreg = st_worker.host_dlopen_count + assert after_unreg - baseline == 1, ( + f"unregister must NOT decrement the host dlopen counter; baseline={baseline}, after_unreg={after_unreg}" + ) + st_worker.prepare_callable(0, callable_obj) + registered = True + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2, ( + f"after re-prepare expected counter +2 (two distinct host dlopens), " + f"got delta {st_worker.host_dlopen_count - baseline}" + ) + finally: + if registered: + st_worker.unregister_callable(0) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add.cpp b/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add.cpp new file mode 100644 index 000000000..8e2094807 --- /dev/null +++ b/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ float *src0 = reinterpret_cast<__gm__ float *>(args[0]); + __gm__ float *src1 = reinterpret_cast<__gm__ float *>(args[1]); + __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]); + + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = pto::Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(vRows, vCols); + TileData src1Tile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + + pipe_sync(); +} diff --git a/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add_scalar_inplace.cpp b/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add_scalar_inplace.cpp new file mode 100644 index 000000000..056442e21 --- /dev/null +++ b/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add_scalar_inplace.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ float *inout = reinterpret_cast<__gm__ float *>(args[0]); + + union { + uint64_t u64; + float f32; + } converter; + converter.u64 = args[1]; + float scalar = converter.f32; + + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = pto::Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData srcTile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(srcTile, 0x0); + TASSIGN(dstTile, 0x10000); + + GlobalData inoutGlobal(inout); + + TLOAD(srcTile, inoutGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADDS(dstTile, srcTile, scalar); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(inoutGlobal, dstTile); + + pipe_sync(); +} diff --git a/tests/st/a5/host_build_graph/prepared_callable/kernels/orchestration/dump_tensor_orch.cpp b/tests/st/a5/host_build_graph/prepared_callable/kernels/orchestration/dump_tensor_orch.cpp new file mode 100644 index 000000000..8c8d807c4 --- /dev/null +++ b/tests/st/a5/host_build_graph/prepared_callable/kernels/orchestration/dump_tensor_orch.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Dump-tensor interface demo for host_build_graph. 
+ *
+ * Demonstrates the two ways to register tensor metadata for dump:
+ *   Task 0 (add): add_task() + set_tensor_info_to_task()
+ *   Task 1 (add_scalar_inplace): add_task_with_tensor_info()
+ *
+ * Computation: f = (a + b) + 1 (a=2, b=3 → f=6)
+ */
+
+#include "orchestration_api.h"  // NOLINT(build/include_subdir)
+
+extern "C" {
+
+int build_dump_tensor_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
+  void *host_a = orch_args.tensor(0).data_as();
+  void *host_b = orch_args.tensor(1).data_as();
+  void *host_f = orch_args.tensor(2).data_as();
+  size_t size_a = orch_args.tensor(0).nbytes();
+  size_t size_b = orch_args.tensor(1).nbytes();
+  size_t size_f = orch_args.tensor(2).nbytes();
+  uint32_t size = orch_args.tensor(0).shapes[0];
+
+  TensorInfo ext_a_info = make_tensor_info_from_tensor_arg(orch_args.tensor(0));
+  TensorInfo ext_b_info = make_tensor_info_from_tensor_arg(orch_args.tensor(1));
+  TensorInfo ext_f_info = make_tensor_info_from_tensor_arg(orch_args.tensor(2));
+
+  void *dev_a = device_malloc(runtime, size_a);
+  copy_to_device(runtime, dev_a, host_a, size_a);
+
+  void *dev_b = device_malloc(runtime, size_b);
+  copy_to_device(runtime, dev_b, host_b, size_b);
+
+  void *dev_f = device_malloc(runtime, size_f);
+  record_tensor_pair(runtime, host_f, dev_f, size_f);
+
+  // Task 0: a + b → f (add_task + set_tensor_info_to_task)
+  uint64_t args_t0[4] = {
+      reinterpret_cast<uint64_t>(dev_a),
+      reinterpret_cast<uint64_t>(dev_b),
+      reinterpret_cast<uint64_t>(dev_f),
+      size,
+  };
+  int t0 = add_task(runtime, args_t0, 4, 0, CoreType::AIV);
+  TensorInfo t0_info[] = {ext_a_info, ext_b_info, ext_f_info};
+  set_tensor_info_to_task(runtime, t0, t0_info, 3);
+
+  // Task 1: f += 1.0 (add_task_with_tensor_info)
+  union {
+    float f32;
+    uint64_t u64;
+  } sc;
+  sc.f32 = 1.0f;
+  uint64_t args_t1[3] = {reinterpret_cast<uint64_t>(dev_f), sc.u64, size};
+  TensorInfo t1_info[] = {ext_f_info};
+  int t1 = add_task_with_tensor_info(runtime, args_t1, 3, 1, CoreType::AIV, t1_info, 1);
+
+  add_successor(runtime, t0, t1);
+
+  return 0;
+}
+
+}  // extern "C"
diff --git a/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py
new file mode 100644
index 000000000..92ba39f50
--- /dev/null
+++ b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""End-to-end test for ChipWorker.prepare_callable / run_prepared on a5/host_build_graph.
+
+Mirrors tests/st/a2a3/host_build_graph/prepared_callable for the a5 variant.
+Reuses the dump_tensor example kernels (a + b + 1) since a5/hbg has no
+vector_example today and dump_tensor already runs cleanly on a5sim.
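For orientation, the graph that dump_tensor_orch.cpp builds has exactly two tasks and one edge. Below is a rough host-side model of how a scheduler is expected to honor that edge; run_graph and the lambdas are invented for illustration and have nothing to do with the real AICPU scheduler.

    import torch

    def run_graph(tasks, edges):
        """Run tasks in dependency order (Kahn's algorithm over the edge list)."""
        indeg = {t: 0 for t in tasks}
        for _src, dst in edges:
            indeg[dst] += 1
        ready = [t for t, d in indeg.items() if d == 0]
        while ready:
            t = ready.pop()
            tasks[t]()
            for src, dst in edges:
                if src == t:
                    indeg[dst] -= 1
                    if indeg[dst] == 0:
                        ready.append(dst)

    size = 128 * 128
    a = torch.full((size,), 2.0)
    b = torch.full((size,), 3.0)
    f = torch.zeros(size)

    tasks = {
        0: lambda: f.copy_(a + b),  # models kernel_add
        1: lambda: f.add_(1.0),     # models kernel_add_scalar_inplace
    }
    run_graph(tasks, edges=[(0, 1)])  # add_successor(runtime, t0, t1)
    assert torch.all(f == 6.0)        # (2 + 3) + 1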
+""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + + +@scene_test(level=2, runtime="host_build_graph") +class TestPreparedCallableHbgA5(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable on a5/hbg.""" + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/dump_tensor_orch.cpp", + "function_name": "build_dump_tensor_graph", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/kernel_add_scalar_inplace.cpp", + "core_type": "aiv", + "signature": [D.INOUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 3, "block_dim": 3} + _PLATFORMS = ["a5sim", "a5"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + # dump_tensor orchestration computes f = (a + b) + 1 + args.f[:] = (args.a + args.b) + 1 + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + worker.prepare_callable(0, callable_obj) + worker.prepare_callable(1, callable_obj) + + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(0, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(1, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + worker.unregister_callable(0) + worker.unregister_callable(1) + + def _setup_dlopen_count_test(self, st_worker, st_platform): + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + 
callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(0, callable_obj) + for _ in range(5): + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1 + assert st_worker.aicpu_dlopen_count == baseline_aicpu + finally: + st_worker.unregister_callable(0) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(1, callable_obj) + for _ in range(5): + self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, 1, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2 + assert st_worker.aicpu_dlopen_count == baseline_aicpu + finally: + st_worker.unregister_callable(0) + st_worker.unregister_callable(1) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(0, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(0, callable_obj) + finally: + st_worker.unregister_callable(0) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + registered = False + try: + st_worker.prepare_callable(0, callable_obj) + registered = True + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1 + st_worker.unregister_callable(0) + registered = False + assert st_worker.host_dlopen_count - baseline == 1, "unregister must NOT decrement the host dlopen counter" + st_worker.prepare_callable(0, callable_obj) + registered = True + self._run_one(st_worker, 0, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2 + finally: + if registered: + st_worker.unregister_callable(0) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) From c0ff9f2fba7fe65f9deba9f846d0da9685619ce4 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 8 May 2026 11:09:11 +0800 Subject: [PATCH 18/28] =?UTF-8?q?refactor(callable):=20Phase=203=20?= =?UTF-8?q?=E2=80=94=20drop=20RUNTIME=5FHAS=5FCALLABLE=5FID=20and=20RUNTIM?= =?UTF-8?q?E=5FHOST=5FORCH=20macros?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All four runtime variants (a2a3/{trb,hbg}, a5/{trb,hbg}) now implement prepare_callable / run_prepared / unregister_callable end-to-end, so the build-time guards that picked between the real implementation and stubs or between trb/hbg staging fields are no longer load-bearing. Unify the public Runtime API across variants so the platform layer can branch at runtime instead: - trb runtime.h (a2a3+a5): add pending_host_dlopen_handle_ / pending_host_orch_func_ptr_ host-only fields (always nullptr on trb). - hbg runtime.h (a2a3+a5): add device_orch_func_name_ / device_orch_config_name_ + set/get accessors (always empty on hbg). 
- 4 device_runner.cpp: bind_prepared_callable_to_runtime now writes both host_dlopen and device_orch_func_name fields unconditionally; whichever set was populated by the corresponding register_* overload wins, the other stays at its default. - 4 pto_runtime_c_api.cpp: prepare_callable picks the trb vs hbg path by inspecting r->pending_host_dlopen_handle_ at runtime instead of via #ifdef RUNTIME_HOST_ORCH. Mechanical removals: - 4 runtime.h: drop #define RUNTIME_HAS_CALLABLE_ID and (where present) RUNTIME_HOST_ORCH. - 8 platform files (.h/.cpp): unwrap every #ifdef RUNTIME_HAS_CALLABLE_ID and RUNTIME_HOST_ORCH block, keeping the real implementation; delete the dlsym-stub #else branches in the c_api files (no variant needs them now). Verified: 4 sim binaries compile, 4×5 prepared_callable ST tests pass (20 total), tests/ut/py/test_chip_worker.py 15 pass. --- .../platform/onboard/host/device_runner.cpp | 22 ++------ .../platform/onboard/host/device_runner.h | 7 --- .../onboard/host/pto_runtime_c_api.cpp | 54 ++++++------------- src/a2a3/platform/sim/host/device_runner.cpp | 13 ----- src/a2a3/platform/sim/host/device_runner.h | 7 --- .../platform/sim/host/pto_runtime_c_api.cpp | 43 +++++---------- .../host_build_graph/runtime/runtime.h | 34 ++++++++---- .../runtime/runtime.h | 13 ++--- .../platform/onboard/host/device_runner.cpp | 13 ----- .../onboard/host/pto_runtime_c_api.cpp | 50 +++++------------ src/a5/platform/sim/host/device_runner.cpp | 13 ----- .../platform/sim/host/pto_runtime_c_api.cpp | 43 +++++---------- .../host_build_graph/runtime/runtime.h | 30 ++++++++--- .../runtime/runtime.h | 14 ++--- 14 files changed, 117 insertions(+), 239 deletions(-) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index ef0556503..172fa2024 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -747,7 +747,6 @@ void DeviceRunner::print_handshake_results() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { -#ifdef RUNTIME_HAS_CALLABLE_ID // Per-callable_id path (callable.md): when run_prepared bound a known // callable_id, the SO bytes were already H2D'd at prepare_callable time. // We just stamp dev_orch_so on the runtime, plus mark `is_new` based on @@ -786,7 +785,6 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { ); return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID const void *host_so_data = runtime.pending_orch_so_data_; const size_t host_so_size = runtime.pending_orch_so_size_; @@ -844,7 +842,6 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return 0; } -#ifdef RUNTIME_HAS_CALLABLE_ID int DeviceRunner::register_prepared_callable( int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, std::vector> kernel_addrs @@ -993,23 +990,20 @@ int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t ca } runtime.replay_function_bin_addr(kv.first, kv.second); } -#ifdef RUNTIME_HOST_ORCH - // hbg: replay the cached host dlopen so bind_prepared_to_runtime_impl can - // invoke orch_func without redoing dlopen+dlsym. Guarded by - // RUNTIME_HOST_ORCH because trb's Runtime has no host-orch staging fields. + // Replay both paths unconditionally — the runtime carries staging fields + // for both trb (device-side dlopen via entry-symbol names) and hbg (host- + // side dlopen handle + fn ptr). 
Whichever set was populated by + // register_prepared_callable / register_prepared_callable_host_orch wins; + // the other set stays at its initial value (empty string / nullptr). runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; -#else - // trb: AICPU dlopens from device buffer using the entry-symbol names. runtime.set_device_orch_func_name(state.func_name.c_str()); runtime.set_device_orch_config_name(state.config_name.c_str()); -#endif // Stamp callable_id with is_new=false; prepare_orch_so refreshes the flag // with the authoritative first_sighting answer right before launch. runtime.set_active_callable_id(callable_id, /*is_new=*/false); return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID int DeviceRunner::finalize() { if (device_id_ == -1) { @@ -1037,11 +1031,7 @@ int DeviceRunner::finalize() { // legacy regression signal is preserved for callers that never went // through prepare_callable. if (!func_id_to_addr_.empty()) { -#ifdef RUNTIME_HAS_CALLABLE_ID const bool prepared_path_used = prepared_callable_path_used_; -#else - const bool prepared_path_used = false; -#endif if (prepared_path_used) { LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); } else { @@ -1070,7 +1060,6 @@ int DeviceRunner::finalize() { // Release any prepared-callable orch SO buffers that callers forgot to // unregister. Refcounts no longer matter at this point — the device is // about to be reset. -#ifdef RUNTIME_HAS_CALLABLE_ID for (auto &kv : orch_so_dedup_) { if (kv.second.dev_addr != nullptr) { mem_alloc_.free(kv.second.dev_addr); @@ -1080,7 +1069,6 @@ int DeviceRunner::finalize() { prepared_callables_.clear(); aicpu_seen_callable_ids_.clear(); aicpu_dlopen_total_ = 0; -#endif // RUNTIME_HAS_CALLABLE_ID // Cleanup performance profiling if (l2_perf_collector_.is_initialized()) { diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 5c6aa1715..998e197db 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -422,7 +422,6 @@ class DeviceRunner { */ void release_run_context(); -#ifdef RUNTIME_HAS_CALLABLE_ID /** * Stage a per-callable_id orchestration SO into device memory and remember * the supporting metadata (entry/config symbol names, kernel func_id ↔ @@ -515,10 +514,6 @@ class DeviceRunner { * path doesn't dlopen on every run. */ size_t host_dlopen_count() const { return host_dlopen_total_; } -#else // RUNTIME_HAS_CALLABLE_ID - size_t aicpu_dlopen_count() const { return 0; } - size_t host_dlopen_count() const { return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID private: // Internal state @@ -564,7 +559,6 @@ class DeviceRunner { // `aicpu_seen_callable_ids_` tracks which ids have already been delivered // to the AICPU at least once so prepare_orch_so can set // register_new_callable_id_ correctly on first sighting. -#ifdef RUNTIME_HAS_CALLABLE_ID struct PreparedCallableState { // trb path (AICPU dlopens orch SO from device buffer) uint64_t hash{0}; @@ -599,7 +593,6 @@ class DeviceRunner { // legacy-path "kernel still cached at finalize" leaks from prepared-path // kernels that legitimately live until finalize. bool prepared_callable_path_used_{false}; -#endif // RUNTIME_HAS_CALLABLE_ID // ACL lifecycle (process-wide). aclInit must run exactly once; ensure_acl_ready // gates it behind this flag. 
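The dispatch rule this commit introduces condenses to a few lines. The following is a schematic model under invented names (PreparedState, Runner): one record carries both variants' staging fields at their defaults, and the consumer picks a path by checking which set is populated, mirroring the `pending_host_dlopen_handle_ != nullptr` test in prepare_callable.

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class PreparedState:
        # trb fields: AICPU dlopens the staged SO and dlsyms these names
        func_name: str = ""
        config_name: str = ""
        # hbg fields: host already dlopened; handle + fn pointer are cached
        host_dlopen_handle: Optional[object] = None
        host_orch_func_ptr: Optional[object] = None

    class Runner:
        def register_host_orch(self, handle, fn_ptr):
            return "hbg"

        def register_device_orch(self, func_name, config_name):
            return "trb"

    def register(runner, state):
        # Branch at runtime on which staging fields were populated, not via #ifdef.
        if state.host_dlopen_handle is not None:
            return runner.register_host_orch(state.host_dlopen_handle,
                                             state.host_orch_func_ptr)
        return runner.register_device_orch(state.func_name, state.config_name)

    assert register(Runner(), PreparedState(host_dlopen_handle=object())) == "hbg"
    assert register(Runner(), PreparedState(func_name="build_example_graph")) == "trb"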
finalize() drives aclFinalize only if we observed diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index c2c655184..6da94df42 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -42,10 +42,8 @@ extern "C" { * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); -#ifdef RUNTIME_HAS_CALLABLE_ID int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); -#endif int validate_runtime_impl(Runtime *runtime); /* =========================================================================== @@ -306,7 +304,6 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { * Per-callable_id preparation (callable.md design) * =========================================================================== */ -#ifdef RUNTIME_HAS_CALLABLE_ID int prepare_callable( DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size @@ -363,22 +360,22 @@ int prepare_callable( // they belong to the prepared state now. r->clear_registered_kernels(); -#ifdef RUNTIME_HOST_ORCH - // hbg: prepare_callable_impl already dlopen+dlsym'd on the host. Hand - // the handle/fn pointer over to DeviceRunner; it owns the dlopen - // lifetime from here until unregister_prepared_callable. - rc = runner->register_prepared_callable_host_orch( - callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) - ); - // Clear staging so the Runtime destructor does not see them again. - r->pending_host_dlopen_handle_ = nullptr; - r->pending_host_orch_func_ptr_ = nullptr; -#else - rc = runner->register_prepared_callable( - callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), - r->get_device_orch_config_name(), std::move(kernel_addrs) - ); -#endif + // Pick the path by inspecting which staging fields the runtime carries: + // hbg's prepare_callable_impl populates pending_host_dlopen_handle_; + // trb's leaves it null and instead populates pending_orch_so_data_ + + // device_orch_func_name_/config_name_. + if (r->pending_host_dlopen_handle_ != nullptr) { + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; + } else { + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + } return rc; } catch (...) { return -1; @@ -465,25 +462,6 @@ int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { return -1; } } -#else // RUNTIME_HAS_CALLABLE_ID -// Stubs so the dlsym surface is uniform across runtime variants. ChipWorker -// resolves these unconditionally; variants that lack callable.md support -// reject the calls at runtime instead of failing to load the library. 
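The stub blocks being deleted here existed only so ChipWorker could dlsym every entry point unconditionally. A toy model of that scheme, with a plain dict standing in for a DSO's export table (this load_symbol is invented and unrelated to the real loader):

    def load_symbol(exports, name):
        """Resolve unconditionally; a missing symbol becomes a fail-at-call stub."""
        def unsupported(*_args, **_kwargs):
            raise RuntimeError(f"{name} not supported by this runtime variant")
        return exports.get(name, unsupported)

    full_variant = {"prepare_callable": lambda *a: 0}
    legacy_variant = {}

    assert load_symbol(full_variant, "prepare_callable")() == 0
    try:
        load_symbol(legacy_variant, "prepare_callable")()
    except RuntimeError:
        pass  # the library still loads; only the call fails

Phase 3 makes the stubs redundant because every variant now exports the real implementations.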
-int prepare_callable( - DeviceContextHandle, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t -) { - LOG_ERROR("prepare_callable not supported by this runtime variant"); - return -1; -} -int run_prepared( - DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, - size_t, int, int, int, const char * -) { - LOG_ERROR("run_prepared not supported by this runtime variant"); - return -1; -} -int unregister_callable(DeviceContextHandle, int32_t) { return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { if (ctx == NULL) return 0; diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 20596afa5..f9068bcb6 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -676,7 +676,6 @@ void DeviceRunner::unload_executor_binaries() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { -#ifdef RUNTIME_HAS_CALLABLE_ID // Per-callable_id path (callable.md): mirror onboard. Bytes were staged // at register_prepared_callable time; here we only stamp metadata onto // the runtime and resolve `register_new_callable_id_` from first sighting. @@ -709,7 +708,6 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { ); return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID const void *host_so_data = runtime.pending_orch_so_data_; const size_t host_so_size = runtime.pending_orch_so_size_; @@ -758,7 +756,6 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return 0; } -#ifdef RUNTIME_HAS_CALLABLE_ID int DeviceRunner::register_prepared_callable( int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, std::vector> kernel_addrs @@ -896,17 +893,13 @@ int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t ca } runtime.replay_function_bin_addr(kv.first, kv.second); } -#ifdef RUNTIME_HOST_ORCH runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; -#else runtime.set_device_orch_func_name(state.func_name.c_str()); runtime.set_device_orch_config_name(state.config_name.c_str()); -#endif runtime.set_active_callable_id(callable_id, /*is_new=*/false); return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID int DeviceRunner::finalize() { // Skip if already finalized @@ -937,11 +930,7 @@ int DeviceRunner::finalize() { // leaves them resident across runs and relies on finalize() to reclaim // them; that is not a leak. if (!func_id_to_addr_.empty()) { -#ifdef RUNTIME_HAS_CALLABLE_ID const bool prepared_path_used = prepared_callable_path_used_; -#else - const bool prepared_path_used = false; -#endif if (prepared_path_used) { LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); } else { @@ -969,7 +958,6 @@ int DeviceRunner::finalize() { host_orch_so_copy_.shrink_to_fit(); // Release any prepared-callable orch SO buffers callers forgot to drop. 
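The first-sighting bookkeeping referenced above reduces to a seen-set plus a monotonic counter. A schematic model (CallableTracker is an invented name) of how the is_new answer and the AICPU dlopen count relate on the trb path:

    class CallableTracker:
        """is_new only on first sighting; models register_new_callable_id_-style flags."""

        def __init__(self):
            self._seen = set()
            self.aicpu_dlopen_total = 0

        def stamp(self, callable_id):
            is_new = callable_id not in self._seen
            if is_new:
                self._seen.add(callable_id)
                self.aicpu_dlopen_total += 1  # AICPU dlopens once per new cid
            return is_new

    t = CallableTracker()
    assert t.stamp(0) is True   # first run of cid 0: AICPU must dlopen
    assert t.stamp(0) is False  # warm slot reused on every later run
    assert t.stamp(1) is True
    assert t.aicpu_dlopen_total == 2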
-#ifdef RUNTIME_HAS_CALLABLE_ID for (auto &kv : orch_so_dedup_) { if (kv.second.dev_addr != nullptr) { mem_alloc_.free(kv.second.dev_addr); @@ -979,7 +967,6 @@ int DeviceRunner::finalize() { prepared_callables_.clear(); aicpu_seen_callable_ids_.clear(); aicpu_dlopen_total_ = 0; -#endif // RUNTIME_HAS_CALLABLE_ID // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 4a1759054..6f943800a 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -212,7 +212,6 @@ class DeviceRunner { */ void remove_kernel_binary(int func_id); -#ifdef RUNTIME_HAS_CALLABLE_ID int register_prepared_callable( int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, std::vector> kernel_addrs @@ -229,10 +228,6 @@ class DeviceRunner { int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } size_t host_dlopen_count() const { return host_dlopen_total_; } -#else // RUNTIME_HAS_CALLABLE_ID - size_t aicpu_dlopen_count() const { return 0; } - size_t host_dlopen_count() const { return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID private: // Configuration @@ -257,7 +252,6 @@ class DeviceRunner { std::vector host_orch_so_copy_; // Per-callable_id prepared state (callable.md design). Mirrors onboard. -#ifdef RUNTIME_HAS_CALLABLE_ID struct PreparedCallableState { // trb path uint64_t hash{0}; @@ -288,7 +282,6 @@ class DeviceRunner { // legitimately staged by prepare_callable (which is owned until finalize // by design). bool prepared_callable_path_used_{false}; -#endif // RUNTIME_HAS_CALLABLE_ID // AICPU executor SO: load-once, matching onboard's binaries_loaded_ pattern. 
// The aicpu_executor g_aicpu_executor static lives inside the dlopen'd DSO; diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index da71f750b..5089c3858 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -38,10 +38,8 @@ extern "C" { * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); -#ifdef RUNTIME_HAS_CALLABLE_ID int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); -#endif int validate_runtime_impl(Runtime *runtime); /* =========================================================================== @@ -292,7 +290,6 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { * Per-callable_id preparation (callable.md design) * =========================================================================== */ -#ifdef RUNTIME_HAS_CALLABLE_ID int prepare_callable( DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size @@ -337,18 +334,18 @@ int prepare_callable( } r->clear_registered_kernels(); -#ifdef RUNTIME_HOST_ORCH - rc = runner->register_prepared_callable_host_orch( - callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) - ); - r->pending_host_dlopen_handle_ = nullptr; - r->pending_host_orch_func_ptr_ = nullptr; -#else - rc = runner->register_prepared_callable( - callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), - r->get_device_orch_config_name(), std::move(kernel_addrs) - ); -#endif + if (r->pending_host_dlopen_handle_ != nullptr) { + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; + } else { + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + } pthread_setspecific(g_runner_key, nullptr); return rc; } catch (...) 
{ @@ -437,22 +434,6 @@ int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { return -1; } } -#else // RUNTIME_HAS_CALLABLE_ID -int prepare_callable( - DeviceContextHandle, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t -) { - LOG_ERROR("prepare_callable not supported by this runtime variant"); - return -1; -} -int run_prepared( - DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, - size_t, int, int, int, const char * -) { - LOG_ERROR("run_prepared not supported by this runtime variant"); - return -1; -} -int unregister_callable(DeviceContextHandle, int32_t) { return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID size_t get_host_dlopen_count(DeviceContextHandle ctx) { if (ctx == NULL) return 0; diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index e48d4500e..ea5766f8e 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -29,17 +29,6 @@ #ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ #define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ -// Tells the shared platform layer (device_runner.{h,cpp}, -// pto_runtime_c_api.cpp) that this runtime variant participates in callable.md -// prepare_callable / run_prepared. hbg dlopens the orchestration SO on the -// host, so the platform layer routes register/unregister through the -// host_dlopen_handle branch instead of the AICPU per-cid orch_so_table_. -#define RUNTIME_HAS_CALLABLE_ID 1 -// Marks this variant as host-orchestrated: the platform layer's -// register/bind/unregister logic uses runtime->pending_host_dlopen_handle_ and -// pending_host_orch_func_ptr_, which only exist on hbg-shaped Runtimes. -#define RUNTIME_HOST_ORCH 1 - #include #include #include // for fprintf, printf @@ -504,6 +493,29 @@ class Runtime { void *pending_host_dlopen_handle_{nullptr}; void *pending_host_orch_func_ptr_{nullptr}; + // Device-orchestration entry/config symbol names (callable.md trb path). + // Always empty on this hbg variant — included for API parity so the shared + // platform layer can call set_device_orch_func_name unconditionally. 
+ char device_orch_func_name_[64]{}; + char device_orch_config_name_[64]{}; + + void set_device_orch_func_name(const char *name) { + device_orch_func_name_[0] = '\0'; + if (name) { + strncpy(device_orch_func_name_, name, sizeof(device_orch_func_name_) - 1); + device_orch_func_name_[sizeof(device_orch_func_name_) - 1] = '\0'; + } + } + const char *get_device_orch_func_name() const { return device_orch_func_name_; } + void set_device_orch_config_name(const char *name) { + device_orch_config_name_[0] = '\0'; + if (name) { + strncpy(device_orch_config_name_, name, sizeof(device_orch_config_name_) - 1); + device_orch_config_name_[sizeof(device_orch_config_name_) - 1] = '\0'; + } + } + const char *get_device_orch_config_name() const { return device_orch_config_name_; } + void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { dev_orch_so_addr_ = dev_addr; dev_orch_so_size_ = size; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 9bd847379..87764591a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -29,12 +29,6 @@ #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ -// This variant supports the per-callable_id dispatch protocol (callable.md). -// DeviceRunner and pto_runtime_c_api.cpp check this at compile time to guard -// callable_id-specific code paths so the same sources compile cleanly against -// variants that lack the protocol (host_build_graph, aicpu_build_graph). -#define RUNTIME_HAS_CALLABLE_ID 1 - #include #include #include // for fprintf, printf @@ -319,6 +313,13 @@ class Runtime { // the memcpy, but their values while running on device are irrelevant. const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; + + // Host-orchestration staging (callable.md hbg path). Always nullptr on + // this trb variant — included for API parity with host_build_graph so the + // shared platform layer can branch on `pending_host_dlopen_handle_ != + // nullptr` at runtime instead of via a build-time macro. + void *pending_host_dlopen_handle_{nullptr}; + void *pending_host_orch_func_ptr_{nullptr}; }; #endif // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 2d318d7fe..3892bbf3d 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -602,7 +602,6 @@ void DeviceRunner::print_handshake_results() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { -#ifdef RUNTIME_HAS_CALLABLE_ID // Per-callable_id path (callable.md): when run_prepared bound a known // callable_id, the SO bytes were already H2D'd at prepare_callable time. 
// We just stamp dev_orch_so on the runtime, plus mark `is_new` based on @@ -640,7 +639,6 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { ); return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID const void *host_so_data = runtime.pending_orch_so_data_; const size_t host_so_size = runtime.pending_orch_so_size_; @@ -693,7 +691,6 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return 0; } -#ifdef RUNTIME_HAS_CALLABLE_ID int DeviceRunner::register_prepared_callable( int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, std::vector> kernel_addrs @@ -843,19 +840,15 @@ int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t ca } runtime.replay_function_bin_addr(kv.first, kv.second); } -#ifdef RUNTIME_HOST_ORCH runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; -#else runtime.set_device_orch_func_name(state.func_name.c_str()); runtime.set_device_orch_config_name(state.config_name.c_str()); -#endif // Stamp callable_id with is_new=false; prepare_orch_so refreshes the flag // with the authoritative first_sighting answer right before launch. runtime.set_active_callable_id(callable_id, /*is_new=*/false); return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID int DeviceRunner::finalize() { if (device_id_ == -1) { @@ -883,11 +876,7 @@ int DeviceRunner::finalize() { // legacy regression signal is preserved for callers that never went // through prepare_callable. if (!func_id_to_addr_.empty()) { -#ifdef RUNTIME_HAS_CALLABLE_ID const bool prepared_path_used = prepared_callable_path_used_; -#else - const bool prepared_path_used = false; -#endif if (prepared_path_used) { LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); } else { @@ -914,7 +903,6 @@ int DeviceRunner::finalize() { // Release any prepared-callable orch SO buffers that callers forgot to // unregister. Refcounts no longer matter at this point — the device is // about to be reset. 
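The table swept here is a refcounted dedup map: callable_ids registered against byte-identical orch SOs share one device buffer, and finalize() reclaims whatever refcounts failed to release. A rough model follows; the real key is the `hash` field of PreparedCallableState, whose hash function is not shown in this series, so the SHA-256 below is purely a placeholder.

    import hashlib

    class OrchSoDedup:
        """Refcounted device buffers keyed by SO content hash (sketch only)."""

        def __init__(self):
            self._table = {}  # digest -> [refcount, dev_buf]

        def register(self, so_bytes):
            key = hashlib.sha256(so_bytes).digest()
            entry = self._table.setdefault(key, [0, bytearray(so_bytes)])
            entry[0] += 1
            return entry[1]

        def unregister(self, so_bytes):
            key = hashlib.sha256(so_bytes).digest()
            entry = self._table[key]
            entry[0] -= 1
            if entry[0] == 0:
                del self._table[key]  # refcount hit zero: free the device copy

        def finalize(self):
            self._table.clear()  # device resets; forgotten entries reclaimed

    d = OrchSoDedup()
    buf1 = d.register(b"\x7fELF-orch")
    buf2 = d.register(b"\x7fELF-orch")  # identical SO under a second cid
    assert buf1 is buf2 and len(d._table) == 1
    d.finalize()
    assert len(d._table) == 0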
-#ifdef RUNTIME_HAS_CALLABLE_ID for (auto &kv : orch_so_dedup_) { if (kv.second.dev_addr != nullptr) { mem_alloc_.free(kv.second.dev_addr); @@ -924,7 +912,6 @@ int DeviceRunner::finalize() { prepared_callables_.clear(); aicpu_seen_callable_ids_.clear(); aicpu_dlopen_total_ = 0; -#endif // RUNTIME_HAS_CALLABLE_ID // Cleanup performance profiling (frees L2PerfSetupHeader + all per-core/per-thread buffers) if (l2_perf_collector_.is_initialized()) { diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 2cd9f07be..17dc618d3 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -42,10 +42,8 @@ extern "C" { * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); -#ifdef RUNTIME_HAS_CALLABLE_ID int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); -#endif int validate_runtime_impl(Runtime *runtime); /* =========================================================================== @@ -338,13 +336,8 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { } /* =========================================================================== * Per-callable_id preparation (callable.md design) - * - * Variants that define RUNTIME_HAS_CALLABLE_ID get the real prepare/run_prepared - * implementation; others fall back to dlsym-resolvable stubs that fail loudly - * at call time so ChipWorker's unconditional symbol resolution still succeeds. * =========================================================================== */ -#ifdef RUNTIME_HAS_CALLABLE_ID int prepare_callable( DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size @@ -401,18 +394,18 @@ int prepare_callable( // they belong to the prepared state now. r->clear_registered_kernels(); -#ifdef RUNTIME_HOST_ORCH - rc = runner->register_prepared_callable_host_orch( - callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) - ); - r->pending_host_dlopen_handle_ = nullptr; - r->pending_host_orch_func_ptr_ = nullptr; -#else - rc = runner->register_prepared_callable( - callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), - r->get_device_orch_config_name(), std::move(kernel_addrs) - ); -#endif + if (r->pending_host_dlopen_handle_ != nullptr) { + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; + } else { + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + } return rc; } catch (...) { return -1; @@ -499,25 +492,6 @@ int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { return -1; } } -#else // RUNTIME_HAS_CALLABLE_ID -// Stubs so the dlsym surface is uniform across runtime variants. 
ChipWorker -// resolves these unconditionally; variants that lack callable.md support -// reject the calls at runtime instead of failing to load the library. -int prepare_callable( - DeviceContextHandle, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t -) { - LOG_ERROR("prepare_callable not supported by this runtime variant"); - return -1; -} -int run_prepared( - DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, - size_t, int, int, int, const char * -) { - LOG_ERROR("run_prepared not supported by this runtime variant"); - return -1; -} -int unregister_callable(DeviceContextHandle, int32_t) { return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID size_t get_host_dlopen_count(DeviceContextHandle ctx) { if (ctx == NULL) return 0; diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index c926f6b36..118a0849d 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -654,7 +654,6 @@ void DeviceRunner::unload_executor_binaries() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { -#ifdef RUNTIME_HAS_CALLABLE_ID // Per-callable_id path (callable.md): mirror onboard. Bytes were staged // at register_prepared_callable time; here we only stamp metadata onto // the runtime and resolve `register_new_callable_id_` from first sighting. @@ -686,7 +685,6 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { ); return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID const void *host_so_data = runtime.pending_orch_so_data_; const size_t host_so_size = runtime.pending_orch_so_size_; @@ -735,7 +733,6 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { return 0; } -#ifdef RUNTIME_HAS_CALLABLE_ID int DeviceRunner::register_prepared_callable( int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, std::vector> kernel_addrs @@ -869,17 +866,13 @@ int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t ca } runtime.replay_function_bin_addr(kv.first, kv.second); } -#ifdef RUNTIME_HOST_ORCH runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; -#else runtime.set_device_orch_func_name(state.func_name.c_str()); runtime.set_device_orch_config_name(state.config_name.c_str()); -#endif runtime.set_active_callable_id(callable_id, /*is_new=*/false); return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID int DeviceRunner::finalize() { // Skip if already finalized @@ -922,11 +915,7 @@ int DeviceRunner::finalize() { // leaves them resident across runs and relies on finalize() to reclaim // them; that is not a leak. if (!func_id_to_addr_.empty()) { -#ifdef RUNTIME_HAS_CALLABLE_ID const bool prepared_path_used = prepared_callable_path_used_; -#else - const bool prepared_path_used = false; -#endif if (prepared_path_used) { LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); } else { @@ -954,7 +943,6 @@ int DeviceRunner::finalize() { host_orch_so_copy_.shrink_to_fit(); // Release any prepared-callable orch SO buffers callers forgot to drop. 
-#ifdef RUNTIME_HAS_CALLABLE_ID for (auto &kv : orch_so_dedup_) { if (kv.second.dev_addr != nullptr) { mem_alloc_.free(kv.second.dev_addr); @@ -964,7 +952,6 @@ int DeviceRunner::finalize() { prepared_callables_.clear(); aicpu_seen_callable_ids_.clear(); aicpu_dlopen_total_ = 0; -#endif // RUNTIME_HAS_CALLABLE_ID // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 40ab89134..0c08b395d 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -38,10 +38,8 @@ extern "C" { * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); -#ifdef RUNTIME_HAS_CALLABLE_ID int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); -#endif int validate_runtime_impl(Runtime *runtime); /* =========================================================================== @@ -291,7 +289,6 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { * Per-callable_id preparation (callable.md design) * =========================================================================== */ -#ifdef RUNTIME_HAS_CALLABLE_ID int prepare_callable( DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size @@ -334,18 +331,18 @@ int prepare_callable( } r->clear_registered_kernels(); -#ifdef RUNTIME_HOST_ORCH - rc = runner->register_prepared_callable_host_orch( - callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) - ); - r->pending_host_dlopen_handle_ = nullptr; - r->pending_host_orch_func_ptr_ = nullptr; -#else - rc = runner->register_prepared_callable( - callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), - r->get_device_orch_config_name(), std::move(kernel_addrs) - ); -#endif + if (r->pending_host_dlopen_handle_ != nullptr) { + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; + } else { + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + } pthread_setspecific(g_runner_key, nullptr); return rc; } catch (...) 
{ @@ -434,22 +431,6 @@ int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { return -1; } } -#else // RUNTIME_HAS_CALLABLE_ID -int prepare_callable( - DeviceContextHandle, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t -) { - LOG_ERROR("prepare_callable not supported by this runtime variant"); - return -1; -} -int run_prepared( - DeviceContextHandle, RuntimeHandle, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, - size_t, int, int, int, const char * -) { - LOG_ERROR("run_prepared not supported by this runtime variant"); - return -1; -} -int unregister_callable(DeviceContextHandle, int32_t) { return 0; } -#endif // RUNTIME_HAS_CALLABLE_ID size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { if (ctx == NULL) return 0; diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index 9e6abcccf..54ed30075 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -29,13 +29,6 @@ #ifndef SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ #define SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ -// Mirrors a2a3/host_build_graph: tells the shared platform layer this variant -// implements callable.md, with the hbg branch (host dlopen of the orch SO). -#define RUNTIME_HAS_CALLABLE_ID 1 -// Marks this variant as host-orchestrated: the platform layer's -// register/bind/unregister logic uses host_dlopen_handle/host_orch_func_ptr. -#define RUNTIME_HOST_ORCH 1 - #include #include #include // for fprintf, printf @@ -503,6 +496,29 @@ class Runtime { void *pending_host_dlopen_handle_{nullptr}; void *pending_host_orch_func_ptr_{nullptr}; + // Device-orchestration entry/config symbol names (callable.md trb path). + // Always empty on this hbg variant — included for API parity so the shared + // platform layer can call set_device_orch_func_name unconditionally. + char device_orch_func_name_[64]{}; + char device_orch_config_name_[64]{}; + + void set_device_orch_func_name(const char *name) { + device_orch_func_name_[0] = '\0'; + if (name) { + strncpy(device_orch_func_name_, name, sizeof(device_orch_func_name_) - 1); + device_orch_func_name_[sizeof(device_orch_func_name_) - 1] = '\0'; + } + } + const char *get_device_orch_func_name() const { return device_orch_func_name_; } + void set_device_orch_config_name(const char *name) { + device_orch_config_name_[0] = '\0'; + if (name) { + strncpy(device_orch_config_name_, name, sizeof(device_orch_config_name_) - 1); + device_orch_config_name_[sizeof(device_orch_config_name_) - 1] = '\0'; + } + } + const char *get_device_orch_config_name() const { return device_orch_config_name_; } + void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { dev_orch_so_addr_ = dev_addr; dev_orch_so_size_ = size; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 8b2ee4a73..1f388a43e 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -29,13 +29,6 @@ #ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ #define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ -// This variant supports the per-callable_id dispatch protocol (callable.md). 
-// DeviceRunner and pto_runtime_c_api.cpp check this at compile time to guard -// callable_id-specific code paths so the same sources compile cleanly against -// variants that lack the protocol (host_build_graph). This guard is removed -// in a later phase once every variant implements the protocol. -#define RUNTIME_HAS_CALLABLE_ID 1 - #include #include #include // for fprintf, printf @@ -331,6 +324,13 @@ class Runtime { // buffer metadata (dev_orch_so_addr_, ..., has_new_orch_so_). const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; + + // Host-orchestration staging (callable.md hbg path). Always nullptr on + // this trb variant — included for API parity with host_build_graph so the + // shared platform layer can branch on `pending_host_dlopen_handle_ != + // nullptr` at runtime instead of via a build-time macro. + void *pending_host_dlopen_handle_{nullptr}; + void *pending_host_orch_func_ptr_{nullptr}; }; #endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ From e15489d9e49823a5af583803f22724ac57278181 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 8 May 2026 11:18:18 +0800 Subject: [PATCH 19/28] =?UTF-8?q?refactor(callable):=20Phase=203=20?= =?UTF-8?q?=E2=80=94=20drop=20Python=20fallback=20to=20legacy=20run()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that all four runtime variants implement prepare_callable / run_prepared end-to-end, Worker no longer needs a fallback to the legacy chip_worker.run(callable, args, cfg) lower-level binding when the runtime returned -1 from the C ABI stub. Worker.py removals: - _PREPARED_CALLABLE_DISABLED_ENV / _prepared_callable_disabled() and the PTO2_DISABLE_PREPARED_CALLABLE env-var rollback knob. - _l2_use_prepared field, _l2_prepare() method, and the conditional prepare-then-fallback dance in register() / _init_level2() / run(). - prepared_unsupported flag and _run_legacy() in both _chip_process_loop and _chip_process_loop_with_bootstrap. Both helpers now have a simpler _ensure_prepared() that always prepares-or-raises. Worker.run(L2) and the chip_process loops now always go through run_prepared. A registered ChipCallable that fails to prepare now surfaces the underlying RuntimeError instead of silently rerouting. Verified: tests/ut/py/test_chip_worker.py 15 pass, tests/ut/py/test_worker/ 65 pass + 3 hardware skipped, hbg prepared_callable ST 5×2 pass, a2a3/trb vector_example regression passes. --- python/simpler/worker.py | 151 ++++++--------------------------------- 1 file changed, 22 insertions(+), 129 deletions(-) diff --git a/python/simpler/worker.py b/python/simpler/worker.py index e3be2d70b..8f0fc9ed7 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -87,16 +87,6 @@ def my_l4_orch(orch, args, config): _BOOTSTRAP_WAIT_TIMEOUT_S = 120.0 _BOOTSTRAP_POLL_INTERVAL_S = 0.001 -# Stage 4 (callable.md): rollback knob. When set to "1" the L2 path skips -# `prepare_callable` at init() and `Worker.run` falls back to the legacy -# `_chip_worker.run(callable, args, cfg)` lower-level binding. L3+ paths -# are unaffected — the cid mailbox protocol does not have a legacy fallback. 
-_PREPARED_CALLABLE_DISABLED_ENV = "PTO2_DISABLE_PREPARED_CALLABLE" - - -def _prepared_callable_disabled() -> bool: - return os.environ.get(_PREPARED_CALLABLE_DISABLED_ENV, "") == "1" - # --------------------------------------------------------------------------- # Unified mailbox layout (must match worker_manager.h MAILBOX_OFF_*) @@ -321,43 +311,15 @@ def _chip_process_loop( # pre-warms via _CTRL_PREPARE, but TASK_READY also lazy-prepares as a # safety net (e.g. registrations that bypassed the prefetch path). prepared: set[int] = set() - # Some runtime variants (e.g. a5 onboard) ship `prepare_callable` / - # `run_prepared` as stubs that return -1. When the very first - # prepare_callable raises, flip this flag and fall back to the legacy - # `cw.run(callable_obj, args, cfg)` path for every subsequent task. - # The fallback is a transitional shim — once every variant implements - # the prepared path it can go away. - prepared_unsupported = False - - def _ensure_prepared(cid: int) -> bool: - """Return True iff `cid` is ready for `run_prepared`. - - Returns False (and sets `prepared_unsupported` in the enclosing - scope) when the runtime variant does not implement - `prepare_callable`, signalling the caller to take the legacy path. - """ - nonlocal prepared_unsupported - if prepared_unsupported: - return False + + def _ensure_prepared(cid: int) -> None: if cid in prepared: - return True + return callable_obj = registry.get(cid) if callable_obj is None: raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry") - try: - cw.prepare_callable(cid, callable_obj) - except RuntimeError: - prepared_unsupported = True - return False + cw.prepare_callable(cid, callable_obj) prepared.add(cid) - return True - - def _run_legacy(cid: int, args, cfg) -> None: - """Legacy path: resolve cid back to its ChipCallable and run it.""" - callable_obj = registry.get(cid) - if callable_obj is None: - raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry") - cw.run(callable_obj, args, cfg) while True: state = _mailbox_load_i32(state_addr) @@ -369,10 +331,8 @@ def _run_legacy(cid: int, args, cfg) -> None: msg = "" try: args = _read_args_from_mailbox(buf) - if _ensure_prepared(cid): - cw.run_prepared(cid, args, cfg) - else: - _run_legacy(cid, args, cfg) + _ensure_prepared(cid) + cw.run_prepared(cid, args, cfg) except Exception as e: # noqa: BLE001 code = 1 msg = _format_exc(f"chip_process dev={device_id}", e) @@ -475,32 +435,15 @@ def _chip_process_loop_with_bootstrap( # noqa: PLR0912 # Per-child set of cids already prepared on this device (Stage 3, # callable.md). Mirrors `_chip_process_loop`'s `prepared`. prepared: set[int] = set() - # Variants without per-cid prepare/run_prepared (e.g. a5 onboard) trip - # this on the very first prepare and the loop falls back to legacy run. 
- prepared_unsupported = False - - def _ensure_prepared(cid: int) -> bool: - nonlocal prepared_unsupported - if prepared_unsupported: - return False + + def _ensure_prepared(cid: int) -> None: if cid in prepared: - return True + return callable_obj = registry.get(cid) if callable_obj is None: raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry") - try: - cw._impl.prepare_callable(cid, callable_obj) - except RuntimeError: - prepared_unsupported = True - return False + cw._impl.prepare_callable(cid, callable_obj) prepared.add(cid) - return True - - def _run_legacy(cid: int, args, cfg) -> None: - callable_obj = registry.get(cid) - if callable_obj is None: - raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry") - cw._impl.run(callable_obj, args, cfg) try: while True: @@ -513,10 +456,8 @@ def _run_legacy(cid: int, args, cfg) -> None: msg = "" try: args = _read_args_from_mailbox(buf) - if _ensure_prepared(cid): - cw._impl.run_prepared(cid, args, cfg) - else: - _run_legacy(cid, args, cfg) + _ensure_prepared(cid) + cw._impl.run_prepared(cid, args, cfg) except Exception as e: # noqa: BLE001 code = 1 msg = _format_exc(f"chip_process dev={device_id}", e) @@ -676,12 +617,6 @@ def __init__( # Level-2 internals self._chip_worker: Optional[ChipWorker] = None - # Stage 4 (callable.md): when the bound runtime variant supports - # prepare_callable, L2 routes Worker.run(cid) through run_prepared; - # otherwise the cid is resolved to its ChipCallable and the legacy - # lower-level binding handles dispatch. Decided lazily in - # `_init_level2` (and toggled by `PTO2_DISABLE_PREPARED_CALLABLE`). - self._l2_use_prepared = False # Level-3+ internals self._worker: Optional[_Worker] = None @@ -717,23 +652,6 @@ def __init__( # Callable registration (before init) # ------------------------------------------------------------------ - def _l2_prepare(self, cid: int, target) -> bool: - """Try to pre-warm a ChipCallable on the L2 device. - - Returns True on success. When the bound runtime variant lacks - prepare_callable support (e.g. host_build_graph / aicpu_build_graph, - which return -1 from the C ABI stub), flips ``_l2_use_prepared`` - off and returns False so the caller stops trying further cids and - Worker.run falls back to the legacy lower-level binding. - """ - assert self._chip_worker is not None - try: - self._chip_worker.prepare_callable(cid, target) - return True - except RuntimeError: - self._l2_use_prepared = False - return False - def register(self, target) -> int: """Register a callable. Returns the cid passed to ``run`` / ``submit_*``. @@ -752,10 +670,6 @@ def register(self, target) -> int: no COW constraint). When called post-init, ChipCallables are prepared on the device immediately; pre-init registrations are batched and prepared at the end of ``init()``. - - Both pre-warm steps are skipped under - ``PTO2_DISABLE_PREPARED_CALLABLE=1``; ``Worker.run`` then - falls back to the legacy lower-level ``_chip_worker.run`` binding. """ if self.level >= 3 and self._initialized: raise RuntimeError( @@ -766,12 +680,10 @@ def register(self, target) -> int: self._callable_registry[cid] = target # L2 post-init: pre-warm immediately so the very first - # `Worker.run(cid, …)` is a clean cache hit. When the runtime - # does not support prepare_callable (Stage 2 stub variants), - # `_l2_prepare` flips `_l2_use_prepared` off and `Worker.run` - # silently falls back to the legacy binding. 
- if self.level == 2 and self._initialized and isinstance(target, ChipCallable) and self._l2_use_prepared: - self._l2_prepare(cid, target) + # `Worker.run(cid, …)` is a clean cache hit. + if self.level == 2 and self._initialized and isinstance(target, ChipCallable): + assert self._chip_worker is not None + self._chip_worker.prepare_callable(cid, target) return cid def add_worker(self, worker: "Worker") -> None: @@ -829,18 +741,11 @@ def _init_level2(self) -> None: self._chip_worker.set_device(device_id) # Stage 4 (callable.md): pre-warm any registered ChipCallable so the - # first run(cid, …) does not pay the H2D upload cost. Skipped under - # the rollback env var so the legacy `_chip_worker.run(callable, …)` - # path stays viable for emergency triage. The flag also flips when - # the bound runtime variant lacks prepare_callable support - # (host_build_graph / aicpu_build_graph still return -1 — see Stage 2): - # the first prepare attempt sets `_l2_use_prepared = False` and every - # subsequent run() goes through the legacy lower-level binding. - self._l2_use_prepared = not _prepared_callable_disabled() - if self._l2_use_prepared: - for cid, target in self._callable_registry.items(): - if isinstance(target, ChipCallable) and not self._l2_prepare(cid, target): - break + # first run(cid, …) does not pay the H2D upload cost. + assert self._chip_worker is not None + for cid, target in self._callable_registry.items(): + if isinstance(target, ChipCallable): + self._chip_worker.prepare_callable(cid, target) def _init_hierarchical(self) -> None: device_ids = self._config.get("device_ids", []) @@ -1235,10 +1140,6 @@ def run(self, callable, args=None, config=None) -> None: Stage 4 (callable.md): - L2: ``callable`` is a cid returned by ``Worker.register(chip_callable)``. Routes to ``_chip_worker.run_prepared(cid, args, cfg)``. - Under ``PTO2_DISABLE_PREPARED_CALLABLE=1`` the cid is resolved - back to the registered ``ChipCallable`` and the legacy - ``_chip_worker.run(callable, args, cfg)`` lower-level binding - is invoked instead. - L3+: ``callable`` is a Python orch fn invoked with the ``Orchestrator`` handle. @@ -1250,15 +1151,7 @@ def run(self, callable, args=None, config=None) -> None: if self.level == 2: assert self._chip_worker is not None - if not self._l2_use_prepared: - # Rollback / unsupported-runtime path: resolve cid back to - # its ChipCallable and call the legacy lower-level binding. - target = self._callable_registry.get(int(callable)) - if target is None: - raise KeyError(f"Worker.run: cid {int(callable)} not found in registry") - self._chip_worker.run(target, args, cfg) - else: - self._chip_worker.run_prepared(int(callable), args, cfg) + self._chip_worker.run_prepared(int(callable), args, cfg) else: self._start_hierarchical() assert self._orch is not None From b976a90bb1bbee911573def16fba29cdefc68386 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 8 May 2026 11:46:30 +0800 Subject: [PATCH 20/28] =?UTF-8?q?refactor(callable):=20Phase=204=20?= =?UTF-8?q?=E2=80=94=20drop=20run=5Fruntime=20/=20init=5Fruntime=5Fimpl=20?= =?UTF-8?q?legacy=20ABI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that all four variants implement prepare_callable / run_prepared and the Python fallback to the legacy callable-buffer path is gone, the single-call C ABI it relied on is dead weight. 
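For orientation, the caller-side idiom that survives, sketched against the C ABI
signatures kept in pto_runtime_c_api.h (handle and buffer names here are
placeholders, diagnostics zeroed):

    // Before (deleted below): one fat call per launch; the callable buffer is
    // re-described every time and the per-run kernel / orch SO uploads are
    // paid on each call.
    run_runtime(ctx, rt, callable, args, block_dim, threads, dev,
                aicpu, aicpu_sz, aicore, aicore_sz, 0, 0, 0, "");

    // After: describe once per cid, then launch cheaply any number of times.
    prepare_callable(ctx, cid, callable, dev, aicpu, aicpu_sz, aicore, aicore_sz);
    run_prepared(ctx, rt, cid, args, block_dim, threads, dev,
                 aicpu, aicpu_sz, aicore, aicore_sz, 0, 0, 0, "");
    unregister_callable(ctx, cid);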
ChipWorker::run becomes a thin forwarder to run_prepared so the hierarchical IWorker contract is preserved; the cid still arrives via worker_manager packing s.callable_id into uint64. C++ removals: - 4 platform pto_runtime_c_api.cpp: drop run_runtime() definitions and the init_runtime_impl forward decls. - 4 runtime_maker.cpp: drop the init_runtime_impl compatibility shim that bundled prepare_callable_impl + bind_prepared_to_runtime_impl. - src/common/worker/pto_runtime_c_api.h: drop run_runtime declaration and refresh the file-header dlsym list / call-site references. - src/common/worker/chip_worker.{h,cpp}: * IWorker::run(uint64_t, ...) now reinterprets the uint64 as cid and delegates to run_prepared. * Drop ChipWorker::run(const void*, const void*, ...) overload, the RunRuntimeFn typedef, and run_runtime_fn_ dlsym. Python removals: - python/bindings/task_interface.cpp: remove the four legacy nanobind overloads (run / run / run_raw / run_from_blob); keep run_prepared / prepare_callable / unregister_callable. - python/simpler/task_interface.py: drop ChipWorker.run wrapper; usage doc updated to the prepare_callable + run_prepared idiom. - tests/ut/py/test_chip_worker.py: drop test_run_before_set_device_raises (test_run_prepared_before_set_device_raises already covers the same state-machine guard). Verified: 4 sim binaries compile, nanobind wheel rebuilds, tests/ut/py/test_chip_worker.py 14 pass + tests/ut/py/test_worker/ 65 pass + 3 hardware skipped, 4 variants × 5 prepared_callable ST = 20 pass, a2a3/trb vector_example + orch_so_cache regression pass. --- python/bindings/task_interface.cpp | 36 ---------- python/simpler/task_interface.py | 27 +------- .../onboard/host/pto_runtime_c_api.cpp | 66 ------------------ .../platform/sim/host/pto_runtime_c_api.cpp | 69 ------------------- .../host_build_graph/host/runtime_maker.cpp | 23 ------- .../host/runtime_maker.cpp | 13 ---- .../onboard/host/pto_runtime_c_api.cpp | 66 ------------------ .../platform/sim/host/pto_runtime_c_api.cpp | 69 ------------------- .../host_build_graph/host/runtime_maker.cpp | 23 ------- .../host/runtime_maker.cpp | 13 ---- src/common/worker/chip_worker.cpp | 32 ++------- src/common/worker/chip_worker.h | 21 ++---- src/common/worker/pto_runtime_c_api.h | 54 ++++----------- tests/ut/py/test_chip_worker.py | 13 ---- 14 files changed, 25 insertions(+), 500 deletions(-) diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 88862e66a..67647bfa5 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -622,42 +622,6 @@ NB_MODULE(_task_interface, m) { .def("set_device", &ChipWorker::set_device, nb::arg("device_id")) .def("reset_device", &ChipWorker::reset_device) .def("finalize", &ChipWorker::finalize) - .def( - "run", - [](ChipWorker &self, const PyChipCallable &callable, ChipStorageTaskArgs &args, const CallConfig &config) { - self.run(callable.buffer_.data(), &args, config); - }, - nb::arg("callable"), nb::arg("args"), nb::arg("config") - ) - .def( - "run", - [](ChipWorker &self, const PyChipCallable &callable, TaskArgs &args, const CallConfig &config) { - TaskArgsView view = make_view(args); - self.run(reinterpret_cast(callable.buffer_.data()), view, config); - }, - nb::arg("callable"), nb::arg("args"), nb::arg("config"), - "Launch a callable from a TaskArgs (used by chip child loops on " - "variants without prepare_callable support)." 
- ) - .def( - "run_raw", - [](ChipWorker &self, uint64_t callable, uint64_t args, const CallConfig &config) { - self.run(reinterpret_cast(callable), reinterpret_cast(args), config); - }, - nb::arg("callable"), nb::arg("args"), nb::arg("config"), - "Run with raw pointer arguments (used from forked chip process)." - ) - .def( - "run_from_blob", - [](ChipWorker &self, uint64_t callable, uint64_t blob_ptr, const CallConfig &config) { - TaskArgsView view = read_blob(reinterpret_cast(blob_ptr), MAILBOX_ARGS_CAPACITY); - self.run(callable, view, config); - }, - nb::arg("callable"), nb::arg("blob_ptr"), nb::arg("config"), - "Decode a length-prefixed TaskArgs blob ([T][S][tensors][scalars]) at " - "blob_ptr and dispatch to the runtime. Used from forked chip processes " - "reading the WorkerThread mailbox." - ) .def( "prepare_callable", [](ChipWorker &self, int32_t callable_id, const PyChipCallable &callable) { diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index 6362e90e3..ca6abad8e 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -240,7 +240,8 @@ class ChipWorker: aicpu_path="build/lib/.../aicpu.so", aicore_path="build/lib/.../aicore.o") worker.set_device(device_id=0) - worker.run(chip_callable, orch_args, block_dim=24) + worker.prepare_callable(cid=0, chip_callable) + worker.run_prepared(cid=0, orch_args, CallConfig(block_dim=24)) worker.reset_device() worker.finalize() """ @@ -305,30 +306,6 @@ def finalize(self): """ self._impl.finalize() - def run(self, callable, args, config=None, **kwargs): - """Execute a callable synchronously. - - Args: - callable: ChipCallable built from orchestration + kernel binaries. - args: ChipStorageTaskArgs for this invocation. - config: Optional CallConfig. If None, a default is created. - **kwargs: Overrides applied to config (e.g. block_dim=24). - """ - if config is None: - config = CallConfig() - for k, v in kwargs.items(): - setattr(config, k, v) - self._impl.run(callable, args, config) - - def run_from_blob(self, callable, blob_ptr, config): - """Execute via a serialized args blob in shared memory. - - Used by `_chip_process_loop` after reading the mailbox: instead of - deserializing the args into Python objects, the C++ side parses the - POD blob directly at `blob_ptr`. - """ - self._impl.run_from_blob(int(callable), int(blob_ptr), config) - def prepare_callable(self, callable_id, callable): """Stage a ChipCallable under ``callable_id`` for repeated cheap launches. 
diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 6da94df42..a3d883029 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -41,7 +41,6 @@ extern "C" { /* =========================================================================== * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); @@ -199,71 +198,6 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, - int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix -) { - if (ctx == NULL || runtime == NULL) return -1; - if (aicpu_binary == NULL || aicpu_size == 0 || aicore_binary == NULL || aicore_size == 0) return -1; - - DeviceRunner *runner = static_cast(ctx); - - pthread_once(&g_runner_key_once, create_runner_key); - pthread_setspecific(g_runner_key, ctx); - auto tsd_guard = RAIIScopeGuard([]() { - pthread_setspecific(g_runner_key, nullptr); - }); - - try { - int rc = runner->prepare_run_context(device_id); - if (rc != 0) return rc; - auto run_context_guard = RAIIScopeGuard([runner]() { - runner->release_run_context(); - }); - - Runtime *r = new (runtime) Runtime(); - r->host_api.device_malloc = device_malloc; - r->host_api.device_free = device_free; - r->host_api.copy_to_device = copy_to_device; - r->host_api.copy_from_device = copy_from_device; - r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; - r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; - - LOG_DEBUG("About to call init_runtime_impl, r=%p", (void *)r); - rc = init_runtime_impl( - r, reinterpret_cast(callable), reinterpret_cast(args) - ); - LOG_DEBUG("init_runtime_impl returned: %d", rc); - if (rc != 0) { - r->set_gm_sm_ptr(nullptr); - validate_runtime_impl(r); - r->~Runtime(); - return rc; - } - - runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); - runner->set_dump_tensor_enabled(enable_dump_tensor != 0); - runner->set_pmu_enabled(enable_pmu); - runner->set_output_prefix(output_prefix); - - std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); - std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); - rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); - if (rc != 0) { - validate_runtime_impl(r); - r->~Runtime(); - return rc; - } - - rc = validate_runtime_impl(r); - r->~Runtime(); - return rc; - } catch (...) 
{ - return -1; - } -} - int finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 5089c3858..3d0bba1fa 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -37,7 +37,6 @@ extern "C" { /* =========================================================================== * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); @@ -160,74 +159,6 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, - int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix -) { - if (ctx == NULL || runtime == NULL) return -1; - - pthread_once(&g_runner_key_once, create_runner_key); - pthread_setspecific(g_runner_key, ctx); - DeviceRunner *runner = static_cast(ctx); - - try { - // Phase 1: placement new + build graph - Runtime *r = new (runtime) Runtime(); - r->host_api.device_malloc = device_malloc; - r->host_api.device_free = device_free; - r->host_api.copy_to_device = copy_to_device; - r->host_api.copy_from_device = copy_from_device; - r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; - r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; - - int rc = init_runtime_impl( - r, reinterpret_cast(callable), reinterpret_cast(args) - ); - if (rc != 0) { - r->set_gm_sm_ptr(nullptr); - validate_runtime_impl(r); - r->~Runtime(); - pthread_setspecific(g_runner_key, nullptr); - return rc; - } - - // Phase 2: publish diagnostics enablement to the DeviceRunner so run() - // and its helpers can read the three sub-features uniformly (via - // members, not Runtime / run() args). - runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); - runner->set_dump_tensor_enabled(enable_dump_tensor != 0); - runner->set_pmu_enabled(enable_pmu); - runner->set_output_prefix(output_prefix); - - // Phase 3: launch - std::vector aicpu_vec; - std::vector aicore_vec; - if (aicpu_binary != NULL && aicpu_size > 0) { - aicpu_vec.assign(aicpu_binary, aicpu_binary + aicpu_size); - } - if (aicore_binary != NULL && aicore_size > 0) { - aicore_vec.assign(aicore_binary, aicore_binary + aicore_size); - } - rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); - if (rc != 0) { - validate_runtime_impl(r); - r->~Runtime(); - pthread_setspecific(g_runner_key, nullptr); - return rc; - } - - // Phase 4: finalize (copy results back) - rc = validate_runtime_impl(r); - r->~Runtime(); - pthread_setspecific(g_runner_key, nullptr); - return rc; - } catch (...) 
{ - pthread_setspecific(g_runner_key, nullptr); - return -1; - } -} - int finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index f75215b6e..7bcc1f50d 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -428,29 +428,6 @@ int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *o return 0; } -/** - * Compatibility shim: legacy single-call init_runtime_impl drives the existing - * run_runtime path. The callable.md split keeps it as - * prepare_callable_impl + bind_prepared_to_runtime_impl so legacy callers see - * one function while run_prepared reuses the prep half across runs. The shim - * dlcloses the orchestration SO immediately because legacy callers (no cid) - * never see register_prepared_callable. - */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - int rc = prepare_callable_impl(runtime, callable); - if (rc != 0) return rc; - - rc = bind_prepared_to_runtime_impl(runtime, orch_args); - - // Legacy path: orchestration SO is no longer needed once orch_func returned. - if (runtime->pending_host_dlopen_handle_ != nullptr) { - dlclose(runtime->pending_host_dlopen_handle_); - runtime->pending_host_dlopen_handle_ = nullptr; - runtime->pending_host_orch_func_ptr_ = nullptr; - } - return rc; -} - /** * Validate runtime results and cleanup. * diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index aac9072a2..99b31c07f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -306,19 +306,6 @@ extern "C" int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorage return 0; } -/** - * Compatibility shim: `init_runtime_impl` is the legacy single-call path that - * still drives every `run_runtime` invocation today. The callable.md split - * keeps it as `prepare_callable_impl + bind_prepared_to_runtime_impl` so the - * legacy path stays one function to platform code, while `run_prepared` can - * skip the prepare half once a callable_id is staged. - */ -extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - int rc = prepare_callable_impl(runtime, callable); - if (rc != 0) return rc; - return bind_prepared_to_runtime_impl(runtime, orch_args); -} - /** * Validate runtime results and cleanup. 
* diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 17dc618d3..11726fe9d 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -41,7 +41,6 @@ extern "C" { /* =========================================================================== * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); @@ -166,71 +165,6 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, - int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix -) { - if (ctx == NULL || runtime == NULL) return -1; - if (aicpu_binary == NULL || aicpu_size == 0 || aicore_binary == NULL || aicore_size == 0) return -1; - - DeviceRunner *runner = static_cast(ctx); - - pthread_once(&g_runner_key_once, create_runner_key); - pthread_setspecific(g_runner_key, ctx); - auto tsd_guard = RAIIScopeGuard([]() { - pthread_setspecific(g_runner_key, nullptr); - }); - - try { - int rc = runner->prepare_run_context(device_id); - if (rc != 0) return rc; - auto run_context_guard = RAIIScopeGuard([runner]() { - runner->release_run_context(); - }); - - Runtime *r = new (runtime) Runtime(); - r->host_api.device_malloc = device_malloc; - r->host_api.device_free = device_free; - r->host_api.copy_to_device = copy_to_device; - r->host_api.copy_from_device = copy_from_device; - r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; - r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; - - LOG_DEBUG("About to call init_runtime_impl, r=%p", (void *)r); - rc = init_runtime_impl( - r, reinterpret_cast(callable), reinterpret_cast(args) - ); - LOG_DEBUG("init_runtime_impl returned: %d", rc); - if (rc != 0) { - r->set_gm_sm_ptr(nullptr); - validate_runtime_impl(r); - r->~Runtime(); - return rc; - } - - runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); - runner->set_dump_tensor_enabled(enable_dump_tensor != 0); - runner->set_pmu_enabled(enable_pmu); - runner->set_output_prefix(output_prefix); - - std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); - std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); - rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); - if (rc != 0) { - validate_runtime_impl(r); - r->~Runtime(); - return rc; - } - - rc = validate_runtime_impl(r); - r->~Runtime(); - return rc; - } catch (...) 
{ - return -1; - } -} - int finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 0c08b395d..113bfdb07 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -37,7 +37,6 @@ extern "C" { /* =========================================================================== * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); @@ -160,74 +159,6 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, - int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix -) { - if (ctx == NULL || runtime == NULL) return -1; - - pthread_once(&g_runner_key_once, create_runner_key); - pthread_setspecific(g_runner_key, ctx); - DeviceRunner *runner = static_cast(ctx); - - try { - // Phase 1: placement new + build graph - Runtime *r = new (runtime) Runtime(); - r->host_api.device_malloc = device_malloc; - r->host_api.device_free = device_free; - r->host_api.copy_to_device = copy_to_device; - r->host_api.copy_from_device = copy_from_device; - r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; - r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; - - int rc = init_runtime_impl( - r, reinterpret_cast(callable), reinterpret_cast(args) - ); - if (rc != 0) { - r->set_gm_sm_ptr(nullptr); - validate_runtime_impl(r); - r->~Runtime(); - pthread_setspecific(g_runner_key, nullptr); - return rc; - } - - // Phase 2: publish diagnostics enablement to the DeviceRunner so run() - // and its helpers can read the three sub-features uniformly (via - // members, not Runtime / run() args). - runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); - runner->set_dump_tensor_enabled(enable_dump_tensor != 0); - runner->set_pmu_enabled(enable_pmu); - runner->set_output_prefix(output_prefix); - - // Phase 3: launch - std::vector aicpu_vec; - std::vector aicore_vec; - if (aicpu_binary != NULL && aicpu_size > 0) { - aicpu_vec.assign(aicpu_binary, aicpu_binary + aicpu_size); - } - if (aicore_binary != NULL && aicore_size > 0) { - aicore_vec.assign(aicore_binary, aicore_binary + aicore_size); - } - rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); - if (rc != 0) { - validate_runtime_impl(r); - r->~Runtime(); - pthread_setspecific(g_runner_key, nullptr); - return rc; - } - - // Phase 4: finalize (copy results back) - rc = validate_runtime_impl(r); - r->~Runtime(); - pthread_setspecific(g_runner_key, nullptr); - return rc; - } catch (...) 
{ - pthread_setspecific(g_runner_key, nullptr); - return -1; - } -} - int finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { diff --git a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp index f75215b6e..7bcc1f50d 100644 --- a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp @@ -428,29 +428,6 @@ int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *o return 0; } -/** - * Compatibility shim: legacy single-call init_runtime_impl drives the existing - * run_runtime path. The callable.md split keeps it as - * prepare_callable_impl + bind_prepared_to_runtime_impl so legacy callers see - * one function while run_prepared reuses the prep half across runs. The shim - * dlcloses the orchestration SO immediately because legacy callers (no cid) - * never see register_prepared_callable. - */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - int rc = prepare_callable_impl(runtime, callable); - if (rc != 0) return rc; - - rc = bind_prepared_to_runtime_impl(runtime, orch_args); - - // Legacy path: orchestration SO is no longer needed once orch_func returned. - if (runtime->pending_host_dlopen_handle_ != nullptr) { - dlclose(runtime->pending_host_dlopen_handle_); - runtime->pending_host_dlopen_handle_ = nullptr; - runtime->pending_host_orch_func_ptr_ = nullptr; - } - return rc; -} - /** * Validate runtime results and cleanup. * diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index ccd03d898..f84c26dcb 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -306,19 +306,6 @@ extern "C" int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorage return 0; } -/** - * Compatibility shim: `init_runtime_impl` is the legacy single-call path that - * still drives every `run_runtime` invocation today. The callable.md split - * keeps it as `prepare_callable_impl + bind_prepared_to_runtime_impl` so the - * legacy path stays one function to platform code, while `run_prepared` can - * skip the prepare half once a callable_id is staged. - */ -extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - int rc = prepare_callable_impl(runtime, callable); - if (rc != 0) return rc; - return bind_prepared_to_runtime_impl(runtime, orch_args); -} - /** * Validate runtime results and cleanup. 
 *
diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp
index 26874b163..70ec7591e 100644
--- a/src/common/worker/chip_worker.cpp
+++ b/src/common/worker/chip_worker.cpp
@@ -148,7 +148,6 @@ void ChipWorker::init(
     copy_to_device_ctx_fn_ = load_symbol<CopyToDeviceCtxFn>(handle, "copy_to_device_ctx");
     copy_from_device_ctx_fn_ = load_symbol<CopyFromDeviceCtxFn>(handle, "copy_from_device_ctx");
     get_runtime_size_fn_ = load_symbol<GetRuntimeSizeFn>(handle, "get_runtime_size");
-    run_runtime_fn_ = load_symbol<RunRuntimeFn>(handle, "run_runtime");
     simpler_init_fn_ = load_symbol<SimplerInitFn>(handle, "simpler_init");
     prepare_callable_fn_ = load_symbol<PrepareCallableFn>(handle, "prepare_callable");
     run_prepared_fn_ = load_symbol<RunPreparedFn>(handle, "run_prepared");
@@ -247,7 +246,6 @@ void ChipWorker::finalize() {
     copy_to_device_ctx_fn_ = nullptr;
     copy_from_device_ctx_fn_ = nullptr;
     get_runtime_size_fn_ = nullptr;
-    run_runtime_fn_ = nullptr;
     prepare_callable_fn_ = nullptr;
     run_prepared_fn_ = nullptr;
     unregister_callable_fn_ = nullptr;
@@ -271,30 +269,12 @@ void ChipWorker::finalize() {
 }
 
 void ChipWorker::run(uint64_t callable, TaskArgsView args, const CallConfig &config) {
-    // L2 ABI edge: assemble the fixed-size ChipStorageTaskArgs POD from the
-    // view and hand it to the runtime. This conversion used to happen at
-    // submit time (stored on the slot); it now runs lazily in the worker so
-    // the slot can carry a single TaskArgs irrespective of the destination.
-    ChipStorageTaskArgs chip_storage = view_to_chip_storage(args);
-    run(reinterpret_cast<const void *>(callable), &chip_storage, config);
-}
-
-void ChipWorker::run(const void *callable, const void *args, const CallConfig &config) {
-    config.validate();
-    if (!device_set_) {
-        throw std::runtime_error("ChipWorker device not set; call set_device() first");
-    }
-
-    void *rt = runtime_buf_.data();
-
-    int rc = run_runtime_fn_(
-        device_ctx_, rt, callable, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(),
-        aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_l2_swimlane,
-        config.enable_dump_tensor, config.enable_pmu, config.output_prefix
-    );
-    if (rc != 0) {
-        throw std::runtime_error("run_runtime failed with code " + std::to_string(rc));
-    }
+    // Stage 4 (callable.md): the hierarchical layer (worker_manager.cpp) packs
+    // the cid produced by Worker.register() into this uint64. ChipWorker
+    // treats it as such — it must already have been prepared via
+    // prepare_callable. The legacy "callable buffer ptr → run_runtime" path is
+    // gone.
+    run_prepared(static_cast<int32_t>(static_cast<uint32_t>(callable)), args, config);
 }
 
 void ChipWorker::prepare_callable(int32_t callable_id, const void *callable) {
diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h
index 7b699fc60..cf1d7b319 100644
--- a/src/common/worker/chip_worker.h
+++ b/src/common/worker/chip_worker.h
@@ -52,19 +52,13 @@ class ChipWorker : public IWorker {
   /// Terminal — the object cannot be reused after this.
   void finalize();
 
-  // IWorker: build a ChipStorageTaskArgs POD from `args` and execute the
-  // runtime synchronously. `callable` is a ChipCallable buffer pointer
-  // cast to uint64.
+  // IWorker: dispatch the cid `callable` (packed into uint64 by the
+  // hierarchical layer; see callable.md) by delegating to run_prepared.
+  // The cid must already have been prepared via prepare_callable.
void run(uint64_t callable, TaskArgsView args, const CallConfig &config) override; - // Direct invocation (used by Python wrapper and internal tests) — bypasses - // the TaskArgsView path and takes a ready-made ChipStorageTaskArgs POD. - void run(const void *callable, const void *args, const CallConfig &config); - - // Per-callable_id preparation (callable.md design). The runtime variant - // bound at init() may export real implementations or stubs that return - // -1; ChipWorker forwards the result to the caller. callable_id must be - // in [0, MAX_REGISTERED_CALLABLE_IDS) (cap 64). Requires set_device(). + // Per-callable_id preparation (callable.md design). Requires set_device() + // and a callable_id in [0, MAX_REGISTERED_CALLABLE_IDS) (cap 64). void prepare_callable(int32_t callable_id, const void *callable); void run_prepared(int32_t callable_id, TaskArgsView args, const CallConfig &config); void run_prepared(int32_t callable_id, const void *args, const CallConfig &config); @@ -122,10 +116,6 @@ class ChipWorker : public IWorker { using CopyToDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using CopyFromDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using GetRuntimeSizeFn = size_t (*)(); - using RunRuntimeFn = int (*)( - void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, - int, int, int, const char * - ); using SimplerInitFn = void (*)(void *, int, int); using PrepareCallableFn = int (*)(void *, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t); @@ -155,7 +145,6 @@ class ChipWorker : public IWorker { CopyToDeviceCtxFn copy_to_device_ctx_fn_ = nullptr; CopyFromDeviceCtxFn copy_from_device_ctx_fn_ = nullptr; GetRuntimeSizeFn get_runtime_size_fn_ = nullptr; - RunRuntimeFn run_runtime_fn_ = nullptr; SimplerInitFn simpler_init_fn_ = nullptr; PrepareCallableFn prepare_callable_fn_ = nullptr; RunPreparedFn run_prepared_fn_ = nullptr; diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index 780b0b24c..9e73e34c1 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -17,11 +17,12 @@ * * Public API — resolved by ChipWorker via dlsym: * create_device_context, destroy_device_context, - * get_runtime_size, set_device, run_runtime, finalize_device, - * device_malloc_ctx, device_free_ctx, copy_to_device_ctx, copy_from_device_ctx + * get_runtime_size, set_device, finalize_device, + * device_malloc_ctx, device_free_ctx, copy_to_device_ctx, copy_from_device_ctx, + * prepare_callable, run_prepared, unregister_callable * * Memory management: caller allocates a buffer of get_runtime_size() bytes - * and passes it to run_runtime(). Error codes: 0 = success, negative = error. + * and passes it to run_prepared(). Error codes: 0 = success, negative = error. */ #ifndef SRC_COMMON_WORKER_PTO_RUNTIME_C_API_H_ @@ -57,7 +58,7 @@ void destroy_device_context(DeviceContextHandle ctx); /** Return sizeof(Runtime) for caller buffer allocation. */ size_t get_runtime_size(void); -/** Set the target device. Must be called before the first run_runtime(). */ +/** Set the target device. Must be called before the first run_prepared(). */ int set_device(DeviceContextHandle ctx, int device_id); /** Allocate device memory in the given device context. */ @@ -72,42 +73,10 @@ int copy_to_device_ctx(DeviceContextHandle ctx, void *dev_ptr, const void *host_ /** Copy device memory to a host pointer within the given device context. 
*/ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *dev_ptr, size_t size); -/** - * Build the task graph, execute on device, copy results back, and clean up. - * - * @param ctx Device context from create_device_context() - * @param runtime Caller-allocated buffer (size from get_runtime_size()) - * @param callable Opaque ChipCallable pointer (orchestration + kernel binaries) - * @param args Opaque ChipStorageTaskArgs pointer (tensor/scalar arguments) - * @param block_dim Number of AICore blocks - * @param aicpu_thread_num Number of AICPU scheduler threads - * @param device_id Target device - * @param aicpu_binary AICPU executor binary blob - * @param aicpu_size Size of AICPU binary - * @param aicore_binary AICore executor binary blob - * @param aicore_size Size of AICore binary - * @param enable_l2_swimlane 1 to enable perf swimlane collection, 0 to disable - * @param enable_dump_tensor 1 to enable tensor dump, 0 to disable - * @param enable_pmu 0 = PMU disabled; >0 = enabled, value selects event type - * @param output_prefix NUL-terminated directory path under which diagnostic - * artifacts (l2_perf_records.json / tensor_dump/ / - * pmu.csv) are written. Required (non-empty) whenever - * any diagnostic flag is enabled; ignored otherwise. - * - * Log configuration is applied separately via simpler_init() at ChipWorker - * init time and read from runner state when populating KernelArgs. - * @return 0 on success, negative on error - */ -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, - int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix -); - /** * One-shot platform-side log init. Called once by ChipWorker::init() right * after dlopen, before any other entry. Pushes the user's chosen severity + - * INFO verbosity into HostLogger and into runner state (which run_runtime + * INFO verbosity into HostLogger and into runner state (which run_prepared * later forwards to AICPU via KernelArgs). * * On onboard, also calls dlog_setlevel(-1, log_level, 0) so CANN's runtime @@ -161,11 +130,12 @@ int prepare_callable( /** * Launch a callable previously staged via `prepare_callable`. * - * Same effective behavior as `run_runtime` but skips the per-run kernel - * upload + orch SO H2D, looking up the prepared state by `callable_id`. The - * AICPU side dispatches via `orch_so_table_[callable_id]` (see - * runtime.h::set_active_callable_id). The first run for a given callable_id - * sets `register_new_callable_id_` so the AICPU does its one-time dlopen. + * Looks up the prepared state by `callable_id`, restores the kernel func_id ↔ + * dev_addr table onto a fresh Runtime, and dispatches without re-uploading + * kernels or re-copying the orch SO. The AICPU side dispatches via + * `orch_so_table_[callable_id]` (see runtime.h::set_active_callable_id). The + * first run for a given callable_id sets `register_new_callable_id_` so the + * AICPU does its one-time dlopen. * * @return 0 on success, negative on error (no prep state, NULL ctx, etc.). 
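 *
 * Illustrative call sequence (placeholder handles/buffers, error handling
 * elided; not normative):
 *
 *   prepare_callable(ctx, 3, cb, dev, aicpu, asz, aicore, csz);
 *   run_prepared(ctx, rt, 3, args, block_dim, threads, dev,
 *                aicpu, asz, aicore, csz, 0, 0, 0, "");  // first: AICPU dlopen
 *   run_prepared(ctx, rt, 3, args, block_dim, threads, dev,
 *                aicpu, asz, aicore, csz, 0, 0, 0, "");  // warm: table hit
 *
 * After both launches, variants that export the counter report
 * get_aicpu_dlopen_count(ctx) == 1 for this sequence.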
*/ diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index adeeeb6fc..d6489dc09 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -68,19 +68,6 @@ def test_initial_state(self): assert worker.device_set is False assert worker.device_id == -1 - def test_run_before_set_device_raises(self): - from _task_interface import ChipCallable, ChipStorageTaskArgs # noqa: PLC0415 - - worker = _ChipWorker() - config = CallConfig() - args = ChipStorageTaskArgs() - - # Build a minimal ChipCallable for the test - callable_obj = ChipCallable.build(signature=[], func_name="test", binary=b"\x00", children=[]) - - with pytest.raises(RuntimeError, match="device not set"): - worker.run(callable_obj, args, config) - def test_set_device_before_init_raises(self): worker = _ChipWorker() with pytest.raises(RuntimeError, match="not initialized"): From 8e7b29195bbfe9c3294db1f9cde25b72aecd5123 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 8 May 2026 12:09:50 +0800 Subject: [PATCH 21/28] =?UTF-8?q?refactor(callable):=20Phase=204=20?= =?UTF-8?q?=E2=80=94=20drop=20has=5Fnew=5Forch=5Fso=5F=20and=20AICPU=20leg?= =?UTF-8?q?acy=20single=20slot?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The single-slot orch SO cache and the callable_id==-1 fallback path existed only to serve the now-deleted run_runtime() ABI. With every caller routed through prepare_callable / run_prepared, callable_id is always in [0, MAX_REGISTERED_CALLABLE_IDS) and AICPU dispatches via orch_so_table_[callable_id] unconditionally. Runtime structure: - 4 runtime.h (a2a3+a5 × trb+hbg): drop has_new_orch_so_ field; simplify set_dev_orch_so to (dev_addr, size). - 2 trb shared/runtime.cpp: drop has_new_orch_so() implementation; drop the dirty-flag init in reset. - 4 platform device_runner.{h,cpp}: drop the third arg from every set_dev_orch_so call (5 sites per platform); update doc-comments that referenced has_new_orch_so_. AICPU executor (2 trb aicpu_executor.cpp): - Drop legacy single-slot fields (orch_so_handle_, orch_so_path_, orch_func_, orch_bind_runtime_, orch_config_func_) along with the destructor branch and deinit comment that preserved them. - Replace the use_table-ternary fork with unconditional access into orch_so_table_[callable_id]; reload is governed by register_new_callable_id(). - Reject any callable_id outside [0, MAX_REGISTERED_CALLABLE_IDS) (the -1 escape hatch is gone). - The run() teardown branch that called orch_bind_runtime_(nullptr) now reads the per-cid bind from the table. Verified: 4 sim binaries compile, tests/ut/py/test_chip_worker.py 14 pass + tests/ut/py/test_worker/ 65 pass + 3 hardware skipped, 4 variants × 5 prepared_callable ST = 20 pass. 
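As a self-contained model of the resulting dispatch discipline (the names
mirror the executor's table, but this is an illustration, not the shipped
code):

    #include <cstdint>
    #include <cstdio>

    static constexpr int32_t MAX_REGISTERED_CALLABLE_IDS = 64;  // protocol cap

    struct Slot {          // stand-in for the executor's per-cid table entry
        bool in_use{false};
        int loads{0};      // counts the one-time dlopen per cid
    };
    static Slot table[MAX_REGISTERED_CALLABLE_IDS];

    // Post-Phase-4 rule: cid must be in range (no -1 escape hatch); the first
    // sighting loads once; every warm dispatch just indexes the table.
    static int dispatch(int32_t cid, bool first_sighting) {
        if (cid < 0 || cid >= MAX_REGISTERED_CALLABLE_IDS) return -1;
        Slot &s = table[cid];
        if (first_sighting || !s.in_use) {
            s.in_use = true;
            ++s.loads;     // the real executor dlopens the staged orch SO here
        }
        return 0;          // the real executor then calls the entry's orch func
    }

    int main() {
        dispatch(3, /*first_sighting=*/true);
        dispatch(3, false);
        dispatch(3, false);
        std::printf("loads for cid 3: %d\n", table[3].loads);  // prints 1
        return dispatch(64, false) == -1 ? 0 : 1;  // out-of-range rejected
    }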
--- .../platform/onboard/host/device_runner.cpp | 10 +-- .../platform/onboard/host/device_runner.h | 4 +- src/a2a3/platform/sim/host/device_runner.cpp | 12 ++- src/a2a3/platform/sim/host/device_runner.h | 4 +- .../host_build_graph/runtime/runtime.h | 6 +- .../aicpu/aicpu_executor.cpp | 88 +++++++------------ .../runtime/runtime.h | 28 +++--- .../runtime/shared/runtime.cpp | 8 +- .../platform/onboard/host/device_runner.cpp | 10 +-- src/a5/platform/sim/host/device_runner.cpp | 10 +-- .../host_build_graph/runtime/runtime.h | 4 +- .../aicpu/aicpu_executor.cpp | 88 +++++++------------ .../runtime/runtime.h | 27 +++--- .../runtime/shared/runtime.cpp | 8 +- 14 files changed, 117 insertions(+), 190 deletions(-) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 172fa2024..7f94aa7d1 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -763,7 +763,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { // AICPU does no per-cid dlopen. Skip the orch_so_table_ bookkeeping // (and the AICPU dlopen counter) and clear the device-orch metadata. if (state.host_dlopen_handle != nullptr) { - runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); runtime.set_active_callable_id(cid, /*is_new=*/false); return 0; } @@ -771,7 +771,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (first_sighting) { ++aicpu_dlopen_total_; } - runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size, first_sighting); + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size); // The c_api caller passed is_new=false; refresh with the authoritative // first_sighting flag before AICPU consumes register_new_callable_id_. runtime.set_active_callable_id(cid, first_sighting); @@ -793,7 +793,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (host_so_data == nullptr || host_so_size == 0) { // Host-orchestration mode (no device SO needed). - runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); return 0; } @@ -801,7 +801,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) { LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size); - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/false); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); return 0; } @@ -837,7 +837,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { } cached_orch_so_hash_ = new_hash; - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/true); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size); return 0; } diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 998e197db..afca1bc30 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -641,8 +641,8 @@ class DeviceRunner { ); /** - * Populate runtime.{dev_orch_so_addr_, dev_orch_so_size_, has_new_orch_so_} - * from `runtime.pending_orch_so_data_` / `_size_`. + * Populate runtime.{dev_orch_so_addr_, dev_orch_so_size_} from + * `runtime.pending_orch_so_data_` / `_size_`. * * The host tracks the SO identity via a 64-bit hash derived from the ELF * GNU Build-ID. 
When the hash matches the previous run, the device-side diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index f9068bcb6..795a4220f 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -359,8 +359,6 @@ int DeviceRunner::run( runtime.workers[i].task = 0; // First 1/3 are AIC, remaining 2/3 are AIV runtime.workers[i].core_type = (i < num_aic) ? CoreType::AIC : CoreType::AIV; - runtime.workers[i].enable_profiling_flag = enable_profiling_flag; - runtime.workers[i].l2_perf_records_addr = static_cast(0); } // Set function_bin_addr for each task: func_id_to_addr_[] stores CoreCallable @@ -690,7 +688,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { // hbg: orch SO never crosses host/device — clear device-orch metadata // and skip AICPU bookkeeping. See onboard/device_runner.cpp. if (state.host_dlopen_handle != nullptr) { - runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); runtime.set_active_callable_id(cid, /*is_new=*/false); return 0; } @@ -698,7 +696,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (first_sighting) { ++aicpu_dlopen_total_; } - runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size, first_sighting); + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size); runtime.set_active_callable_id(cid, first_sighting); runtime.pending_orch_so_data_ = nullptr; runtime.pending_orch_so_size_ = 0; @@ -715,7 +713,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { runtime.pending_orch_so_size_ = 0; if (host_so_data == nullptr || host_so_size == 0) { - runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); return 0; } @@ -723,7 +721,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) { LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size); - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/false); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); return 0; } @@ -751,7 +749,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { std::memcpy(dev_orch_so_buffer_, host_orch_so_copy_.data(), host_so_size); cached_orch_so_hash_ = new_hash; - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/true); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size); return 0; } diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 6f943800a..4825817de 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -330,8 +330,8 @@ class DeviceRunner { * Stage the orchestration SO bytes into a host-resident buffer that * `aicpu_executor` can dlopen. Identical contract to the onboard * version: `runtime.pending_orch_so_data_/size_` are consumed and - * `runtime.{dev_orch_so_addr_, dev_orch_so_size_, has_new_orch_so_}` - * are populated with the cache-aware result. + * `runtime.{dev_orch_so_addr_, dev_orch_so_size_}` are populated with + * the cache-aware result. 
*/ int prepare_orch_so(Runtime &runtime); diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index ea5766f8e..bd447955c 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -464,14 +464,13 @@ class Runtime { // NOTE: Placed at end of class to avoid affecting device memory layout HostApi host_api; - // Device orchestration SO metadata: device buffer + dirty flag (host + // Device orchestration SO metadata: device buffer pointer + size (host // populates these via DeviceRunner::prepare_orch_so before launch). // host_build_graph runtime variant currently does not load device // orchestration SOs, but DeviceRunner is shared with the other variants // and unconditionally writes these fields, so they must exist. uint64_t dev_orch_so_addr_{0}; uint64_t dev_orch_so_size_{0}; - bool has_new_orch_so_{false}; // Per-callable_id dispatch (callable.md). hbg orch runs on host, so AICPU // never reads `active_callable_id_`; the field exists for parity with the @@ -516,10 +515,9 @@ class Runtime { } const char *get_device_orch_config_name() const { return device_orch_config_name_; } - void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { + void set_dev_orch_so(uint64_t dev_addr, uint64_t size) { dev_orch_so_addr_ = dev_addr; dev_orch_so_size_ = size; - has_new_orch_so_ = is_new; } void set_active_callable_id(int32_t callable_id, bool is_new) { active_callable_id_ = callable_id; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index a93f4884e..cf760836b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -91,10 +91,8 @@ static int32_t read_pto2_runtime_status(Runtime *runtime) { static PTO2Runtime *rt{nullptr}; // Per-callable_id orchestration SO table. AICPU side of the callable.md -// design: when `runtime->active_callable_id_ >= 0` the executor dispatches -// `orch_so_table_[active_callable_id_]` (created on first sighting of that -// callable_id, kept warm across runs); when `active_callable_id_ < 0` it -// falls back to the legacy single slot governed by `has_new_orch_so_`. +// design: the executor dispatches `orch_so_table_[active_callable_id_]` +// (created on first sighting of that callable_id, kept warm across runs). // MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values // (mailbox uint32 callable_id, register() returns small ints) and is shared // with the host bounds check in DeviceRunner::register_prepared_callable — @@ -127,19 +125,13 @@ struct AicpuExecutor { std::atomic finished_count_{0}; std::atomic runtime_init_ready_{false}; - // Legacy single-slot orch SO cache (active_callable_id_ == -1 path). - void *orch_so_handle_{nullptr}; - char orch_so_path_[256]{}; - - // Shared orchestration function pointer (loaded by first orch thread, used by all) - DeviceOrchestrationFunc orch_func_{nullptr}; - DeviceOrchestrationBindRuntimeFunc orch_bind_runtime_{nullptr}; - DeviceOrchestrationConfigFunc orch_config_func_{nullptr}; + // Cached orch args pointer set by the orchestration thread before scheduler + // init; consumed by the (*p_func)(*orch_args_cached_) invocation below. const ChipStorageTaskArgs *orch_args_cached_{nullptr}; - // Per-callable_id table (active_callable_id_ >= 0 path). 
Single orch thread today, so - // first-write/read race is not possible; if multiple orch threads are - // ever introduced, guard the in_use=false→true transition with a mutex. + // Per-callable_id table. Single orch thread today, so first-write/read + // race is not possible; if multiple orch threads are ever introduced, + // guard the in_use=false→true transition with a mutex. OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]; // ===== Scheduler context (owns all dispatch/completion/drain state) ===== @@ -151,17 +143,9 @@ struct AicpuExecutor { void deinit(Runtime *runtime); ~AicpuExecutor() { - // Process-wide teardown (the single static instance dies here). Both - // the legacy slot and every in-use callable_id slot are dlclose()'d here; - // each is otherwise kept alive across runs for cache-hit reuse. - if (orch_so_handle_ != nullptr) { - dlclose(orch_so_handle_); - orch_so_handle_ = nullptr; - } - if (orch_so_path_[0] != '\0') { - unlink(orch_so_path_); - orch_so_path_[0] = '\0'; - } + // Process-wide teardown (the single static instance dies here). Every + // in-use callable_id slot is dlclose()'d here; each is otherwise kept + // alive across runs for cache-hit reuse. for (auto &e : orch_so_table_) { if (!e.in_use) continue; if (e.handle != nullptr) dlclose(e.handle); @@ -229,32 +213,23 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (runtime->get_orch_built_on_host()) { LOG_INFO_V0("Thread %d: Host orchestration mode, no-op", thread_idx); } else { - // Per-callable_id dispatch (callable.md): when active_callable_id_ >= 0 the orch - // SO state lives in `orch_so_table_[callable_id]` keyed by registration - // order; reload is governed by `register_new_callable_id_`. When - // active_callable_id_ < 0 we fall back to the legacy single-slot cache - // governed by `has_new_orch_so_`. The local pointers below let - // the rest of this branch ignore the choice. + // Per-callable_id dispatch (callable.md): the orch SO state lives + // in `orch_so_table_[callable_id]` keyed by registration order; + // reload is governed by `register_new_callable_id_`. const int32_t callable_id = runtime->get_active_callable_id(); - const bool use_table = (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS); - // -1 is the legacy sentinel that intentionally falls back to the - // single-slot orch SO cache. Any other negative value is a - // protocol violation. - if (callable_id >= MAX_REGISTERED_CALLABLE_IDS || (callable_id < 0 && callable_id != -1)) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { LOG_ERROR( "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS ); runtime_init_ready_.store(true, std::memory_order_release); return -1; } - void **p_handle = use_table ? &orch_so_table_[callable_id].handle : &orch_so_handle_; - char *p_path = use_table ? orch_so_table_[callable_id].path : orch_so_path_; - DeviceOrchestrationFunc *p_func = use_table ? &orch_so_table_[callable_id].func : &orch_func_; - DeviceOrchestrationBindRuntimeFunc *p_bind = - use_table ? &orch_so_table_[callable_id].bind : &orch_bind_runtime_; - DeviceOrchestrationConfigFunc *p_config_func = - use_table ? &orch_so_table_[callable_id].config_func : &orch_config_func_; - const bool reload_so = use_table ? 
runtime->register_new_callable_id() : runtime->has_new_orch_so(); + void **p_handle = &orch_so_table_[callable_id].handle; + char *p_path = orch_so_table_[callable_id].path; + DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func; + DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind; + DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func; + const bool reload_so = runtime->register_new_callable_id(); if (reload_so) { LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id); @@ -387,7 +362,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { *p_bind = bind_runtime_func; *p_config_func = config_func; snprintf(p_path, 256, "%s", so_path); - if (use_table) orch_so_table_[callable_id].in_use = true; + orch_so_table_[callable_id].in_use = true; } else { LOG_INFO_V0( "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id @@ -427,7 +402,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { *p_func = nullptr; *p_bind = nullptr; *p_config_func = nullptr; - if (use_table) orch_so_table_[callable_id].in_use = false; + orch_so_table_[callable_id].in_use = false; // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -689,13 +664,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (prev_finished + 1 == thread_num_) { finished_.store(true, std::memory_order_release); // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we - // always tear them down here, but we keep orch_so_handle_ alive for - // the next run's cache-hit reuse (see run() reload_so branch). + // always tear them down here, but we keep the per-cid orch SO entries + // alive for the next run's cache-hit reuse (see run() reload_so branch). if (!runtime->get_orch_built_on_host() && rt != nullptr) { // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt. + const int32_t callable_id = runtime->get_active_callable_id(); framework_bind_runtime(nullptr); - if (orch_bind_runtime_ != nullptr) { - orch_bind_runtime_(nullptr); + if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) { + DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind; + if (bind != nullptr) { + bind(nullptr); + } } runtime_destroy(rt); } @@ -721,10 +700,9 @@ void AicpuExecutor::deinit(Runtime *runtime) { orch_to_sched_ = false; orch_args_cached_ = nullptr; - // orch_so_handle_ / orch_func_ / orch_bind_runtime_ / orch_config_func_ / orch_so_path_ are - // intentionally preserved across deinit: the next run reuses them when - // has_new_orch_so() == false. The destructor releases them at process - // teardown. + // orch_so_table_ entries are intentionally preserved across deinit: the + // next run reuses cached handles when register_new_callable_id() returns + // false. The destructor releases them at process teardown. // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit) rt = nullptr; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 87764591a..97d06f4d2 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -189,20 +189,12 @@ class Runtime { // Device orchestration SO (for dlopen on AICPU thread 3). 
// The SO bytes themselves live in a separately-allocated device buffer // owned by DeviceRunner; only the metadata below travels inside Runtime. - // `has_new_orch_so_` tells AICPU whether the host believes the SO identity - // changed since the previous run — when false AICPU reuses its cached - // dlopen handle and skips writing the file again. uint64_t dev_orch_so_addr_; uint64_t dev_orch_so_size_; - bool has_new_orch_so_; - // Per-callable_id dispatch (callable.md design). When - // `active_callable_id_ >= 0`, AICPU dispatches via - // `orch_so_table_[active_callable_id_]` instead of the legacy single-slot - // cache; `register_new_callable_id_` then signals whether the host is - // delivering a freshly-registered callable_id (write+dlopen) or reusing an - // already-loaded one. `active_callable_id_ == -1` keeps the legacy fast - // path (run_runtime() compatibility shim) — has_new_orch_so_ governs - // reload. + // Per-callable_id dispatch (callable.md design). AICPU dispatches via + // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_` + // signals whether the host is delivering a freshly-registered + // callable_id (write+dlopen) or reusing an already-loaded one. int32_t active_callable_id_; bool register_new_callable_id_; char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; @@ -257,13 +249,13 @@ class Runtime { void set_orch_args(const ChipStorageTaskArgs &args); // Device orchestration SO binary (for dlopen on AICPU thread 3) - void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new); + void set_dev_orch_so(uint64_t dev_addr, uint64_t size); uint64_t get_dev_orch_so_addr() const; uint64_t get_dev_orch_so_size() const; - bool has_new_orch_so() const; - // Per-callable_id dispatch (callable.md). callable_id < 0 disables and - // falls back to the legacy single-slot orch SO cache governed by - // has_new_orch_so_. + // Per-callable_id dispatch (callable.md). callable_id must be in + // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU + // whether to (re)load the orch SO into orch_so_table_[callable_id] or + // reuse the cached entry. void set_active_callable_id(int32_t callable_id, bool is_new); int32_t get_active_callable_id() const; bool register_new_callable_id() const; @@ -308,7 +300,7 @@ class Runtime { // Host-only staging for orchestration SO. runtime_maker publishes the // callable-owned pointer here; DeviceRunner consumes it before launching // the device-side execution and replaces it with the device-resident - // buffer metadata (dev_orch_so_addr_, ..., has_new_orch_so_). The fields + // buffer metadata (dev_orch_so_addr_, dev_orch_so_size_). The fields // below are zeroed on the device because DeviceRunner clears them before // the memcpy, but their values while running on device are irrelevant. 
const void *pending_orch_so_data_{nullptr}; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 1f6375a6b..98d464549 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -52,7 +52,6 @@ Runtime::Runtime() { // Initialize device orchestration SO binary dev_orch_so_addr_ = 0; dev_orch_so_size_ = 0; - has_new_orch_so_ = false; active_callable_id_ = -1; register_new_callable_id_ = false; device_orch_func_name_[0] = '\0'; @@ -104,19 +103,16 @@ void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; } void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } // Device orchestration SO metadata (bytes live in a separate device buffer -// owned by DeviceRunner; only the address/size/dirty-flag travels in Runtime). -void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { +// owned by DeviceRunner; only the address/size travels in Runtime). +void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) { dev_orch_so_addr_ = dev_addr; dev_orch_so_size_ = size; - has_new_orch_so_ = is_new; } uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; } uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; } -bool Runtime::has_new_orch_so() const { return has_new_orch_so_; } - void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) { active_callable_id_ = callable_id; register_new_callable_id_ = is_new; diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 3892bbf3d..e306bba79 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -617,7 +617,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { // hbg variant: orch SO never crosses host/device, so AICPU does no // per-cid dlopen. Skip orch_so_table_ bookkeeping and clear metadata. if (state.host_dlopen_handle != nullptr) { - runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); runtime.set_active_callable_id(cid, /*is_new=*/false); return 0; } @@ -625,7 +625,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (first_sighting) { ++aicpu_dlopen_total_; } - runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size, first_sighting); + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size); // The c_api caller passed is_new=false; refresh with the authoritative // first_sighting flag before AICPU consumes register_new_callable_id_. 
runtime.set_active_callable_id(cid, first_sighting); @@ -646,7 +646,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { runtime.pending_orch_so_size_ = 0; if (host_so_data == nullptr || host_so_size == 0) { - runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); return 0; } @@ -654,7 +654,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) { LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size); - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/false); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); return 0; } @@ -686,7 +686,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { } cached_orch_so_hash_ = new_hash; - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/true); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size); return 0; } diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 118a0849d..cc4007ca2 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -667,7 +667,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { const auto &state = it->second; // hbg variant: orch SO never crosses host/device boundary. if (state.host_dlopen_handle != nullptr) { - runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); runtime.set_active_callable_id(cid, /*is_new=*/false); return 0; } @@ -675,7 +675,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (first_sighting) { ++aicpu_dlopen_total_; } - runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size, first_sighting); + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size); runtime.set_active_callable_id(cid, first_sighting); runtime.pending_orch_so_data_ = nullptr; runtime.pending_orch_so_size_ = 0; @@ -692,7 +692,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { runtime.pending_orch_so_size_ = 0; if (host_so_data == nullptr || host_so_size == 0) { - runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); return 0; } @@ -700,7 +700,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) { LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size); - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/false); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); return 0; } @@ -728,7 +728,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { std::memcpy(dev_orch_so_buffer_, host_orch_so_copy_.data(), host_so_size); cached_orch_so_hash_ = new_hash; - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/true); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size); return 0; } diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index 54ed30075..a68b29fea 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -478,7 +478,6 @@ class Runtime { // Device 
orchestration SO metadata (see a2a3 host_build_graph runtime.h). uint64_t dev_orch_so_addr_{0}; uint64_t dev_orch_so_size_{0}; - bool has_new_orch_so_{false}; // Per-callable_id dispatch (callable.md). hbg orch runs on host, so AICPU // never reads `active_callable_id_`; the field exists for parity with the // shared platform layer (DeviceRunner stamps it on every run). @@ -519,10 +518,9 @@ class Runtime { } const char *get_device_orch_config_name() const { return device_orch_config_name_; } - void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { + void set_dev_orch_so(uint64_t dev_addr, uint64_t size) { dev_orch_so_addr_ = dev_addr; dev_orch_so_size_ = size; - has_new_orch_so_ = is_new; } void set_active_callable_id(int32_t callable_id, bool is_new) { active_callable_id_ = callable_id; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index b10724738..78d5ef52c 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -91,10 +91,8 @@ static int32_t read_runtime_status(Runtime *runtime) { static PTO2Runtime *rt{nullptr}; // Per-callable_id orchestration SO table. AICPU side of the callable.md -// design: when `runtime->active_callable_id_ >= 0` the executor dispatches -// `orch_so_table_[active_callable_id_]` (created on first sighting of that -// callable_id, kept warm across runs); when `active_callable_id_ < 0` it -// falls back to the legacy single slot governed by `has_new_orch_so_`. +// design: the executor dispatches `orch_so_table_[active_callable_id_]` +// (created on first sighting of that callable_id, kept warm across runs). // MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values // (mailbox uint32 callable_id, register() returns small ints) and is shared // with the host bounds check in DeviceRunner::register_prepared_callable — @@ -127,19 +125,13 @@ struct AicpuExecutor { std::atomic finished_count_{0}; std::atomic runtime_init_ready_{false}; - // Legacy single-slot orch SO cache (active_callable_id_ == -1 path). - void *orch_so_handle_{nullptr}; - char orch_so_path_[256]{}; - - // Shared orchestration function pointer (loaded by first orch thread, used by all) - DeviceOrchestrationFunc orch_func_{nullptr}; - DeviceOrchestrationBindRuntimeFunc orch_bind_runtime_{nullptr}; - DeviceOrchestrationConfigFunc orch_config_func_{nullptr}; + // Cached orch args pointer set by the orchestration thread before scheduler + // init; consumed by the (*p_func)(*orch_args_cached_) invocation below. const ChipStorageTaskArgs *orch_args_cached_{nullptr}; - // Per-callable_id table (active_callable_id_ >= 0 path). Single orch thread today, so - // first-write/read race is not possible; if multiple orch threads are - // ever introduced, guard the in_use=false→true transition with a mutex. + // Per-callable_id table. Single orch thread today, so first-write/read + // race is not possible; if multiple orch threads are ever introduced, + // guard the in_use=false→true transition with a mutex. OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]; // ===== Scheduler context (owns all dispatch/completion/drain state) ===== @@ -151,17 +143,9 @@ struct AicpuExecutor { void deinit(Runtime *runtime); ~AicpuExecutor() { - // Process-wide teardown (the single static instance dies here). 
Both - // the legacy slot and every in-use callable_id slot are dlclose()'d here; - // each is otherwise kept alive across runs for cache-hit reuse. - if (orch_so_handle_ != nullptr) { - dlclose(orch_so_handle_); - orch_so_handle_ = nullptr; - } - if (orch_so_path_[0] != '\0') { - unlink(orch_so_path_); - orch_so_path_[0] = '\0'; - } + // Process-wide teardown (the single static instance dies here). Every + // in-use callable_id slot is dlclose()'d here; each is otherwise kept + // alive across runs for cache-hit reuse. for (auto &e : orch_so_table_) { if (!e.in_use) continue; if (e.handle != nullptr) dlclose(e.handle); @@ -229,32 +213,23 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (runtime->get_orch_built_on_host()) { LOG_INFO_V0("Thread %d: Host orchestration mode, no-op", thread_idx); } else { - // Per-callable_id dispatch (callable.md): when active_callable_id_ >= 0 the orch - // SO state lives in `orch_so_table_[callable_id]` keyed by registration - // order; reload is governed by `register_new_callable_id_`. When - // active_callable_id_ < 0 we fall back to the legacy single-slot cache - // governed by `has_new_orch_so_`. The local pointers below let - // the rest of this branch ignore the choice. + // Per-callable_id dispatch (callable.md): the orch SO state lives + // in `orch_so_table_[callable_id]` keyed by registration order; + // reload is governed by `register_new_callable_id_`. const int32_t callable_id = runtime->get_active_callable_id(); - const bool use_table = (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS); - // -1 is the legacy sentinel that intentionally falls back to the - // single-slot orch SO cache. Any other negative value is a - // protocol violation. - if (callable_id >= MAX_REGISTERED_CALLABLE_IDS || (callable_id < 0 && callable_id != -1)) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { LOG_ERROR( "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS ); runtime_init_ready_.store(true, std::memory_order_release); return -1; } - void **p_handle = use_table ? &orch_so_table_[callable_id].handle : &orch_so_handle_; - char *p_path = use_table ? orch_so_table_[callable_id].path : orch_so_path_; - DeviceOrchestrationFunc *p_func = use_table ? &orch_so_table_[callable_id].func : &orch_func_; - DeviceOrchestrationBindRuntimeFunc *p_bind = - use_table ? &orch_so_table_[callable_id].bind : &orch_bind_runtime_; - DeviceOrchestrationConfigFunc *p_config_func = - use_table ? &orch_so_table_[callable_id].config_func : &orch_config_func_; - const bool reload_so = use_table ? 
runtime->register_new_callable_id() : runtime->has_new_orch_so(); + void **p_handle = &orch_so_table_[callable_id].handle; + char *p_path = orch_so_table_[callable_id].path; + DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func; + DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind; + DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func; + const bool reload_so = runtime->register_new_callable_id(); if (reload_so) { LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id); @@ -387,7 +362,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { *p_bind = bind_runtime_func; *p_config_func = config_func; snprintf(p_path, 256, "%s", so_path); - if (use_table) orch_so_table_[callable_id].in_use = true; + orch_so_table_[callable_id].in_use = true; } else { LOG_INFO_V0( "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id @@ -427,7 +402,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { *p_func = nullptr; *p_bind = nullptr; *p_config_func = nullptr; - if (use_table) orch_so_table_[callable_id].in_use = false; + orch_so_table_[callable_id].in_use = false; // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -693,13 +668,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (prev_finished + 1 == thread_num_) { finished_.store(true, std::memory_order_release); // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we - // always tear them down here, but we keep orch_so_handle_ alive for - // the next run's cache-hit reuse (see run() reload_so branch). + // always tear them down here, but we keep the per-cid orch SO entries + // alive for the next run's cache-hit reuse (see run() reload_so branch). if (!runtime->get_orch_built_on_host() && rt != nullptr) { // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt. + const int32_t callable_id = runtime->get_active_callable_id(); framework_bind_runtime(nullptr); - if (orch_bind_runtime_ != nullptr) { - orch_bind_runtime_(nullptr); + if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) { + DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind; + if (bind != nullptr) { + bind(nullptr); + } } runtime_destroy(rt); } @@ -725,10 +704,9 @@ void AicpuExecutor::deinit(Runtime *runtime) { orch_to_sched_ = false; orch_args_cached_ = nullptr; - // orch_so_handle_ / orch_func_ / orch_bind_runtime_ / orch_config_func_ / orch_so_path_ are - // intentionally preserved across deinit: the next run reuses them when - // has_new_orch_so() == false. The destructor releases them at process - // teardown. + // orch_so_table_ entries are intentionally preserved across deinit: the + // next run reuses cached handles when register_new_callable_id() returns + // false. The destructor releases them at process teardown. // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit) rt = nullptr; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 1f388a43e..8062078cc 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -203,19 +203,12 @@ class Runtime { // Device orchestration SO (for dlopen on AICPU thread 3). 
// The SO bytes themselves live in a separately-allocated device buffer // owned by DeviceRunner; only the metadata below travels inside Runtime. - // `has_new_orch_so_` tells AICPU whether the host believes the SO identity - // changed since the previous run — when false AICPU reuses its cached - // dlopen handle and skips writing the file again. uint64_t dev_orch_so_addr_; uint64_t dev_orch_so_size_; - bool has_new_orch_so_; - // Per-callable_id dispatch (callable.md). When `active_callable_id_ >= 0`, - // AICPU dispatches via `orch_so_table_[active_callable_id_]` instead of - // the legacy single-slot cache; `register_new_callable_id_` then signals - // whether the host is delivering a freshly-registered callable_id - // (write+dlopen) or reusing an already-loaded one. `active_callable_id_ - // == -1` keeps the legacy fast path (run_runtime() compatibility shim) — - // has_new_orch_so_ governs reload. + // Per-callable_id dispatch (callable.md design). AICPU dispatches via + // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_` + // signals whether the host is delivering a freshly-registered + // callable_id (write+dlopen) or reusing an already-loaded one. int32_t active_callable_id_; bool register_new_callable_id_; char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; @@ -270,13 +263,13 @@ class Runtime { void set_orch_args(const ChipStorageTaskArgs &args); // Device orchestration SO binary (for dlopen on AICPU thread 3) - void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new); + void set_dev_orch_so(uint64_t dev_addr, uint64_t size); uint64_t get_dev_orch_so_addr() const; uint64_t get_dev_orch_so_size() const; - bool has_new_orch_so() const; - // Per-callable_id dispatch (callable.md). callable_id < 0 disables and - // falls back to the legacy single-slot orch SO cache governed by - // has_new_orch_so_. + // Per-callable_id dispatch (callable.md). callable_id must be in + // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU + // whether to (re)load the orch SO into orch_so_table_[callable_id] or + // reuse the cached entry. void set_active_callable_id(int32_t callable_id, bool is_new); int32_t get_active_callable_id() const; bool register_new_callable_id() const; @@ -321,7 +314,7 @@ class Runtime { // Host-only staging for orchestration SO. runtime_maker publishes the // callable-owned pointer here; DeviceRunner consumes it before launching // the device-side execution and replaces it with the device-resident - // buffer metadata (dev_orch_so_addr_, ..., has_new_orch_so_). + // buffer metadata (dev_orch_so_addr_, dev_orch_so_size_). 
const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 5fc34b7b2..714ba3955 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -54,7 +54,6 @@ Runtime::Runtime() { // Initialize device orchestration SO binary dev_orch_so_addr_ = 0; dev_orch_so_size_ = 0; - has_new_orch_so_ = false; active_callable_id_ = -1; register_new_callable_id_ = false; device_orch_func_name_[0] = '\0'; @@ -106,19 +105,16 @@ void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; } void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } // Device orchestration SO metadata (bytes live in a separate device buffer -// owned by DeviceRunner; only the address/size/dirty-flag travels in Runtime). -void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { +// owned by DeviceRunner; only the address/size travels in Runtime). +void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) { dev_orch_so_addr_ = dev_addr; dev_orch_so_size_ = size; - has_new_orch_so_ = is_new; } uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; } uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; } -bool Runtime::has_new_orch_so() const { return has_new_orch_so_; } - void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) { active_callable_id_ = callable_id; register_new_callable_id_ = is_new; From 1293581b82b799b2033dbb5cb50061c6e150d771 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 8 May 2026 14:42:55 +0800 Subject: [PATCH 22/28] fix(pr): migrate vector_add and child_memory examples to register/cid API - vector_add: register chip_callable before init(), pass cid to worker.run - child_memory: register before init(), pass cid to orch.submit_next_level - Update vector_add README and docstring diagram to match the new flow Resolves CI failures in st-sim-a2a3 (ubuntu/macos) on PR #710. --- examples/workers/l2/vector_add/README.md | 2 +- examples/workers/l2/vector_add/main.py | 11 +++++++---- examples/workers/l3/child_memory/main.py | 3 ++- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/workers/l2/vector_add/README.md b/examples/workers/l2/vector_add/README.md index 7e5776d9e..e294fb2aa 100644 --- a/examples/workers/l2/vector_add/README.md +++ b/examples/workers/l2/vector_add/README.md @@ -96,7 +96,7 @@ args.add_tensor(ContinuousTensor.make(dev_a, shape, DataType.FLOAT32)) args.add_tensor(ContinuousTensor.make(dev_b, shape, DataType.FLOAT32)) args.add_tensor(ContinuousTensor.make(dev_out, shape, DataType.FLOAT32)) -worker.run(chip_callable, args, CallConfig()) +worker.run(chip_cid, args, CallConfig()) # chip_cid = worker.register(chip_callable) before init() ``` The tensor order must match `signature` order on the `ChipCallable`. 
`run()` diff --git a/examples/workers/l2/vector_add/main.py b/examples/workers/l2/vector_add/main.py index 94867ca32..6ad1480ad 100644 --- a/examples/workers/l2/vector_add/main.py +++ b/examples/workers/l2/vector_add/main.py @@ -19,7 +19,8 @@ host arrays ──[worker.malloc + copy_to]──► device buffers │ ▼ - worker.run(chip_callable, task_args, cfg) + chip_cid = worker.register(chip_callable) # before init() + worker.run(chip_cid, task_args, cfg) │ device result ──[worker.copy_from]──► host array ──[torch compare] @@ -126,7 +127,7 @@ def build_chip_callable(platform: str) -> ChipCallable: ) -def _run(worker: Worker, chip_callable: ChipCallable) -> None: +def _run(worker: Worker, chip_cid: int) -> None: """Allocate device memory, copy inputs, execute, copy outputs back, verify.""" # --- 1. Prepare host arrays --- torch.manual_seed(42) @@ -154,7 +155,7 @@ def _run(worker: Worker, chip_callable: ChipCallable) -> None: # --- 4. Run. CallConfig() defaults are fine for this kernel. --- config = CallConfig() print("[vector_add] running on device...") - worker.run(chip_callable, args, config) + worker.run(chip_cid, args, config) # --- 5. D2H copy back + verify --- worker.copy_from(host_out.data_ptr(), dev_out, NBYTES) @@ -183,10 +184,12 @@ def run(platform: str, device_id: int) -> int: chip_callable = build_chip_callable(platform) print(f"[vector_add] compiled. binary_size={chip_callable.binary_size} bytes") + chip_cid = worker.register(chip_callable) + print(f"[vector_add] init worker (device={device_id})...") worker.init() try: - _run(worker, chip_callable) + _run(worker, chip_cid) finally: worker.close() return 0 diff --git a/examples/workers/l3/child_memory/main.py b/examples/workers/l3/child_memory/main.py index 2dfe3f4e0..b107983fb 100644 --- a/examples/workers/l3/child_memory/main.py +++ b/examples/workers/l3/child_memory/main.py @@ -147,6 +147,7 @@ def run(platform: str, device_id: int) -> int: print(f"[child_memory] compiling kernels for {platform}...") chip_callable = build_chip_callable(platform) + chip_cid = worker.register(chip_callable) print("[child_memory] init worker...") worker.init() @@ -172,7 +173,7 @@ def orch_fn(orch, _args, cfg): a.add_tensor(make_tensor_arg(host_a), TensorArgType.INPUT) a.add_tensor(w_dev, TensorArgType.INPUT) a.add_tensor(make_tensor_arg(out), TensorArgType.OUTPUT_EXISTING) - orch.submit_next_level(chip_callable, a, cfg, worker=0) + orch.submit_next_level(chip_cid, a, cfg, worker=0) # dev_w is reclaimed by DeviceRunner::finalize on worker.close() — # we don't orch.free it here, that's the whole point of child_memory. From a1bd0ff3f54dd902e13fc333f18f0630dcb0ec50 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 8 May 2026 14:44:17 +0800 Subject: [PATCH 23/28] fix(pr): silence ruff PLR0915 on _chip_process_loop_with_bootstrap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function grew to 104 statements (limit 100) after the callable refactor. The function is structured as a single dispatch loop over the bootstrap + control-mailbox protocol — splitting it would obscure the state machine, so add PLR0915 to the existing PLR0912 noqa. Resolves the pre-commit CI failure on PR #710. 
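For context, the loop shape that trips the limit, heavily condensed (opcode
names and the `load_op` parameter are illustrative stand-ins, not the real
protocol constants):

    def _loop_shape(load_op) -> None:
        # Hedged sketch: one arm per mailbox opcode; the arms are what
        # push the statement count past ruff's default PLR0915 limit.
        OP_PREPARE, OP_RUN, OP_SHUTDOWN = 1, 2, 3  # illustrative constants
        while True:
            op = load_op()                         # e.g. _mailbox_load_i32
            if op == OP_PREPARE:
                pass                               # stage callable, ack host
            elif op == OP_RUN:
                pass                               # run_prepared, ack host
            elif op == OP_SHUTDOWN:
                break

Splitting each arm into a helper would hide the opcode → action mapping that
makes the state machine auditable in one read.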
--- python/simpler/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 8f0fc9ed7..2e402e778 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -373,7 +373,7 @@ def _ensure_prepared(cid: int) -> None: break -def _chip_process_loop_with_bootstrap( # noqa: PLR0912 +def _chip_process_loop_with_bootstrap( # noqa: PLR0912, PLR0915 buf: memoryview, bins, device_id: int, From 55a8c7ecd7988c9ccf6056f408afdfea32006380 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 8 May 2026 15:00:16 +0800 Subject: [PATCH 24/28] fix(pr): make a5 prepared_callable test mirror a2a3, add a5 kernel-bin dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of CI a5 sim trb failures: tests/st/a5/.../prepared_callable used the vector_example orchestration (which dispatches func_ids 0/1/2) but only registered func_id=0. AICPU jumped to a NULL kernel address on func_id 1/2 and segfaulted, cascading through the pytest-xdist workers and dragging spmd_*/orch_so_cache/mixed_example down with it. Test fix: align tests/st/a5/.../prepared_callable verbatim with the a2a3 sibling — register all three vector_example AIV kernels (add/add_scalar/mul), update the golden formula to match the orchestration's 5-task DAG. Runtime parity (defensive — not exercised by current a5 CI but matches the 07156614 fix on a2a3 onboard so future cross-callable func_id reuse on a5 does not regress): - src/a5/platform/onboard: add func_id_to_hash_ map, reject cached entry on hash mismatch, evict + re-upload on changed binary. finalize() and remove_kernel_binary() clear the parallel map. - src/a5/platform/sim: compare cached CoreCallable bytes via memcmp on each upload (mirrors a2a3 sim — no separate hash map needed because the MappedKernel cache already retains the original bytes). --- .../platform/onboard/host/device_runner.cpp | 22 ++++++- src/a5/platform/onboard/host/device_runner.h | 1 + src/a5/platform/sim/host/device_runner.cpp | 19 ++++++- .../test_prepared_callable.py | 57 +++++++++++++------ 4 files changed, 77 insertions(+), 22 deletions(-) diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index e306bba79..7797ad2d9 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -889,6 +889,7 @@ int DeviceRunner::finalize() { } } func_id_to_addr_.clear(); + func_id_to_hash_.clear(); binaries_loaded_ = false; if (dev_orch_so_buffer_ != nullptr) { @@ -1039,11 +1040,24 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data return 0; } - // Return cached callable address if already uploaded + // Return cached callable address if already uploaded *and* the new bytes + // match. With the callable.md prepared-callable path, multiple ChipCallables + // share a single ChipWorker (and DeviceRunner) and can pick distinct kernel + // binaries for the same func_id. Naively reusing the cached entry hands the + // AICore the previous callable's kernel: dispatch never completes the new + // task and the AICPU spins forever. 
+ const uint64_t new_hash = simpler::common::utils::elf_build_id_64(bin_data, bin_size); auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { - LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id); - return it->second; + auto hash_it = func_id_to_hash_.find(func_id); + if (hash_it != func_id_to_hash_.end() && hash_it->second == new_hash) { + LOG_INFO_V0("Kernel func_id=%d already uploaded (matching hash), returning cached address", func_id); + return it->second; + } + LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id); + mem_alloc_.free(reinterpret_cast(it->second)); + func_id_to_addr_.erase(it); + func_id_to_hash_.erase(func_id); } LOG_DEBUG("Uploading kernel binary: func_id=%d, size=%zu bytes", func_id, bin_size); @@ -1073,6 +1087,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data } func_id_to_addr_[func_id] = callable_addr; + func_id_to_hash_[func_id] = new_hash; LOG_DEBUG(" func_id=%d -> callable_addr=0x%lx, binary_code_addr=0x%lx", func_id, callable_addr, binary_code_addr); @@ -1090,6 +1105,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) { mem_alloc_.free(gm_addr); func_id_to_addr_.erase(it); + func_id_to_hash_.erase(func_id); LOG_DEBUG("Removed kernel binary: func_id=%d, addr=0x%lx", func_id, function_bin_addr); } diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index a7d5c9fc2..8eb5b48e1 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -454,6 +454,7 @@ class DeviceRunner { // Kernel binary management bool binaries_loaded_{false}; // true after AICPU SO loaded std::map func_id_to_addr_; // func_id -> function_bin_addr (device GM) + std::map func_id_to_hash_; // func_id -> elf_build_id_64(bin_data) // Orchestration SO cache (host-tracked, device-resident). uint64_t cached_orch_so_hash_{0}; diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index cc4007ca2..82cd28b18 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -978,11 +978,24 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data return 0; } - // Return cached callable address if already uploaded + // Return cached callable address if already uploaded *and* the new bytes + // match. With the callable.md prepared-callable path, multiple ChipCallables + // share a single ChipWorker (and hence DeviceRunner) and can pick distinct + // kernel binaries for the same func_id. Naively reusing the cached entry + // hands the AICore the previous callable's kernel and segfaults at dispatch. 
auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { - LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id); - return reinterpret_cast(it->second.callable_buf); + const auto &cached_callable = *reinterpret_cast(it->second.callable_buf); + const auto *new_callable = reinterpret_cast(bin_data); + if (cached_callable.binary_size() == new_callable->binary_size() && + std::memcmp(cached_callable.binary_data(), new_callable->binary_data(), new_callable->binary_size()) == 0) { + LOG_INFO_V0("Kernel func_id=%d already uploaded (matching bytes), returning cached address", func_id); + return reinterpret_cast(it->second.callable_buf); + } + LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id); + if (it->second.dl_handle != nullptr) dlclose(it->second.dl_handle); + delete[] it->second.callable_buf; + func_id_to_addr_.erase(it); } // Extract binary from CoreCallable envelope diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py index ed1d8751c..040d929d5 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py +++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -9,14 +9,15 @@ # ----------------------------------------------------------------------------------------------------------- """End-to-end test for ChipWorker.prepare_callable / run_prepared / unregister_callable on a5/trb. -Mirrors tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable. Uses the -single-task orchestration borrowed from `orch_so_cache` plus -`mixed_example/kernels/aiv/kernel_add_standalone.cpp` so the test stays -focused on the prepare/run_prepared ABI rather than orchestration richness. - -aicpu_dlopen_count assertions verify that the per-cid AICPU dispatch table -collapses repeated runs of the same callable_id into a single AICPU dlopen, -matching docs/callable.md §7. +Mirrors tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable. Reuses the +vector_example orchestration + AIV kernels. Exercises: + - prepare_callable once, then run_prepared twice (second run proves the + AICPU-side dlopen cache / host-side orch SO dedup is working — no re-upload). + - Two distinct callable_ids sharing the same orch SO binary: verifies both + produce correct output independently. + - unregister_callable after runs complete: should not raise. + - aicpu_dlopen_count assertions covering: same-cid repeat, multi-cid + interleaving, double-prepare rejection, and unregister + re-prepare. 
""" import pytest @@ -26,8 +27,7 @@ from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs -_MIXED_KERNELS = "../mixed_example/kernels" -_ORCH_SO_CACHE = "../orch_so_cache" +_VECTOR_KERNELS = "../../../../../examples/a5/tensormap_and_ringbuffer/vector_example/kernels" @scene_test(level=2, runtime="tensormap_and_ringbuffer") @@ -36,14 +36,26 @@ class TestPreparedCallable(SceneTestCase): CALLABLE = { "orchestration": { - "source": f"{_ORCH_SO_CACHE}/kernels/orchestration/example_orchestration.cpp", + "source": f"{_VECTOR_KERNELS}/orchestration/example_orchestration.cpp", "function_name": "aicpu_orchestration_entry", "signature": [D.IN, D.IN, D.OUT], }, "incores": [ { "func_id": 0, - "source": f"{_MIXED_KERNELS}/aiv/kernel_add_standalone.cpp", + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.OUT], }, @@ -72,8 +84,7 @@ def generate_args(self, params): ) def compute_golden(self, args, params): - # f = a + b (kernel_add_standalone) - args.f[:] = args.a + args.b + args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) def _run_and_validate_l2( self, @@ -122,10 +133,18 @@ def _run_and_validate_l2( # ------------------------------------------------------------------ # aicpu_dlopen_count assertions (callable.md §7 verification). - # See a2a3 prepared_callable test for the contract notes. + # + # The L2 worker fixture is shared across tests in this class, so the + # counter can be non-zero on entry from prior tests' leftover prepared + # callables (or from this test class's own test_run). Each test below + # snapshots the counter on entry, asserts the *delta* introduced by the + # scenario, then unregisters everything it staged so the next test sees + # the same baseline (unregister_callable erases the cid, decrementing + # the counter). # ------------------------------------------------------------------ def _setup_dlopen_count_test(self, st_worker, st_platform): + """Common fixture: build callable + config, return (callable, config, case).""" case = self.CASES[0] callable_obj = self.build_callable(st_platform) config = self._build_config(case["config"]) @@ -185,7 +204,13 @@ def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): st_worker.unregister_callable(0) def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): - """Case D: prepare+run+unregister+prepare+run → counter monotonic, delta == 2.""" + """Case D: prepare(0)+run(0)+unregister(0)+prepare(0)+run(0) → delta == 2. + + unregister erases the cid from aicpu_seen_callable_ids_, so the second + prepare/run pair sets register_new_callable_id_ again and the AICPU + does a fresh dlopen. The counter is monotonic (does NOT decrement on + unregister), so the delta after the second cycle is 2. 
+ """ callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) baseline = st_worker.aicpu_dlopen_count registered = False From f5911a724583893b6523273febdf89ec89fc54c5 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 8 May 2026 18:22:29 +0800 Subject: [PATCH 25/28] fix(callable): preserve child_memory flag in mailbox args deserialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 3 (57963212) introduced `_read_args_from_mailbox` to rebuild a ChipStorageTaskArgs Python object from the mailbox blob in chip-child processes (replacing the legacy raw-bytes `run_from_blob` path). The unpacker read data/shapes/ndims/dtype but skipped the child_memory uint8 at offset 33, so every chip-child-side tensor came back with child_memory=False (the make() default). For tensors that carry a chip-owned device pointer — HCCL window slots in allreduce_distributed, deferred_notify_demo, ffn_tp_parallel — the bind_prepared_to_runtime_impl host path then treats the device address as a host pointer, allocates a fresh device buffer, and H2D copies from the (device) source: AICPU dispatches a task whose tensors point at uninitialised allocations, so the task lands in ready_queue with a kernel mask that scheduler/dispatch never advance, surfacing as the "PTO2 timeout after 800001 idle iterations" hang we saw on a2a3 onboard. multi_chip_dispatch passes because all of its tensors are host pointers (child_memory=False), so the missing byte happens to round-trip correctly. This is also why main is unaffected: there `run_from_blob` hands the mailbox bytes straight to C++ via reinterpret_cast on the 40B ContinuousTensor layout, which naturally preserves byte 33. Read offset 33 explicitly and pass it through ContinuousTensor.make. Layout matches src/common/task_interface/tensor_arg.h (40B with child_memory at byte 33). Verified on a2a3 onboard (devices 9,10): - examples/workers/l3/allreduce_distributed: PASS (was hang) - examples/a2a3/.../deferred_notify_demo: PASS (was hang) - examples/workers/l3/multi_chip_dispatch: PASS (no regression) --- python/simpler/worker.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 2e402e778..8c4998882 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -221,7 +221,14 @@ def _read_args_from_mailbox(buf) -> TaskArgs: shapes = struct.unpack_from("5I", buf, off + 8) ndims = struct.unpack_from("I", buf, off + 28)[0] dtype_val = struct.unpack_from("B", buf, off + 32)[0] - ct = ContinuousTensor.make(data, tuple(shapes[:ndims]), DataType(dtype_val)) + # offset 33: child_memory uint8 — must round-trip; otherwise tensors that + # carry a chip-owned device pointer (e.g. HCCL window slot) get dropped + # back to the default host-pointer path on the chip child, and + # bind_prepared_to_runtime_impl re-allocates + H2D-copies treating the + # device address as a host pointer. Layout matches ContinuousTensor in + # src/common/task_interface/tensor_arg.h (40B, child_memory @off 33). 
+            child_memory = struct.unpack_from("B", buf, off + 33)[0] != 0
+            ct = ContinuousTensor.make(data, tuple(shapes[:ndims]), DataType(dtype_val), child_memory=child_memory)
             args.add_tensor(ct)
 
         sc_off = ct_off + t_count * 40

From faeedd435148127536001be0b4c4d296afe428fa Mon Sep 17 00:00:00 2001
From: poursoul
Date: Sat, 9 May 2026 09:12:01 +0800
Subject: [PATCH 26/28] refactor(callable): chip child loops use raw blob
 path; consolidate args parsing in C++
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stage 3 (57963212) made chip-child loops re-deserialise the mailbox
ChipStorageTaskArgs blob in Python via _read_args_from_mailbox before
forwarding to cw.run_prepared. The hand-written Python parser dropped
ContinuousTensor.child_memory at offset 33, which silently broke every
tensor carrying a chip-owned device pointer (HCCL window slots in
allreduce_distributed / deferred_notify_demo / ffn_tp_parallel) on a2a3
onboard — the runtime treated the device address as a host pointer, the
submitted task sat in ready_queue with kernel_id=-1 / state=0 forever,
and the failure surfaced as 'PTO2 timeout after 800001 idle iterations'
on st-onboard-a2a3.

Root cause was duplicating the on-wire ContinuousTensor layout in
Python. Fix: keep the layout single-sourced in C++ and stop redoing it
in Python.

- Add _ChipWorker.run_prepared_from_blob(cid, ptr, capacity, config)
  nanobind overload. Internally calls read_blob (already used by every
  C++ caller) for a zero-copy TaskArgsView, then forwards to the
  existing run_prepared(view, ...) path. No new C-ABI symbol — just a
  Python-side overload over an existing C++ entry point.

- chip-child mailbox loops (_chip_process_loop and
  _chip_process_loop_with_bootstrap) drop the
  args = _read_args_from_mailbox(buf) round-trip and call
  run_prepared_from_blob with the mailbox address directly. The args
  object was never inspected in Python, so the typed-object detour
  bought nothing and only added a place to lose fields.

- _read_args_from_mailbox is kept (still used by _sub_worker_loop and
  _child_worker_loop, where the destination is a Python callable) but
  its body collapses to a one-line delegation to the existing nanobind
  read_args_from_blob helper. The hand-rolled struct.unpack_from layout
  (which had to know sizeof(ContinuousTensor)==40 and per-field offsets)
  is gone.

Net effect on chip-child hot path: one Python->C++ call instead of N+1
(per-tensor make() + add_tensor() + a final run_prepared()), no
intermediate Python TaskArgs / ContinuousTensor object construction.
And there is now exactly one place that knows the on-wire layout
(src/common/task_interface via read_blob), so adding a field to
ContinuousTensor cannot drop it on the chip-child path again.
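For reference, the chip-child hot path shrinks to a single call. A
sketch of the before/after call shape (names as in
python/simpler/worker.py):

    # before: re-deserialise in Python, then hand a typed object to C++
    args = _read_args_from_mailbox(buf)   # per-tensor make()/add_tensor()
    cw.run_prepared(cid, args, cfg)

    # after: pass the mailbox address; C++ read_blob parses the blob in place
    cw._impl.run_prepared_from_blob(cid, mailbox_addr + _OFF_ARGS,
                                    _MAILBOX_ARGS_CAPACITY, cfg)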
Verified on a2a3 onboard (devices 9,10) and a2a3sim:
- examples/workers/l3/allreduce_distributed: PASS (was hang)
- examples/a2a3/.../deferred_notify_demo: PASS (was hang)
- examples/workers/l3/multi_chip_dispatch: PASS (no regression)
- examples/workers/l3/child_memory [a2a3sim]: PASS
- tests/ut/py/test_chip_worker: 14/14 pass
---
 python/bindings/task_interface.cpp | 21 ++++++++-
 python/simpler/worker.py | 68 +++++++++++------------------
 2 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index 67647bfa5..42449631c 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -646,7 +646,26 @@ NB_MODULE(_task_interface, m) {
             self.run_prepared(callable_id, view, config);
         },
         nb::arg("callable_id"), nb::arg("args"), nb::arg("config"),
-        "Launch a callable_id from a TaskArgs (used by chip child loops)."
+        "Launch a callable_id from a TaskArgs (used for in-process callers)."
+    )
+    .def(
+        "run_prepared_from_blob",
+        [](ChipWorker &self, int32_t callable_id, uint64_t args_blob_ptr, size_t blob_capacity,
+           const CallConfig &config) {
+            // The mailbox region is the on-wire format `write_blob` produced;
+            // `read_blob` is the matching reader that returns a zero-copy
+            // TaskArgsView into the caller-owned bytes. Forwards to the
+            // existing `run_prepared(cid, view, config)` path so chip-child
+            // loops never re-implement the tensor/scalar layout in Python
+            // (where it has historically dropped fields like child_memory).
+            TaskArgsView view = read_blob(reinterpret_cast<const uint8_t *>(args_blob_ptr), blob_capacity);
+            self.run_prepared(callable_id, view, config);
+        },
+        nb::arg("callable_id"), nb::arg("args_blob_ptr"), nb::arg("blob_capacity"), nb::arg("config"),
+        "Launch a callable_id from a raw mailbox-blob pointer + capacity "
+        "(used by chip-child mailbox loops to avoid Python-side re-deserialisation "
+        "of the per-task tensor/scalar layout). The blob must be in the format "
+        "produced by `write_blob`; read_blob enforces capacity bounds against shm corruption."
+    )
     .def(
         "unregister_callable",
diff --git a/python/simpler/worker.py b/python/simpler/worker.py
index 8c4998882..0892cc8f8 100644
--- a/python/simpler/worker.py
+++ b/python/simpler/worker.py
@@ -62,6 +62,7 @@ def my_l4_orch(orch, args, config):
     ChipBootstrapMailboxState,
     _mailbox_load_i32,
     _mailbox_store_i32,
+    read_args_from_blob,
 )
 
 from . import _log as _simpler_log
@@ -75,8 +76,6 @@ def my_l4_orch(orch, args, config):
     ChipCallable,
     ChipContext,
     ChipWorker,
-    ContinuousTensor,
-    DataType,
     TaskArgs,
     _Worker,
 )
@@ -197,45 +196,21 @@ def _format_exc(prefix: str, exc: BaseException) -> str:
 def _read_args_from_mailbox(buf) -> TaskArgs:
     """Decode the TaskArgs blob written by C++ write_blob from the mailbox.
 
-    Blob layout at _OFF_ARGS:
-        int32 tensor_count (T), int32 scalar_count (S),
-        ContinuousTensor[T] (40 B each), uint64_t[S] (8 B each).
+    Used by the Python-targeted child loops (sub_worker, nested L4+ child)
+    where the destination of `args` is a Python callable that needs a
+    typed TaskArgs object. The chip-child loops that immediately forward
+    to C++ run_prepared use the zero-copy `run_prepared_from_blob` path
+    instead — see those loops for the matching comment.
+
+    Delegates to the nanobind helper so the ContinuousTensor layout is
+    parsed by C++ `read_blob` (single source of truth) instead of being
+    reimplemented in Python. 
The Python re-implementation that lived + here previously dropped the `child_memory` byte (offset 33), which + silently broke any tensor carrying a chip-owned device pointer + (HCCL window slots etc.) — now structurally impossible. """ - base = _OFF_ARGS - t_count = struct.unpack_from("i", buf, base)[0] - s_count = struct.unpack_from("i", buf, base + 4)[0] - if t_count < 0 or s_count < 0: - raise RuntimeError(f"args blob has negative counts: tensors={t_count}, scalars={s_count}") - blob_bytes = 8 + t_count * 40 + s_count * 8 - if blob_bytes > _MAILBOX_ARGS_CAPACITY: - raise RuntimeError( - f"args blob ({blob_bytes} bytes) exceeds mailbox capacity ({_MAILBOX_ARGS_CAPACITY} bytes); " - f"tensors={t_count}, scalars={s_count} — likely a corrupt header or a writer bug" - ) - - args = TaskArgs() - ct_off = base + 8 - for i in range(t_count): - off = ct_off + i * 40 - data = struct.unpack_from("Q", buf, off)[0] - shapes = struct.unpack_from("5I", buf, off + 8) - ndims = struct.unpack_from("I", buf, off + 28)[0] - dtype_val = struct.unpack_from("B", buf, off + 32)[0] - # offset 33: child_memory uint8 — must round-trip; otherwise tensors that - # carry a chip-owned device pointer (e.g. HCCL window slot) get dropped - # back to the default host-pointer path on the chip child, and - # bind_prepared_to_runtime_impl re-allocates + H2D-copies treating the - # device address as a host pointer. Layout matches ContinuousTensor in - # src/common/task_interface/tensor_arg.h (40B, child_memory @off 33). - child_memory = struct.unpack_from("B", buf, off + 33)[0] != 0 - ct = ContinuousTensor.make(data, tuple(shapes[:ndims]), DataType(dtype_val), child_memory=child_memory) - args.add_tensor(ct) - - sc_off = ct_off + t_count * 40 - for i in range(s_count): - args.add_scalar(struct.unpack_from("Q", buf, sc_off + i * 8)[0]) - - return args + mailbox_addr = ctypes.addressof(ctypes.c_char.from_buffer(buf)) + return read_args_from_blob(mailbox_addr + _OFF_ARGS) def _sub_worker_loop(buf, registry: dict) -> None: @@ -337,9 +312,13 @@ def _ensure_prepared(cid: int) -> None: code = 0 msg = "" try: - args = _read_args_from_mailbox(buf) _ensure_prepared(cid) - cw.run_prepared(cid, args, cfg) + # Hand the mailbox bytes straight to C++ (zero-copy zero-decode): + # the blob layout is what `write_blob` already wrote, so re-parsing + # it in Python is N×40B of avoidable work and a permanent + # opportunity to drop a field. C++ reinterpret_cast + # is the source of truth. + cw._impl.run_prepared_from_blob(cid, mailbox_addr + _OFF_ARGS, _MAILBOX_ARGS_CAPACITY, cfg) except Exception as e: # noqa: BLE001 code = 1 msg = _format_exc(f"chip_process dev={device_id}", e) @@ -462,9 +441,10 @@ def _ensure_prepared(cid: int) -> None: code = 0 msg = "" try: - args = _read_args_from_mailbox(buf) _ensure_prepared(cid) - cw._impl.run_prepared(cid, args, cfg) + # Hand the mailbox bytes straight to C++ (zero-copy zero-decode); + # see the matching comment in `_chip_process_loop`. 
+            cw._impl.run_prepared_from_blob(cid, mailbox_addr + _OFF_ARGS, _MAILBOX_ARGS_CAPACITY, cfg)
         except Exception as e:  # noqa: BLE001
             code = 1
             msg = _format_exc(f"chip_process dev={device_id}", e)

From afae373d02e0793c6c09ad3b297cc056e1307d30 Mon Sep 17 00:00:00 2001
From: poursoul
Date: Sat, 9 May 2026 16:24:03 +0800
Subject: [PATCH 27/28] fix(pr): plug callable SO leaks and drop in-comment doc anchors

- hbg DeviceRunner::finalize() now dlcloses any host orch handles
  callers forgot to unregister; the host process previously leaked one
  dlopen handle per re-created Worker (visible in long-running pytest).

- AICPU executor unlinks the on-disk libdevice_orch_<pid>_<cid>.so
  immediately after dlopen, so chip/sub/next-level children that exit
  via os._exit(0) no longer leave stale .so files in /tmp.

- ChipWorker docstring usage example now uses real keyword names
  (callable_id=, callable=, args=, config=) so the snippet parses as
  valid Python.

- Drop "callable.md" / "Stage N (callable.md)" pointers from comments
  and docstrings; keep the semantic content but remove references to the
  un-archived design doc, per .claude/rules/codestyle.md item 1.
---
 python/bindings/task_interface.cpp | 2 +-
 python/bindings/worker_bind.h | 5 ++-
 python/simpler/orchestrator.py | 8 ++---
 python/simpler/task_interface.py | 4 +--
 python/simpler/worker.py | 36 +++++++++----------
 simpler_setup/scene_test.py | 17 +++++----
 .../platform/onboard/aicpu/orch_so_file.cpp | 6 ++--
 .../platform/onboard/host/device_runner.cpp | 28 ++++++++++-----
 .../platform/onboard/host/device_runner.h | 6 ++--
 .../onboard/host/pto_runtime_c_api.cpp | 2 +-
 src/a2a3/platform/sim/host/device_runner.cpp | 29 ++++++++++-----
 src/a2a3/platform/sim/host/device_runner.h | 2 +-
 .../platform/sim/host/pto_runtime_c_api.cpp | 2 +-
 .../host_build_graph/host/runtime_maker.cpp | 4 +--
 .../host_build_graph/runtime/runtime.h | 10 +++---
 .../aicpu/aicpu_executor.cpp | 18 +++++++---
 .../host/runtime_maker.cpp | 10 +++---
 .../runtime/runtime.h | 8 ++---
 .../platform/onboard/aicpu/orch_so_file.cpp | 6 ++--
 .../platform/onboard/host/device_runner.cpp | 28 ++++++++++-----
 src/a5/platform/onboard/host/device_runner.h | 6 ++--
 .../onboard/host/pto_runtime_c_api.cpp | 2 +-
 src/a5/platform/sim/host/device_runner.cpp | 29 ++++++++++-----
 src/a5/platform/sim/host/device_runner.h | 2 +-
 .../platform/sim/host/pto_runtime_c_api.cpp | 2 +-
 .../host_build_graph/host/runtime_maker.cpp | 4 +--
 .../host_build_graph/runtime/runtime.h | 10 +++---
 .../aicpu/aicpu_executor.cpp | 18 +++++++---
 .../host/runtime_maker.cpp | 10 +++---
 .../runtime/runtime.h | 8 ++---
 src/common/hierarchical/orchestrator.h | 4 +--
 src/common/hierarchical/types.h | 6 ++--
 src/common/task_interface/callable_protocol.h | 2 +-
 src/common/worker/chip_worker.cpp | 9 +++--
 src/common/worker/chip_worker.h | 8 ++---
 src/common/worker/pto_runtime_c_api.h | 2 +-
 .../test_prepared_callable.py | 2 +-
 .../test_prepared_callable.py | 2 +-
 .../test_prepared_callable.py | 2 +-
 tests/ut/cpp/CMakeLists.txt | 7 ++--
 tests/ut/cpp/common/test_orch_so_file.cpp | 4 +--
 41 files changed, 212 insertions(+), 158 deletions(-)

diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index 42449631c..258e00cec 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -629,7 +629,7 @@ NB_MODULE(_task_interface, m) {
         },
         nb::arg("callable_id"), nb::arg("callable"),
         "Stage a ChipCallable under callable_id for cheap repeated launches "
-        "via run_prepared. 
Variants without callable.md support raise." + "via run_prepared. Variants without per-callable_id support raise." ) .def( "run_prepared", diff --git a/python/bindings/worker_bind.h b/python/bindings/worker_bind.h index fc1e0c909..00355856a 100644 --- a/python/bindings/worker_bind.h +++ b/python/bindings/worker_bind.h @@ -102,9 +102,8 @@ inline void bind_worker(nb::module_ &m) { return self.submit_next_level(callable_id, args, config, worker); }, nb::arg("callable_id"), nb::arg("args"), nb::arg("config"), nb::arg("worker") = int8_t(-1), - "Submit a NEXT_LEVEL (chip) task by registered callable id " - "(Stage 3, callable.md). worker= pins to a specific next-level " - "worker (-1 = any)." + "Submit a NEXT_LEVEL (chip) task by registered callable id. " + "worker= pins to a specific next-level worker (-1 = any)." ) .def( "submit_next_level_group", diff --git a/python/simpler/orchestrator.py b/python/simpler/orchestrator.py index d3f679de2..29bc84db6 100644 --- a/python/simpler/orchestrator.py +++ b/python/simpler/orchestrator.py @@ -46,12 +46,12 @@ def my_orch(orch, args, cfg): def _require_cid(callable_or_cid: Any, *, kind: str) -> int: - """Coerce a submit argument to a registered cid (Stage 3, callable.md). + """Coerce a submit argument to a registered cid. Raises a clear migration error when the caller still passes a - ``ChipCallable`` directly — the Stage 3 contract requires every chip - callable to be registered via ``Worker.register(callable)`` *before* - ``init()`` so each chip child can pre-warm it on its own device. + ``ChipCallable`` directly — every chip callable must be registered + via ``Worker.register(callable)`` *before* ``init()`` so each chip + child can pre-warm it on its own device. """ if isinstance(callable_or_cid, ChipCallable) or hasattr(callable_or_cid, "buffer_ptr"): raise TypeError( diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index ca6abad8e..3dd918b89 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -240,8 +240,8 @@ class ChipWorker: aicpu_path="build/lib/.../aicpu.so", aicore_path="build/lib/.../aicore.o") worker.set_device(device_id=0) - worker.prepare_callable(cid=0, chip_callable) - worker.run_prepared(cid=0, orch_args, CallConfig(block_dim=24)) + worker.prepare_callable(callable_id=0, callable=chip_callable) + worker.run_prepared(callable_id=0, args=orch_args, config=CallConfig(block_dim=24)) worker.reset_device() worker.finalize() """ diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 0892cc8f8..29f65fb65 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -128,7 +128,7 @@ def my_l4_orch(orch, args, config): _CTRL_FREE = 1 _CTRL_COPY_TO = 2 _CTRL_COPY_FROM = 3 -# Stage 3 (callable.md): pre-warm a chip child for cid=arg0 by calling +# Pre-warm a chip child for cid=arg0 by calling # `prepare_callable(cid, registry[cid])` so the first run_prepared() does # not pay the H2D upload cost. Sent from the parent right after init() # (or whenever a new ChipCallable cid is registered). @@ -262,7 +262,7 @@ def _chip_process_loop( (computed via `_log.get_current_config()`); the child cannot read the parent's logger after fork, so the values are passed explicitly. 
- Stage 3 (callable.md): TASK_READY carries a cid in OFF_CALLABLE; the + Per-callable_id dispatch: TASK_READY carries a cid in OFF_CALLABLE; the child looks the cid up in the COW-inherited Python ``registry`` to get the ChipCallable, calls ``cw.prepare_callable(cid, callable)`` once, then ``cw.run_prepared(cid, args, cfg)``. ``_CTRL_PREPARE`` is the @@ -418,8 +418,8 @@ def _chip_process_loop_with_bootstrap( # noqa: PLR0912, PLR0915 sys.stderr.write(f"[chip_process pid={os.getpid()} dev={device_id} bootstrap] ready\n") sys.stderr.flush() - # Per-child set of cids already prepared on this device (Stage 3, - # callable.md). Mirrors `_chip_process_loop`'s `prepared`. + # Per-child set of cids already prepared on this device. Mirrors + # `_chip_process_loop`'s `prepared`. prepared: set[int] = set() def _ensure_prepared(cid: int) -> None: @@ -642,11 +642,11 @@ def __init__( def register(self, target) -> int: """Register a callable. Returns the cid passed to ``run`` / ``submit_*``. - Stage 4 (callable.md): a unified id space serves Python functions - (sub fn / orch fn) and ``ChipCallable`` instances at every level. - L2 returns a cid the user passes to ``Worker.run(cid, args, cfg)``; - L3+ returns a cid the orch function passes to - ``orch.submit_next_level(cid, …)`` / ``orch.submit_sub(cid, …)``. + A unified id space serves Python functions (sub fn / orch fn) and + ``ChipCallable`` instances at every level. L2 returns a cid the + user passes to ``Worker.run(cid, args, cfg)``; L3+ returns a cid + the orch function passes to ``orch.submit_next_level(cid, …)`` / + ``orch.submit_sub(cid, …)``. Timing constraints: - L3+: must be called **before** ``init()`` so the COW-inherited @@ -727,8 +727,8 @@ def _init_level2(self) -> None: self._chip_worker.init(binaries) self._chip_worker.set_device(device_id) - # Stage 4 (callable.md): pre-warm any registered ChipCallable so the - # first run(cid, …) does not pay the H2D upload cost. + # Pre-warm any registered ChipCallable so the first run(cid, …) + # does not pay the H2D upload cost. assert self._chip_worker is not None for cid, target in self._callable_registry.items(): if isinstance(target, ChipCallable): @@ -919,9 +919,9 @@ def _start_hierarchical(self) -> None: # noqa: PLR0912 -- three parallel fork l # Pre-warm every chip child: for each registered ChipCallable cid, # send `_CTRL_PREPARE` to all chip children so the first - # `submit_next_level` does not pay the H2D upload cost (callable.md - # §3.3). Sub fns / orch fns do not need pre-warming — the - # registry is already COW-inherited. + # `submit_next_level` does not pay the H2D upload cost. Sub fns / + # orch fns do not need pre-warming — the registry is already + # COW-inherited. if device_ids: for cid, target in self._callable_registry.items(): if isinstance(target, ChipCallable): @@ -1124,7 +1124,7 @@ def copy_from(self, dst: int, src: int, size: int, worker_id: int = 0) -> None: def run(self, callable, args=None, config=None) -> None: """Execute one task (L2) or one DAG (L3+) synchronously. - Stage 4 (callable.md): + Dispatch: - L2: ``callable`` is a cid returned by ``Worker.register(chip_callable)``. Routes to ``_chip_worker.run_prepared(cid, args, cfg)``. 
- L3+: ``callable`` is a Python orch fn invoked with the @@ -1169,7 +1169,7 @@ def prepare_callable(self, callable_id: int, callable) -> None: """ assert self._initialized, "Worker not initialized; call init() first" if self.level != 2: - raise NotImplementedError("prepare_callable is L2-only (callable.md Stage 2)") + raise NotImplementedError("prepare_callable is L2-only") assert self._chip_worker is not None self._chip_worker.prepare_callable(callable_id, callable) @@ -1177,7 +1177,7 @@ def run_prepared(self, callable_id: int, args=None, config=None) -> None: """L2 only: launch a callable previously staged via ``prepare_callable``.""" assert self._initialized, "Worker not initialized; call init() first" if self.level != 2: - raise NotImplementedError("run_prepared is L2-only (callable.md Stage 2)") + raise NotImplementedError("run_prepared is L2-only") assert self._chip_worker is not None cfg = config if config is not None else CallConfig() self._chip_worker.run_prepared(callable_id, args, cfg) @@ -1186,7 +1186,7 @@ def unregister_callable(self, callable_id: int) -> None: """L2 only: drop the prepared state for ``callable_id``.""" assert self._initialized, "Worker not initialized; call init() first" if self.level != 2: - raise NotImplementedError("unregister_callable is L2-only (callable.md Stage 2)") + raise NotImplementedError("unregister_callable is L2-only") assert self._chip_worker is not None self._chip_worker.unregister_callable(callable_id) diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index 401d32dfe..2d13282c9 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -918,11 +918,10 @@ def _run_and_validate_l2( config_dict = case.get("config", {}) orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) - # Stage 4 (callable.md): the L2 entry point is now - # `Worker.run(cid, args, cfg)`. Reuse the cid registered by the - # st_worker fixture / standalone path. For first-time callers - # (worker reused across rounds), `_st_l2_cid` caches the cid so - # subsequent runs skip re-registration. + # The L2 entry point is `Worker.run(cid, args, cfg)`. Reuse the + # cid registered by the st_worker fixture / standalone path. For + # first-time callers (worker reused across rounds), `_st_l2_cid` + # caches the cid so subsequent runs skip re-registration. cid = getattr(type(self), "_st_l2_cid", None) if cid is None: cid = worker.register(callable_obj) @@ -1067,8 +1066,8 @@ def test_run(self, st_platform, st_worker, request): cls_name = type(self).__name__ callable_obj = self.build_callable(st_platform) sub_ids = getattr(type(self), "_st_sub_ids", {}) - # Stage 3 (callable.md): for L3, use pre-registered chip cids - # instead of raw ChipCallable objects. + # For L3, use pre-registered chip cids instead of raw ChipCallable + # objects. chip_cids = getattr(type(self), "_st_chip_cids", {}) if self._st_level == 3 and chip_cids: callable_obj = {**chip_cids} @@ -1612,8 +1611,8 @@ def _create_standalone_worker(group, level, args, selected_by_cls): ) # Register sub callables per-class to avoid name collisions per_class_sub_ids: dict[type, dict] = {} - # Stage 3 (callable.md): also register ChipCallables here (before init) - # so the chip children pre-warm them via _CTRL_PREPARE. + # Also register ChipCallables here (before init) so the chip children + # pre-warm them via _CTRL_PREPARE. 
per_class_chip_cids: dict[type, dict] = {} for cls in group: cls_sub_ids = {} diff --git a/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp b/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp index a1847adb9..4e7f55232 100644 --- a/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp +++ b/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp @@ -17,9 +17,9 @@ int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) { // Pid + callable_id naming: AICPU device libc may lack mkstemps. With - // Stage 4 per-callable_id dispatch, multiple orch SOs can be resident - // in the same device process at once (one per cid in `orch_so_table_`), - // so the on-disk file name must be unique per cid — otherwise the + // per-callable_id dispatch, multiple orch SOs can be resident in the + // same device process at once (one per cid in `orch_so_table_`), so + // the on-disk file name must be unique per cid — otherwise the // second cid's `O_TRUNC` would silently shred the first cid's already // dlopen'd file image and the next launch on cid=0 would SIGBUS. // callable_id < 0 is the legacy single-slot path: pid alone is fine. diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 7f94aa7d1..b3c924e64 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -747,8 +747,8 @@ void DeviceRunner::print_handshake_results() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { - // Per-callable_id path (callable.md): when run_prepared bound a known - // callable_id, the SO bytes were already H2D'd at prepare_callable time. + // Per-callable_id path: when run_prepared bound a known callable_id, + // the SO bytes were already H2D'd at prepare_callable time. // We just stamp dev_orch_so on the runtime, plus mark `is_new` based on // whether the AICPU has seen this id since registration. const int32_t cid = runtime.get_active_callable_id(); @@ -1025,8 +1025,8 @@ int DeviceRunner::finalize() { so_info_.finalize(); // Kernel binaries are normally released by validate_runtime_impl on the - // legacy run() path. The callable.md prepared-callable path intentionally - // leaves them resident across runs (shared by func_id) and relies on + // legacy run() path. The prepared-callable path intentionally leaves + // them resident across runs (shared by func_id) and relies on // finalize() to reclaim them; that is not a leak. Emit at DEBUG so the // legacy regression signal is preserved for callers that never went // through prepare_callable. @@ -1066,6 +1066,16 @@ int DeviceRunner::finalize() { } } orch_so_dedup_.clear(); + // hbg path: dlclose any host orch handles callers forgot to unregister. + // finalize() is the last chance; Worker.close() does not auto-unregister + // each callable_id, so without this loop the host process leaks one + // dlopen handle per (re)created Worker — observable in long-running + // pytest sessions. + for (auto &kv : prepared_callables_) { + if (kv.second.host_dlopen_handle != nullptr) { + dlclose(kv.second.host_dlopen_handle); + } + } prepared_callables_.clear(); aicpu_seen_callable_ids_.clear(); aicpu_dlopen_total_ = 0; @@ -1235,11 +1245,11 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data } // Return cached callable address if already uploaded *and* the new bytes - // match. 
With the callable.md prepared-callable path, multiple ChipCallables - // share a single ChipWorker (and DeviceRunner) and can pick distinct kernel - // binaries for the same func_id. Naively reusing the cached entry hands the - // AICore the previous callable's kernel: dispatch never completes the new - // task and the AICPU spins forever. + // match. With the prepared-callable path, multiple ChipCallables share a + // single ChipWorker (and DeviceRunner) and can pick distinct kernel + // binaries for the same func_id. Naively reusing the cached entry hands + // the AICore the previous callable's kernel: dispatch never completes + // the new task and the AICPU spins forever. const uint64_t new_hash = simpler::common::utils::elf_build_id_64(bin_data, bin_size); auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index afca1bc30..c910e47c5 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -537,8 +537,8 @@ class DeviceRunner { bool binaries_loaded_{false}; // true after AICPU SO loaded std::map func_id_to_addr_; // func_id -> function_bin_addr (device GM) // Parallel hash map for upload_kernel_binary() to detect when the same - // func_id is re-uploaded with different binary bytes (different ChipCallable - // sharing the same func_id under callable.md / Stage 4). + // func_id is re-uploaded with different binary bytes (different + // ChipCallable sharing the same func_id under the per-callable_id path). std::map func_id_to_hash_; // Orchestration SO cache. `cached_orch_so_hash_ == 0` means "no cache". @@ -550,7 +550,7 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; - // Per-callable_id prepared state (callable.md design). + // Per-callable_id prepared state. // // `prepared_callables_` maps the caller-stable callable_id to the orch // SO slice + symbol names needed to launch it. `orch_so_dedup_` shares diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index a3d883029..c647f4887 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -235,7 +235,7 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { } /* =========================================================================== - * Per-callable_id preparation (callable.md design) + * Per-callable_id preparation * =========================================================================== */ int prepare_callable( diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 795a4220f..8cee9029e 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -674,8 +674,8 @@ void DeviceRunner::unload_executor_binaries() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { - // Per-callable_id path (callable.md): mirror onboard. Bytes were staged - // at register_prepared_callable time; here we only stamp metadata onto + // Per-callable_id path: mirror onboard. Bytes were staged at + // register_prepared_callable time; here we only stamp metadata onto // the runtime and resolve `register_new_callable_id_` from first sighting. 
const int32_t cid = runtime.get_active_callable_id(); if (cid >= 0) { @@ -924,9 +924,9 @@ int DeviceRunner::finalize() { } // Kernel binaries are normally released by validate_runtime_impl on the - // legacy run() path. The callable.md prepared-callable path intentionally - // leaves them resident across runs and relies on finalize() to reclaim - // them; that is not a leak. + // legacy run() path. The prepared-callable path intentionally leaves + // them resident across runs and relies on finalize() to reclaim them; + // that is not a leak. if (!func_id_to_addr_.empty()) { const bool prepared_path_used = prepared_callable_path_used_; if (prepared_path_used) { @@ -962,6 +962,16 @@ int DeviceRunner::finalize() { } } orch_so_dedup_.clear(); + // hbg path: dlclose any host orch handles callers forgot to unregister. + // finalize() is the last chance; Worker.close() does not auto-unregister + // each callable_id, so without this loop the host process leaks one + // dlopen handle per (re)created Worker — observable in long-running + // pytest sessions. + for (auto &kv : prepared_callables_) { + if (kv.second.host_dlopen_handle != nullptr) { + dlclose(kv.second.host_dlopen_handle); + } + } prepared_callables_.clear(); aicpu_seen_callable_ids_.clear(); aicpu_dlopen_total_ = 0; @@ -992,10 +1002,11 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data } // Return cached callable address if already uploaded *and* the new bytes - // match. With the callable.md prepared-callable path, multiple ChipCallables - // share a single ChipWorker (and hence DeviceRunner) and can pick distinct - // kernel binaries for the same func_id. Naively reusing the cached entry - // hands the AICore the previous callable's kernel and segfaults at dispatch. + // match. With the prepared-callable path, multiple ChipCallables share a + // single ChipWorker (and hence DeviceRunner) and can pick distinct + // kernel binaries for the same func_id. Naively reusing the cached + // entry hands the AICore the previous callable's kernel and segfaults + // at dispatch. auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { const auto &cached_callable = *reinterpret_cast(it->second.callable_buf); diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 4825817de..994d92c3b 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -251,7 +251,7 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; - // Per-callable_id prepared state (callable.md design). Mirrors onboard. + // Per-callable_id prepared state. Mirrors onboard. 
struct PreparedCallableState { // trb path uint64_t hash{0}; diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 3d0bba1fa..79b54bf51 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -218,7 +218,7 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { } /* =========================================================================== - * Per-callable_id preparation (callable.md design) + * Per-callable_id preparation * =========================================================================== */ int prepare_callable( diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index 7bcc1f50d..390ad3d19 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -281,8 +281,8 @@ extern "C" { * handle and resolved entry-symbol pointer are parked on the runtime via * `pending_host_dlopen_handle_` / `pending_host_orch_func_ptr_` so the * platform layer can hoist them into PreparedCallableState. Splitting this - * out of init_runtime_impl is what callable.md's prepare_callable / run_prepared - * design rests on for hbg — the dlopen runs once per cid instead of every run. + * out of init_runtime_impl is what the hbg prepare_callable / run_prepared + * path rests on — the dlopen runs once per cid instead of every run. */ int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) { if (runtime == nullptr) { diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index bd447955c..25d25dc76 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -472,8 +472,8 @@ class Runtime { uint64_t dev_orch_so_addr_{0}; uint64_t dev_orch_so_size_{0}; - // Per-callable_id dispatch (callable.md). hbg orch runs on host, so AICPU - // never reads `active_callable_id_`; the field exists for parity with the + // Per-callable_id dispatch. hbg orch runs on host, so AICPU never reads + // `active_callable_id_`; the field exists for parity with the // shared platform layer (DeviceRunner stamps it on every run). int32_t active_callable_id_{-1}; bool register_new_callable_id_{false}; @@ -482,7 +482,7 @@ class Runtime { const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; - // Host-orchestration staging (callable.md hbg path). prepare_callable_impl + // Host-orchestration staging (hbg path). prepare_callable_impl // dlopens the orch SO on the host and parks the handle + entry-symbol // pointer here so DeviceRunner::register_prepared_callable_host_orch can // claim them; bind_prepared_callable_to_runtime restores them onto a fresh @@ -492,8 +492,8 @@ class Runtime { void *pending_host_dlopen_handle_{nullptr}; void *pending_host_orch_func_ptr_{nullptr}; - // Device-orchestration entry/config symbol names (callable.md trb path). - // Always empty on this hbg variant — included for API parity so the shared + // Device-orchestration entry/config symbol names (trb path). Always + // empty on this hbg variant — included for API parity so the shared // platform layer can call set_device_orch_func_name unconditionally. 
  char device_orch_func_name_[64]{};
   char device_orch_config_name_[64]{};

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index cf760836b..a15584829 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -90,9 +90,9 @@ static int32_t read_pto2_runtime_status(Runtime *runtime) {
 
 static PTO2Runtime *rt{nullptr};
 
-// Per-callable_id orchestration SO table. AICPU side of the callable.md
-// design: the executor dispatches `orch_so_table_[active_callable_id_]`
-// (created on first sighting of that callable_id, kept warm across runs).
+// Per-callable_id orchestration SO table. The executor dispatches
+// `orch_so_table_[active_callable_id_]` (created on first sighting of
+// that callable_id, kept warm across runs).
 // MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values
 // (mailbox uint32 callable_id, register() returns small ints) and is shared
 // with the host bounds check in DeviceRunner::register_prepared_callable —
@@ -213,8 +213,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     if (runtime->get_orch_built_on_host()) {
       LOG_INFO_V0("Thread %d: Host orchestration mode, no-op", thread_idx);
     } else {
-      // Per-callable_id dispatch (callable.md): the orch SO state lives
-      // in `orch_so_table_[callable_id]` keyed by registration order;
+      // Per-callable_id dispatch: the orch SO state lives in
+      // `orch_so_table_[callable_id]` keyed by registration order;
       // reload is governed by `register_new_callable_id_`.
       const int32_t callable_id = runtime->get_active_callable_id();
       if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
@@ -305,6 +305,14 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         }
         LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
 
+        // Unlink the on-disk SO immediately: dlopen has already mmap'd
+        // the image, so the kernel keeps the inode alive until the
+        // matching dlclose / process exit. This prevents stale
+        // libdevice_orch_<pid>_<cid>.so files from accumulating in
+        // /tmp when child processes exit via os._exit(0), which skips
+        // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
+        unlink(so_path);
+
         const char *entry_symbol = runtime->get_device_orch_func_name();
         if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
           entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 99b31c07f..b93ac103b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -95,8 +95,8 @@ static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader
  * Stage the per-callable resources (kernel binaries + orchestration SO) into
  * the supplied runtime so a subsequent bind_prepared_to_runtime_impl can use
  * them. This is the cacheable half of init_runtime_impl: nothing here depends
- * on per-run argument values, so callable.md's prepare_callable / run_prepared
- * split lets us run this once per callable_id and amortize across runs.
+ * on per-run argument values, so the prepare_callable / run_prepared split
+ * lets us run this once per callable_id and amortize across runs. 
* * @param runtime Pointer to pre-constructed Runtime (host_api populated) * @param callable ChipCallable carrying the orch SO + child kernel binaries @@ -157,9 +157,9 @@ extern "C" int prepare_callable_impl(Runtime *runtime, const ChipCallable *calla * callable-side state (kernel binaries, orch SO bytes, func/config names) * is already populated by prepare_callable_impl. * - * Splitting this from prepare_callable_impl matches the callable.md design: - * register/run_prepared invokes this every call, while the prep half runs - * only once per callable_id. + * Splitting this from prepare_callable_impl matches the per-callable_id + * design: register/run_prepared invokes this every call, while the prep + * half runs only once per callable_id. * * @param runtime Pointer to pre-constructed Runtime (host_api populated) * @param orch_args Separated tensor/scalar arguments for this run diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 97d06f4d2..ad70a259a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -191,7 +191,7 @@ class Runtime { // owned by DeviceRunner; only the metadata below travels inside Runtime. uint64_t dev_orch_so_addr_; uint64_t dev_orch_so_size_; - // Per-callable_id dispatch (callable.md design). AICPU dispatches via + // Per-callable_id dispatch. AICPU dispatches via // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_` // signals whether the host is delivering a freshly-registered // callable_id (write+dlopen) or reusing an already-loaded one. @@ -252,7 +252,7 @@ class Runtime { void set_dev_orch_so(uint64_t dev_addr, uint64_t size); uint64_t get_dev_orch_so_addr() const; uint64_t get_dev_orch_so_size() const; - // Per-callable_id dispatch (callable.md). callable_id must be in + // Per-callable_id dispatch. callable_id must be in // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU // whether to (re)load the orch SO into orch_so_table_[callable_id] or // reuse the cached entry. @@ -306,8 +306,8 @@ class Runtime { const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; - // Host-orchestration staging (callable.md hbg path). Always nullptr on - // this trb variant — included for API parity with host_build_graph so the + // Host-orchestration staging (hbg path). Always nullptr on this trb + // variant — included for API parity with host_build_graph so the // shared platform layer can branch on `pending_host_dlopen_handle_ != // nullptr` at runtime instead of via a build-time macro. void *pending_host_dlopen_handle_{nullptr}; diff --git a/src/a5/platform/onboard/aicpu/orch_so_file.cpp b/src/a5/platform/onboard/aicpu/orch_so_file.cpp index a1847adb9..4e7f55232 100644 --- a/src/a5/platform/onboard/aicpu/orch_so_file.cpp +++ b/src/a5/platform/onboard/aicpu/orch_so_file.cpp @@ -17,9 +17,9 @@ int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) { // Pid + callable_id naming: AICPU device libc may lack mkstemps. 
With - // Stage 4 per-callable_id dispatch, multiple orch SOs can be resident - // in the same device process at once (one per cid in `orch_so_table_`), - // so the on-disk file name must be unique per cid — otherwise the + // per-callable_id dispatch, multiple orch SOs can be resident in the + // same device process at once (one per cid in `orch_so_table_`), so + // the on-disk file name must be unique per cid — otherwise the // second cid's `O_TRUNC` would silently shred the first cid's already // dlopen'd file image and the next launch on cid=0 would SIGBUS. // callable_id < 0 is the legacy single-slot path: pid alone is fine. diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 7797ad2d9..068e3d6bc 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -602,8 +602,8 @@ void DeviceRunner::print_handshake_results() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { - // Per-callable_id path (callable.md): when run_prepared bound a known - // callable_id, the SO bytes were already H2D'd at prepare_callable time. + // Per-callable_id path: when run_prepared bound a known callable_id, + // the SO bytes were already H2D'd at prepare_callable time. // We just stamp dev_orch_so on the runtime, plus mark `is_new` based on // whether the AICPU has seen this id since registration. const int32_t cid = runtime.get_active_callable_id(); @@ -870,8 +870,8 @@ int DeviceRunner::finalize() { so_info_.finalize(); // Kernel binaries are normally released by validate_runtime_impl on the - // legacy run() path. The callable.md prepared-callable path intentionally - // leaves them resident across runs (shared by func_id) and relies on + // legacy run() path. The prepared-callable path intentionally leaves + // them resident across runs (shared by func_id) and relies on // finalize() to reclaim them; that is not a leak. Emit at DEBUG so the // legacy regression signal is preserved for callers that never went // through prepare_callable. @@ -910,6 +910,16 @@ int DeviceRunner::finalize() { } } orch_so_dedup_.clear(); + // hbg path: dlclose any host orch handles callers forgot to unregister. + // finalize() is the last chance; Worker.close() does not auto-unregister + // each callable_id, so without this loop the host process leaks one + // dlopen handle per (re)created Worker — observable in long-running + // pytest sessions. + for (auto &kv : prepared_callables_) { + if (kv.second.host_dlopen_handle != nullptr) { + dlclose(kv.second.host_dlopen_handle); + } + } prepared_callables_.clear(); aicpu_seen_callable_ids_.clear(); aicpu_dlopen_total_ = 0; @@ -1041,11 +1051,11 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data } // Return cached callable address if already uploaded *and* the new bytes - // match. With the callable.md prepared-callable path, multiple ChipCallables - // share a single ChipWorker (and DeviceRunner) and can pick distinct kernel - // binaries for the same func_id. Naively reusing the cached entry hands the - // AICore the previous callable's kernel: dispatch never completes the new - // task and the AICPU spins forever. + // match. With the prepared-callable path, multiple ChipCallables share a + // single ChipWorker (and DeviceRunner) and can pick distinct kernel + // binaries for the same func_id. 
Naively reusing the cached entry hands + // the AICore the previous callable's kernel: dispatch never completes + // the new task and the AICPU spins forever. const uint64_t new_hash = simpler::common::utils::elf_build_id_64(bin_data, bin_size); auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 8eb5b48e1..12c1dab84 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -462,9 +462,9 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; - // Per-callable_id prepared state (callable.md design). See a2a3 onboard - // device_runner.h for the full design narrative; mirrored here so a5 - // shares the same dispatch surface. + // Per-callable_id prepared state. See a2a3 onboard device_runner.h for + // the full design narrative; mirrored here so a5 shares the same + // dispatch surface. struct PreparedCallableState { // trb path uint64_t hash{0}; diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 11726fe9d..e3d8660be 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -269,7 +269,7 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { runner->set_log_info_v(log_info_v); } /* =========================================================================== - * Per-callable_id preparation (callable.md design) + * Per-callable_id preparation * =========================================================================== */ int prepare_callable( diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 82cd28b18..ea325c7f9 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -654,8 +654,8 @@ void DeviceRunner::unload_executor_binaries() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { - // Per-callable_id path (callable.md): mirror onboard. Bytes were staged - // at register_prepared_callable time; here we only stamp metadata onto + // Per-callable_id path: mirror onboard. Bytes were staged at + // register_prepared_callable time; here we only stamp metadata onto // the runtime and resolve `register_new_callable_id_` from first sighting. const int32_t cid = runtime.get_active_callable_id(); if (cid >= 0) { @@ -911,9 +911,9 @@ int DeviceRunner::finalize() { } // Kernel binaries are normally released by validate_runtime_impl on the - // legacy run() path. The callable.md prepared-callable path intentionally - // leaves them resident across runs and relies on finalize() to reclaim - // them; that is not a leak. + // legacy run() path. The prepared-callable path intentionally leaves + // them resident across runs and relies on finalize() to reclaim them; + // that is not a leak. if (!func_id_to_addr_.empty()) { const bool prepared_path_used = prepared_callable_path_used_; if (prepared_path_used) { @@ -949,6 +949,16 @@ int DeviceRunner::finalize() { } } orch_so_dedup_.clear(); + // hbg path: dlclose any host orch handles callers forgot to unregister. + // finalize() is the last chance; Worker.close() does not auto-unregister + // each callable_id, so without this loop the host process leaks one + // dlopen handle per (re)created Worker — observable in long-running + // pytest sessions. 
+ for (auto &kv : prepared_callables_) { + if (kv.second.host_dlopen_handle != nullptr) { + dlclose(kv.second.host_dlopen_handle); + } + } prepared_callables_.clear(); aicpu_seen_callable_ids_.clear(); aicpu_dlopen_total_ = 0; @@ -979,10 +989,11 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data } // Return cached callable address if already uploaded *and* the new bytes - // match. With the callable.md prepared-callable path, multiple ChipCallables - // share a single ChipWorker (and hence DeviceRunner) and can pick distinct - // kernel binaries for the same func_id. Naively reusing the cached entry - // hands the AICore the previous callable's kernel and segfaults at dispatch. + // match. With the prepared-callable path, multiple ChipCallables share a + // single ChipWorker (and hence DeviceRunner) and can pick distinct + // kernel binaries for the same func_id. Naively reusing the cached + // entry hands the AICore the previous callable's kernel and segfaults + // at dispatch. auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { const auto &cached_callable = *reinterpret_cast(it->second.callable_buf); diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 042121518..a153a18a1 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -262,7 +262,7 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; - // Per-callable_id prepared state (callable.md design). Mirrors onboard. + // Per-callable_id prepared state. Mirrors onboard. struct PreparedCallableState { // trb path uint64_t hash{0}; diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 113bfdb07..db05b3ac1 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -217,7 +217,7 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { runner->set_log_info_v(log_info_v); } /* =========================================================================== - * Per-callable_id preparation (callable.md design) + * Per-callable_id preparation * =========================================================================== */ int prepare_callable( diff --git a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp index 7bcc1f50d..390ad3d19 100644 --- a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp @@ -281,8 +281,8 @@ extern "C" { * handle and resolved entry-symbol pointer are parked on the runtime via * `pending_host_dlopen_handle_` / `pending_host_orch_func_ptr_` so the * platform layer can hoist them into PreparedCallableState. Splitting this - * out of init_runtime_impl is what callable.md's prepare_callable / run_prepared - * design rests on for hbg — the dlopen runs once per cid instead of every run. + * out of init_runtime_impl is what the hbg prepare_callable / run_prepared + * path rests on — the dlopen runs once per cid instead of every run. 
*/ int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) { if (runtime == nullptr) { diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index a68b29fea..73e201494 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -478,15 +478,15 @@ class Runtime { // Device orchestration SO metadata (see a2a3 host_build_graph runtime.h). uint64_t dev_orch_so_addr_{0}; uint64_t dev_orch_so_size_{0}; - // Per-callable_id dispatch (callable.md). hbg orch runs on host, so AICPU - // never reads `active_callable_id_`; the field exists for parity with the + // Per-callable_id dispatch. hbg orch runs on host, so AICPU never reads + // `active_callable_id_`; the field exists for parity with the // shared platform layer (DeviceRunner stamps it on every run). int32_t active_callable_id_{-1}; bool register_new_callable_id_{false}; const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; - // Host-orchestration staging (callable.md hbg path). prepare_callable_impl + // Host-orchestration staging (hbg path). prepare_callable_impl // dlopens the orch SO on the host and parks the handle + entry-symbol // pointer here so DeviceRunner::register_prepared_callable_host_orch can // claim them; bind_prepared_callable_to_runtime restores them onto a fresh @@ -495,8 +495,8 @@ class Runtime { void *pending_host_dlopen_handle_{nullptr}; void *pending_host_orch_func_ptr_{nullptr}; - // Device-orchestration entry/config symbol names (callable.md trb path). - // Always empty on this hbg variant — included for API parity so the shared + // Device-orchestration entry/config symbol names (trb path). Always + // empty on this hbg variant — included for API parity so the shared // platform layer can call set_device_orch_func_name unconditionally. char device_orch_func_name_[64]{}; char device_orch_config_name_[64]{}; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 78d5ef52c..e9b97d5ff 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -90,9 +90,9 @@ static int32_t read_runtime_status(Runtime *runtime) { static PTO2Runtime *rt{nullptr}; -// Per-callable_id orchestration SO table. AICPU side of the callable.md -// design: the executor dispatches `orch_so_table_[active_callable_id_]` -// (created on first sighting of that callable_id, kept warm across runs). +// Per-callable_id orchestration SO table. The executor dispatches +// `orch_so_table_[active_callable_id_]` (created on first sighting of +// that callable_id, kept warm across runs). // MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values // (mailbox uint32 callable_id, register() returns small ints) and is shared // with the host bounds check in DeviceRunner::register_prepared_callable — @@ -213,8 +213,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (runtime->get_orch_built_on_host()) { LOG_INFO_V0("Thread %d: Host orchestration mode, no-op", thread_idx); } else { - // Per-callable_id dispatch (callable.md): the orch SO state lives - // in `orch_so_table_[callable_id]` keyed by registration order; + // Per-callable_id dispatch: the orch SO state lives in + // `orch_so_table_[callable_id]` keyed by registration order; // reload is governed by `register_new_callable_id_`. 
      const int32_t callable_id = runtime->get_active_callable_id();
       if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
@@ -305,6 +305,14 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         }
         LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
 
+        // Unlink the on-disk SO immediately: dlopen has already mmap'd
+        // the image, so the kernel keeps the inode alive until the
+        // matching dlclose / process exit. This prevents stale
+        // libdevice_orch_<pid>_<cid>.so files from accumulating in
+        // /tmp when child processes exit via os._exit(0), which skips
+        // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
+        unlink(so_path);
+
         const char *entry_symbol = runtime->get_device_orch_func_name();
         if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
           entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index f84c26dcb..e70f9a309 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -95,8 +95,8 @@ static int32_t read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *hos
  * Stage the per-callable resources (kernel binaries + orchestration SO) into
  * the supplied runtime so a subsequent bind_prepared_to_runtime_impl can use
  * them. This is the cacheable half of init_runtime_impl: nothing here depends
- * on per-run argument values, so callable.md's prepare_callable / run_prepared
- * split lets us run this once per callable_id and amortize across runs.
+ * on per-run argument values, so the prepare_callable / run_prepared split
+ * lets us run this once per callable_id and amortize across runs.
  *
  * @param runtime Pointer to pre-constructed Runtime (host_api populated)
  * @param callable ChipCallable carrying the orch SO + child kernel binaries
@@ -157,9 +157,9 @@ extern "C" int prepare_callable_impl(Runtime *runtime, const ChipCallable *calla
  * callable-side state (kernel binaries, orch SO bytes, func/config names)
  * is already populated by prepare_callable_impl.
 *
- * Splitting this from prepare_callable_impl matches the callable.md design:
- * register/run_prepared invokes this every call, while the prep half runs
- * only once per callable_id.
+ * Splitting this from prepare_callable_impl matches the per-callable_id
+ * design: register/run_prepared invokes this every call, while the prep
+ * half runs only once per callable_id.
 *
 * @param runtime Pointer to pre-constructed Runtime (host_api populated)
 * @param orch_args Separated tensor/scalar arguments for this run
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 8062078cc..48e3c82b6 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -205,7 +205,7 @@ class Runtime {
   // owned by DeviceRunner; only the metadata below travels inside Runtime.
   uint64_t dev_orch_so_addr_;
   uint64_t dev_orch_so_size_;
-  // Per-callable_id dispatch (callable.md design). AICPU dispatches via
+  // Per-callable_id dispatch. AICPU dispatches via
   // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_`
   // signals whether the host is delivering a freshly-registered
   // callable_id (write+dlopen) or reusing an already-loaded one. 
@@ -266,7 +266,7 @@ class Runtime { void set_dev_orch_so(uint64_t dev_addr, uint64_t size); uint64_t get_dev_orch_so_addr() const; uint64_t get_dev_orch_so_size() const; - // Per-callable_id dispatch (callable.md). callable_id must be in + // Per-callable_id dispatch. callable_id must be in // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU // whether to (re)load the orch SO into orch_so_table_[callable_id] or // reuse the cached entry. @@ -318,8 +318,8 @@ class Runtime { const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; - // Host-orchestration staging (callable.md hbg path). Always nullptr on - // this trb variant — included for API parity with host_build_graph so the + // Host-orchestration staging (hbg path). Always nullptr on this trb + // variant — included for API parity with host_build_graph so the // shared platform layer can branch on `pending_host_dlopen_handle_ != // nullptr` at runtime instead of via a build-time macro. void *pending_host_dlopen_handle_{nullptr}; diff --git a/src/common/hierarchical/orchestrator.h b/src/common/hierarchical/orchestrator.h index a156a9000..f8abdb424 100644 --- a/src/common/hierarchical/orchestrator.h +++ b/src/common/hierarchical/orchestrator.h @@ -93,8 +93,8 @@ class Orchestrator { void copy_from(int worker_id, uint64_t dst, uint64_t src, size_t size); // Submit a NEXT_LEVEL task. `callable_id` is a cid registered via - // Worker.register() (Stage 3, callable.md): the chip child looks it up - // in its COW-inherited Python registry to get the actual ChipCallable. + // Worker.register(): the chip child looks it up in its COW-inherited + // Python registry to get the actual ChipCallable. // Tags inside `args` drive dependency inference; OUTPUT tensors with // null data are auto-allocated from the HeapRing. // `worker`: logical worker id for affinity (-1 = unconstrained). diff --git a/src/common/hierarchical/types.h b/src/common/hierarchical/types.h index 33d24cedc..f67fa6028 100644 --- a/src/common/hierarchical/types.h +++ b/src/common/hierarchical/types.h @@ -146,9 +146,9 @@ struct TaskSlotState { // --- Task data (stored on parent heap, lives until slot CONSUMED) --- WorkerType worker_type{WorkerType::NEXT_LEVEL}; // Unified callable id: NEXT_LEVEL chip callables and SUB fns share the - // same Worker.register() id space (Stage 3, callable.md). The mailbox - // wire format writes this as a uint64 with the cid in the low 32 bits; - // dispatch_process read it identically for both worker types. + // same Worker.register() id space. The mailbox wire format writes this + // as a uint64 with the cid in the low 32 bits; dispatch_process reads + // it identically for both worker types. int32_t callable_id{-1}; CallConfig config{}; // NEXT_LEVEL config (block_dim, aicpu_thread_num, diagnostics sub-features) diff --git a/src/common/task_interface/callable_protocol.h b/src/common/task_interface/callable_protocol.h index 108713910..4e3898804 100644 --- a/src/common/task_interface/callable_protocol.h +++ b/src/common/task_interface/callable_protocol.h @@ -9,7 +9,7 @@ * ----------------------------------------------------------------------------------------------------------- */ /** - * Per-callable_id protocol constants (callable.md) + * Per-callable_id protocol constants * * Single source of truth for the host↔AICPU per-callable_id dispatch protocol. 
 * Kept separate from callable.h so the AICPU side can include it without
diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp
index 70ec7591e..7e8fc72b6 100644
--- a/src/common/worker/chip_worker.cpp
+++ b/src/common/worker/chip_worker.cpp
@@ -269,11 +269,10 @@ void ChipWorker::finalize() {
 }
 
 void ChipWorker::run(uint64_t callable, TaskArgsView args, const CallConfig &config) {
-  // Stage 4 (callable.md): the hierarchical layer (worker_manager.cpp) packs
-  // the cid produced by Worker.register() into this uint64. ChipWorker
-  // treats it as such — it must already have been prepared via
-  // prepare_callable. The legacy "callable buffer ptr → run_runtime" path is
-  // gone.
+  // The hierarchical layer (worker_manager.cpp) packs the cid produced by
+  // Worker.register() into this uint64. ChipWorker treats it as such — it
+  // must already have been prepared via prepare_callable. The legacy
+  // "callable buffer ptr → run_runtime" path is gone.
   run_prepared(static_cast<int32_t>(static_cast<uint32_t>(callable)), args, config);
 }
 
diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h
index cf1d7b319..c08b1c618 100644
--- a/src/common/worker/chip_worker.h
+++ b/src/common/worker/chip_worker.h
@@ -53,12 +53,12 @@ class ChipWorker : public IWorker {
   void finalize();
 
   // IWorker: dispatch the cid `callable` (packed into uint64 by the
-  // hierarchical layer; see callable.md) by delegating to run_prepared.
-  // The cid must already have been prepared via prepare_callable.
+  // hierarchical layer) by delegating to run_prepared. The cid must
+  // already have been prepared via prepare_callable.
   void run(uint64_t callable, TaskArgsView args, const CallConfig &config) override;
 
-  // Per-callable_id preparation (callable.md design). Requires set_device()
-  // and a callable_id in [0, MAX_REGISTERED_CALLABLE_IDS) (cap 64).
+  // Per-callable_id preparation. Requires set_device() and a callable_id
+  // in [0, MAX_REGISTERED_CALLABLE_IDS) (cap 64).
   void prepare_callable(int32_t callable_id, const void *callable);
   void run_prepared(int32_t callable_id, TaskArgsView args, const CallConfig &config);
   void run_prepared(int32_t callable_id, const void *args, const CallConfig &config);
diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h
index 9e73e34c1..0ef16a13c 100644
--- a/src/common/worker/pto_runtime_c_api.h
+++ b/src/common/worker/pto_runtime_c_api.h
@@ -97,7 +97,7 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v);
 int finalize_device(DeviceContextHandle ctx);
 
 /* ===========================================================================
- * Per-callable_id preparation (callable.md design)
+ * Per-callable_id preparation
  *
  * The triplet below decouples the one-shot prep work (kernel upload + orch SO
  * H2D + caching keyed by `callable_id`) from each `run_prepared` invocation,
diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py
index 64c58ed9c..c401eb5a9 100644
--- a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py
+++ b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py
@@ -126,7 +126,7 @@ def _run_and_validate_l2(
     worker.unregister_callable(1)
 
     # ------------------------------------------------------------------
-    # host_dlopen_count assertions (callable.md §7 verification, hbg path).
+    # host_dlopen_count assertions (hbg path).
# # hbg increments host_dlopen_count on every register_prepared_callable_host_orch # invocation (i.e. each `prepare_callable` call), independent of how many diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py index 27335a6c4..a9129f601 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -131,7 +131,7 @@ def _run_and_validate_l2( worker.unregister_callable(1) # ------------------------------------------------------------------ - # aicpu_dlopen_count assertions (callable.md §7 verification). + # aicpu_dlopen_count assertions. # # The L2 worker fixture is shared across tests in this class, so the # counter can be non-zero on entry from prior tests' leftover prepared diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py index 040d929d5..f5627fa6d 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py +++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -132,7 +132,7 @@ def _run_and_validate_l2( worker.unregister_callable(1) # ------------------------------------------------------------------ - # aicpu_dlopen_count assertions (callable.md §7 verification). + # aicpu_dlopen_count assertions. # # The L2 worker fixture is shared across tests in this class, so the # counter can be non-zero on entry from prior tests' leftover prepared diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 6c525edd0..5ad49cd52 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -246,10 +246,9 @@ add_task_interface_test(test_child_memory types/test_child_memory.cpp) add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp) add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp) -# Per-callable_id orch SO file naming regression (callable.md, see -# rtStreamSynchronize 507018 root cause). Compiles the a2a3 onboard -# `create_orch_so_file` against the test source so it runs on no-hw -# runners too. +# Per-callable_id orch SO file naming regression (see rtStreamSynchronize +# 507018 root cause). Compiles the a2a3 onboard `create_orch_so_file` +# against the test source so it runs on no-hw runners too. 
 add_executable(test_orch_so_file
   common/test_orch_so_file.cpp
   ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/onboard/aicpu/orch_so_file.cpp
diff --git a/tests/ut/cpp/common/test_orch_so_file.cpp b/tests/ut/cpp/common/test_orch_so_file.cpp
index 078e422b9..6e1b32bd6 100644
--- a/tests/ut/cpp/common/test_orch_so_file.cpp
+++ b/tests/ut/cpp/common/test_orch_so_file.cpp
@@ -12,8 +12,8 @@
 //
 // The onboard variants of `create_orch_so_file` (src/{a2a3,a5}/platform/
 // onboard/aicpu/orch_so_file.cpp) historically used pid-only naming, which
-// silently broke once Stage 4 (callable.md) introduced multi-callable
-// dispatch on the same device process: the second cid's `O_TRUNC` open
+// silently broke once multi-callable dispatch was introduced on the same
+// device process: the second cid's `O_TRUNC` open
 // shredded the first cid's already-dlopen'd SO image and the next launch
 // on cid=0 SIGBUS'd inside the AICPU executor (manifesting as
 // `rtStreamSynchronize (AICPU) failed: 507018` on the host).

From aa38eb7953a76b29166c2934701b52ef685c6c9a Mon Sep 17 00:00:00 2001
From: poursoul
Date: Sat, 9 May 2026 17:39:16 +0800
Subject: [PATCH 28/28] fix(pr): align stragglers with cid API contract for #710

Address four review findings on the callable_id refactor:

- scene_test.py: L2 _create_standalone_worker returns (worker, {}, {})
  to match the 3-tuple unpacking used by the L3 path; standalone L2
  runners no longer fail with ValueError.
- sdma_async_completion_demo: register the ChipCallable before init()
  and submit_next_level(chip_cid, ...). A raw ChipCallable is rejected
  by both the register-after-init guard and Orchestrator._require_cid.
- prepared_callable ST: each of the 4 test classes now owns an isolated
  L2 Worker via a directory-local conftest.py override so the cid table
  is empty on entry; cids 0/1 are renamed _CID_PRIMARY/_CID_SECONDARY
  to make the white-box intent explicit, and a stale comment claiming
  that unregister decrements the dlopen counter is removed.
- Docs: worker.py module docstring, docs/getting-started.md, and the
  L2/L3 example READMEs all show the full register -> cid -> run /
  submit_next_level pattern, including the must-register-before-init()
  rule for L>=3.
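
For reviewers, a minimal sketch of the contract these stragglers now
follow (illustrative only: `chip_callable`, `chip_args`, and the
CallConfig import are elided, and the L3 parameters are borrowed from
the multi_chip_dispatch README):

    from simpler.worker import Worker

    w = Worker(level=3, device_ids=[0, 1], num_sub_workers=1,
               platform="a2a3", runtime="tensormap_and_ringbuffer")
    chip_cid = w.register(chip_callable)  # must precede init() at L>=3
    w.init()                              # children COW-inherit the registry

    def orch_fn(orch, _args, cfg):
        # submit takes the int cid, never the raw ChipCallable
        orch.submit_next_level(chip_cid, chip_args, cfg, worker=0)

    w.run(orch_fn, args=None, config=CallConfig())
    w.close()

scene_test.py's standalone-worker helper is now unpacked the same way
at both levels:

    worker, sub_ids, chip_cids = _create_standalone_worker(
        group, level, args, selected_by_cls)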
--- docs/getting-started.md | 11 ++- .../test_sdma_async_completion_demo.py | 3 +- examples/workers/l2/README.md | 11 ++- .../workers/l3/multi_chip_dispatch/README.md | 18 ++-- python/simpler/worker.py | 16 +++- simpler_setup/scene_test.py | 6 +- .../prepared_callable/conftest.py | 61 +++++++++++++ .../test_prepared_callable.py | 74 ++++++++------- .../prepared_callable/conftest.py | 61 +++++++++++++ .../test_prepared_callable.py | 91 +++++++++++-------- .../prepared_callable/conftest.py | 61 +++++++++++++ .../test_prepared_callable.py | 62 ++++++++----- .../prepared_callable/conftest.py | 61 +++++++++++++ .../test_prepared_callable.py | 91 +++++++++++-------- 14 files changed, 474 insertions(+), 153 deletions(-) create mode 100644 tests/st/a2a3/host_build_graph/prepared_callable/conftest.py create mode 100644 tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/conftest.py create mode 100644 tests/st/a5/host_build_graph/prepared_callable/conftest.py create mode 100644 tests/st/a5/tensormap_and_ringbuffer/prepared_callable/conftest.py diff --git a/docs/getting-started.md b/docs/getting-started.md index a7232d2bf..14a1d3a3f 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -166,14 +166,21 @@ worker.init(host_path=str(binaries.host_path), aicore_path=str(binaries.aicore_path)) worker.set_device(device_id=0) -# Execute callable on device -worker.run(chip_callable, orch_args, block_dim=24) +# Register the ChipCallable to obtain a callable_id +cid = worker.register(chip_callable) + +# Execute the registered callable on device +worker.run(cid, orch_args, block_dim=24) # Cleanup worker.reset_device() worker.finalize() ``` +`ChipWorker` follows the same `register → run(cid)` contract as +`Worker(level=2)`; reach for the high-level `Worker` first and use +`ChipWorker` only when a low-level handle is required. + ## Configuration ### Compile-time Configuration (Runtime Limits) diff --git a/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py b/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py index fd370712d..5033e0a95 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py +++ b/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py @@ -172,6 +172,7 @@ def run( chip_bootstrap_configs=cfgs, build=build, ) + chip_cid = worker.register(chip_callable) try: worker.init() contexts: list[ChipContext] = worker.chip_contexts @@ -191,7 +192,7 @@ def orch_fn(orch, _args, cfg): args.add_tensor(make_tensor_arg(out[rank]), TensorArgType.OUTPUT_EXISTING) args.add_tensor(make_tensor_arg(result[rank]), TensorArgType.OUTPUT_EXISTING) args.add_scalar(ctx.device_ctx) - orch.submit_next_level(chip_callable, args, cfg, worker=rank) + orch.submit_next_level(chip_cid, args, cfg, worker=rank) worker.run(orch_fn, args=None, config=CallConfig()) diff --git a/examples/workers/l2/README.md b/examples/workers/l2/README.md index 242ca926f..fb6f29251 100644 --- a/examples/workers/l2/README.md +++ b/examples/workers/l2/README.md @@ -23,12 +23,19 @@ worker = Worker( ) worker.init() # load host.so + aicpu.so + aicore.o, set device try: - # ... allocate device buffers, build ChipCallable, run ... - worker.run(chip_callable, task_args, call_config) + # ... allocate device buffers, build ChipCallable ... 
+ cid = worker.register(chip_callable) # one-shot: cid is reused across runs + worker.run(cid, task_args, call_config) finally: worker.close() # release ACL resources and device ``` +`register()` is the only way to obtain a `cid`; `worker.run` always takes +that int, never the raw `ChipCallable`. A cid stays valid for the +lifetime of the worker, so you register once and reuse it across runs — +this is also why ST cases cache the cid on the test class (see +`_st_l2_cid` in `simpler_setup/scene_test.py`). + The `try/finally` is important — if anything between `init()` and `close()` raises, you still want the device released. The [L2 conftest leak issue](https://github.com/hw-native-sys/simpler/issues/604) diff --git a/examples/workers/l3/multi_chip_dispatch/README.md b/examples/workers/l3/multi_chip_dispatch/README.md index 83ba2f889..7994dcaf5 100644 --- a/examples/workers/l3/multi_chip_dispatch/README.md +++ b/examples/workers/l3/multi_chip_dispatch/README.md @@ -10,9 +10,10 @@ chip outputs. The smallest correct L3 program. | ------- | ------------------------------ | | Shared-memory tensors | `torch.randn(...).share_memory_()` — chip children see the same storage | | `TensorArgType` tags | `INPUT` / `OUTPUT_EXISTING` drive DAG dependency tracking | -| Python SubWorker | `worker.register(fn)` **before** `init()` | +| ChipCallable id | `chip_cid = worker.register(chip_callable)` **before** `init()` | +| Python SubWorker | `sub_cid = worker.register(fn)` **before** `init()` | | `Worker(level=3)` config | `device_ids=[0, 1]`, `num_sub_workers=1` | -| Orchestration | `orch.submit_next_level(...)` per chip + `orch.submit_sub(cid, args)` | +| Orchestration | `orch.submit_next_level(chip_cid, ...)` per chip + `orch.submit_sub(sub_cid, args)` | ## Layout @@ -66,7 +67,8 @@ host_b = [torch.randn(...).share_memory_() for _ in device_ids] host_out = [torch.zeros(...).share_memory_() for _ in device_ids] def subworker(sub_args): ... -sub_cid = worker.register(subworker) # BEFORE init() — see below +chip_cid = worker.register(chip_callable) # ChipCallable: BEFORE init() +sub_cid = worker.register(subworker) # Python SubWorker: BEFORE init() ``` `share_memory_()` moves the tensor's storage to a `mmap` region. After @@ -74,9 +76,11 @@ sub_cid = worker.register(subworker) # BEFORE init() — see below address, so when the kernel writes to `host_out[i]`, the parent's tensor sees it immediately. No explicit copy back. -**`register()` MUST come before `init()`**. `init()` forks child processes; -the registry is captured by copy-on-write. Anything registered after `init()` -is invisible to the forked children. +**`register()` MUST come before `init()`** for *every* callable — both +the `ChipCallable` dispatched to chips and the Python sub functions. +`init()` forks child processes; the registry is captured by copy-on-write. +Anything registered after `init()` is invisible to the forked children, +and `Worker.register()` at L≥3 raises if called post-init. ### 2. 
`init()` — fork + C++ scheduler @@ -93,7 +97,7 @@ def orch_fn(orch, _args, cfg): chip_args.add_tensor(make_tensor_arg(host_a[i]), TensorArgType.INPUT) chip_args.add_tensor(make_tensor_arg(host_b[i]), TensorArgType.INPUT) chip_args.add_tensor(make_tensor_arg(host_out[i]), TensorArgType.OUTPUT_EXISTING) - orch.submit_next_level(chip_callable, chip_args, cfg, worker=i) + orch.submit_next_level(chip_cid, chip_args, cfg, worker=i) sub_args = TaskArgs() for i in range(len(device_ids)): diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 29f65fb65..8fc2861ac 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -8,23 +8,31 @@ # ----------------------------------------------------------------------------------------------------------- """Worker — unified factory for all hierarchy levels. +Callable identity is a ``cid`` (int), allocated exclusively by +``Worker.register(callable)``. ``Worker.run`` and the orchestrator's +``submit_next_level`` / ``submit_sub`` all take this cid — never the raw +``ChipCallable`` / Python function. L≥3 ``register()`` must run **before** +``init()`` so forked chip / sub children inherit the registry via COW. + Usage:: # L2: one NPU chip w = Worker(level=2, device_id=8, platform="a2a3", runtime="tensormap_and_ringbuffer") w.init() - w.run(chip_callable, chip_args, config) + chip_cid = w.register(chip_callable) # L2 may register pre or post init() + w.run(chip_cid, chip_args, config) w.close() # L3: multiple chips + SubWorkers, auto-discovery in init() w = Worker(level=3, device_ids=[8, 9], num_sub_workers=2, platform="a2a3", runtime="tensormap_and_ringbuffer") - cid = w.register(lambda args: postprocess()) + chip_cid = w.register(chip_callable) # ChipCallable, before init() + sub_cid = w.register(lambda args: postprocess()) # Python sub, before init() w.init() def my_orch(orch, args, cfg): - r = orch.submit_next_level(chip_callable, chip_args_ptr, cfg) - orch.submit_sub(cid, sub_args) + r = orch.submit_next_level(chip_cid, chip_args_ptr, cfg) + orch.submit_sub(sub_cid, sub_args) w.run(my_orch, my_args, my_config) w.close() diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index 2d13282c9..9241bae92 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -1578,11 +1578,15 @@ def _create_standalone_worker(group, level, args, selected_by_cls): ``max_sub_workers`` must be computed from these, not from ``cls.CASES``: otherwise a manual case with a larger ``device_count`` inflates the allocation even when it isn't scheduled. + + Returns ``(worker, per_class_sub_ids, per_class_chip_cids)`` for both + L2 and L3 so the caller can unpack uniformly. L2 has neither sub + callables nor pre-registered chip callables, so both dicts are empty. """ first_cls = group[0] build = getattr(args, "build", False) if level == 2: - return first_cls._create_worker(args.platform, args.device, build=build), {} + return first_cls._create_worker(args.platform, args.device, build=build), {}, {} from simpler.worker import Worker # noqa: PLC0415 diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/conftest.py b/tests/st/a2a3/host_build_graph/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a2a3/host_build_graph/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. +""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py index c401eb5a9..00a658cc6 100644 --- a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py +++ b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py @@ -25,10 +25,22 @@ _VECTOR_KERNELS = "../vector_example/kernels" +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + @scene_test(level=2, runtime="host_build_graph") class TestPreparedCallableHbg(SceneTestCase): - """Exercise prepare_callable / run_prepared / unregister_callable on hbg.""" + """Exercise prepare_callable / run_prepared / unregister_callable on hbg. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. 
+ """ CALLABLE = { "orchestration": { @@ -102,8 +114,8 @@ def _run_and_validate_l2( config = self._build_config(config_dict) - worker.prepare_callable(0, callable_obj) - worker.prepare_callable(1, callable_obj) + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) for _ in range(2): test_args = self.generate_args(params) @@ -111,7 +123,7 @@ def _run_and_validate_l2( golden_args = test_args.clone() self.compute_golden(golden_args, params) - worker.run_prepared(0, chip_args, config=config) + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) test_args = self.generate_args(params) @@ -119,11 +131,11 @@ def _run_and_validate_l2( golden_args = test_args.clone() self.compute_golden(golden_args, params) - worker.run_prepared(1, chip_args, config=config) + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) - worker.unregister_callable(0) - worker.unregister_callable(1) + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) # ------------------------------------------------------------------ # host_dlopen_count assertions (hbg path). @@ -151,54 +163,54 @@ def _run_one(self, worker, cid, callable_obj, config, case): _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): - """prepare(0) + run(0) × 5 → host_dlopen delta == 1, aicpu == 0.""" + """prepare(primary) + run × 5 → host_dlopen delta == 1, aicpu == 0.""" callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) baseline = st_worker.host_dlopen_count baseline_aicpu = st_worker.aicpu_dlopen_count try: - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) for _ in range(5): - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.host_dlopen_count - baseline == 1, ( - f"expected exactly 1 new host dlopen for 5 runs of cid=0, " + f"expected exactly 1 new host dlopen for 5 runs of primary cid, " f"got delta {st_worker.host_dlopen_count - baseline}" ) assert st_worker.aicpu_dlopen_count == baseline_aicpu, "hbg must not trigger any AICPU orch SO dlopens" finally: - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): - """prepare(0)+prepare(1) + (run(0),run(1)) × 5 → host_dlopen delta == 2.""" + """prepare(primary)+prepare(secondary) + alternating runs × 5 → host_dlopen delta == 2.""" callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) baseline = st_worker.host_dlopen_count baseline_aicpu = st_worker.aicpu_dlopen_count try: - st_worker.prepare_callable(0, callable_obj) - st_worker.prepare_callable(1, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) for _ in range(5): - self._run_one(st_worker, 0, callable_obj, config, case) - self._run_one(st_worker, 1, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, config, case) assert st_worker.host_dlopen_count - baseline == 2, ( - f"expected exactly 2 new host dlopens for cids {{0,1}} 
interleaved, " + f"expected exactly 2 new host dlopens for two cids interleaved, " f"got delta {st_worker.host_dlopen_count - baseline}" ) assert st_worker.aicpu_dlopen_count == baseline_aicpu finally: - st_worker.unregister_callable(0) - st_worker.unregister_callable(1) + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): - """prepare(0) twice → second call raises RuntimeError.""" + """prepare(primary) twice → second call raises RuntimeError.""" callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) try: - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) with pytest.raises(RuntimeError): - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) finally: - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): - """prepare(0)+run(0)+unregister(0)+prepare(0)+run(0) → host_dlopen delta == 2. + """prepare+run+unregister+prepare+run on the same cid → host_dlopen delta == 2. Counter is monotonic — re-prepare always counts a fresh dlopen. """ @@ -206,26 +218,26 @@ def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): baseline = st_worker.host_dlopen_count registered = False try: - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) registered = True - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.host_dlopen_count - baseline == 1 - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) registered = False after_unreg = st_worker.host_dlopen_count assert after_unreg - baseline == 1, ( f"unregister must NOT decrement the host dlopen counter; baseline={baseline}, after_unreg={after_unreg}" ) - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) registered = True - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.host_dlopen_count - baseline == 2, ( f"after re-prepare expected counter +2 (two distinct host dlopens), " f"got delta {st_worker.host_dlopen_count - baseline}" ) finally: if registered: - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) if __name__ == "__main__": diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/conftest.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. +""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py index a9129f601..62ced849b 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -28,10 +28,22 @@ _VECTOR_KERNELS = "../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels" +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + @scene_test(level=2, runtime="tensormap_and_ringbuffer") class TestPreparedCallable(SceneTestCase): - """Exercise prepare_callable / run_prepared / unregister_callable ABI.""" + """Exercise prepare_callable / run_prepared / unregister_callable ABI. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. 
+ """ CALLABLE = { "orchestration": { @@ -104,42 +116,41 @@ def _run_and_validate_l2( config = self._build_config(config_dict) # 1) prepare two callable_ids with the SAME callable (shared orch SO) - worker.prepare_callable(0, callable_obj) - worker.prepare_callable(1, callable_obj) + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) - # 2) run_prepared cid=0 twice (second run proves dedup/cache hit) + # 2) run_prepared primary cid twice (second run proves dedup/cache hit) for _ in range(2): test_args = self.generate_args(params) chip_args, output_names = _build_chip_task_args(test_args, orch_sig) golden_args = test_args.clone() self.compute_golden(golden_args, params) - worker.run_prepared(0, chip_args, config=config) + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) - # 3) run_prepared cid=1 — different slot, same SO, must also work + # 3) run_prepared secondary cid — different slot, same SO, must also work test_args = self.generate_args(params) chip_args, output_names = _build_chip_task_args(test_args, orch_sig) golden_args = test_args.clone() self.compute_golden(golden_args, params) - worker.run_prepared(1, chip_args, config=config) + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) # 4) unregister both — should not raise - worker.unregister_callable(0) - worker.unregister_callable(1) + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) # ------------------------------------------------------------------ # aicpu_dlopen_count assertions. # - # The L2 worker fixture is shared across tests in this class, so the - # counter can be non-zero on entry from prior tests' leftover prepared - # callables (or from this test class's own test_run). Each test below - # snapshots the counter on entry, asserts the *delta* introduced by the - # scenario, then unregisters everything it staged so the next test sees - # the same baseline (unregister_callable erases the cid, decrementing - # the counter). + # The class-scope L2 worker is shared across test methods in this + # class (see ./conftest.py), so the counter can be non-zero on entry + # from prior methods. Each test below snapshots the counter on entry, + # asserts the *delta* introduced by the scenario, then unregisters + # everything it staged. unregister_callable does NOT decrement the + # counter (the counter is monotonic — see test_dlopen_count_unregister_re_prepare). 
# ------------------------------------------------------------------ def _setup_dlopen_count_test(self, st_worker, st_platform): @@ -160,50 +171,50 @@ def _run_one(self, worker, cid, callable_obj, config, case): _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): - """Case A: prepare(0) + run(0) × 5 → dlopen_count delta == 1.""" + """Case A: prepare(primary) + run × 5 → dlopen_count delta == 1.""" callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) baseline = st_worker.aicpu_dlopen_count try: - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) for _ in range(5): - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.aicpu_dlopen_count - baseline == 1, ( - f"expected exactly 1 new dlopen for 5 runs of cid=0, " + f"expected exactly 1 new dlopen for 5 runs of primary cid, " f"got delta {st_worker.aicpu_dlopen_count - baseline}" ) finally: - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): - """Case B: prepare(0)+prepare(1) + (run(0),run(1)) × 5 → delta == 2.""" + """Case B: prepare(primary)+prepare(secondary) + alternating runs × 5 → delta == 2.""" callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) baseline = st_worker.aicpu_dlopen_count try: - st_worker.prepare_callable(0, callable_obj) - st_worker.prepare_callable(1, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) for _ in range(5): - self._run_one(st_worker, 0, callable_obj, config, case) - self._run_one(st_worker, 1, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, config, case) assert st_worker.aicpu_dlopen_count - baseline == 2, ( - f"expected exactly 2 new dlopens for cids {{0,1}} interleaved, " + f"expected exactly 2 new dlopens for two cids interleaved, " f"got delta {st_worker.aicpu_dlopen_count - baseline}" ) finally: - st_worker.unregister_callable(0) - st_worker.unregister_callable(1) + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): - """Case C: prepare(0) + prepare(0) → second call raises RuntimeError.""" + """Case C: prepare(primary) + prepare(primary) → second call raises RuntimeError.""" callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) try: - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) with pytest.raises(RuntimeError): - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) finally: - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): - """Case D: prepare(0)+run(0)+unregister(0)+prepare(0)+run(0) → delta == 2. + """Case D: prepare+run+unregister+prepare+run on the same cid → delta == 2. 
unregister erases the cid from aicpu_seen_callable_ids_, so the second prepare/run pair sets register_new_callable_id_ again and the AICPU @@ -214,26 +225,26 @@ def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): baseline = st_worker.aicpu_dlopen_count registered = False try: - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) registered = True - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.aicpu_dlopen_count - baseline == 1 - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) registered = False after_unreg = st_worker.aicpu_dlopen_count assert after_unreg - baseline == 1, ( f"unregister must NOT decrement the dlopen counter; baseline={baseline}, after_unreg={after_unreg}" ) - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) registered = True - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.aicpu_dlopen_count - baseline == 2, ( f"after re-prepare expected counter +2 (two distinct AICPU dlopens), " f"got delta {st_worker.aicpu_dlopen_count - baseline}" ) finally: if registered: - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) if __name__ == "__main__": diff --git a/tests/st/a5/host_build_graph/prepared_callable/conftest.py b/tests/st/a5/host_build_graph/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a5/host_build_graph/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. 
+""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py index 92ba39f50..1efd00806 100644 --- a/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py +++ b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py @@ -21,10 +21,22 @@ from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + @scene_test(level=2, runtime="host_build_graph") class TestPreparedCallableHbgA5(SceneTestCase): - """Exercise prepare_callable / run_prepared / unregister_callable on a5/hbg.""" + """Exercise prepare_callable / run_prepared / unregister_callable on a5/hbg. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. 
+ """ CALLABLE = { "orchestration": { @@ -91,8 +103,8 @@ def _run_and_validate_l2( config = self._build_config(config_dict) - worker.prepare_callable(0, callable_obj) - worker.prepare_callable(1, callable_obj) + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) for _ in range(2): test_args = self.generate_args(params) @@ -100,7 +112,7 @@ def _run_and_validate_l2( golden_args = test_args.clone() self.compute_golden(golden_args, params) - worker.run_prepared(0, chip_args, config=config) + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) test_args = self.generate_args(params) @@ -108,11 +120,11 @@ def _run_and_validate_l2( golden_args = test_args.clone() self.compute_golden(golden_args, params) - worker.run_prepared(1, chip_args, config=config) + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) - worker.unregister_callable(0) - worker.unregister_callable(1) + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) def _setup_dlopen_count_test(self, st_worker, st_platform): case = self.CASES[0] @@ -135,58 +147,58 @@ def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): baseline = st_worker.host_dlopen_count baseline_aicpu = st_worker.aicpu_dlopen_count try: - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) for _ in range(5): - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.host_dlopen_count - baseline == 1 assert st_worker.aicpu_dlopen_count == baseline_aicpu finally: - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) baseline = st_worker.host_dlopen_count baseline_aicpu = st_worker.aicpu_dlopen_count try: - st_worker.prepare_callable(0, callable_obj) - st_worker.prepare_callable(1, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) for _ in range(5): - self._run_one(st_worker, 0, callable_obj, config, case) - self._run_one(st_worker, 1, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, config, case) assert st_worker.host_dlopen_count - baseline == 2 assert st_worker.aicpu_dlopen_count == baseline_aicpu finally: - st_worker.unregister_callable(0) - st_worker.unregister_callable(1) + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) try: - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) with pytest.raises(RuntimeError): - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) finally: - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): callable_obj, config, case = self._setup_dlopen_count_test(st_worker, 
st_platform) baseline = st_worker.host_dlopen_count registered = False try: - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) registered = True - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.host_dlopen_count - baseline == 1 - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) registered = False assert st_worker.host_dlopen_count - baseline == 1, "unregister must NOT decrement the host dlopen counter" - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) registered = True - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.host_dlopen_count - baseline == 2 finally: if registered: - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) if __name__ == "__main__": diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/conftest.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. 
+""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py index f5627fa6d..a8a7cedf2 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py +++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -29,10 +29,22 @@ _VECTOR_KERNELS = "../../../../../examples/a5/tensormap_and_ringbuffer/vector_example/kernels" +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + @scene_test(level=2, runtime="tensormap_and_ringbuffer") class TestPreparedCallable(SceneTestCase): - """Exercise prepare_callable / run_prepared / unregister_callable ABI on a5/trb.""" + """Exercise prepare_callable / run_prepared / unregister_callable ABI on a5/trb. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. 
+ """ CALLABLE = { "orchestration": { @@ -105,42 +117,41 @@ def _run_and_validate_l2( config = self._build_config(config_dict) # 1) prepare two callable_ids with the SAME callable (shared orch SO) - worker.prepare_callable(0, callable_obj) - worker.prepare_callable(1, callable_obj) + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) - # 2) run_prepared cid=0 twice (second run proves dedup/cache hit) + # 2) run_prepared primary cid twice (second run proves dedup/cache hit) for _ in range(2): test_args = self.generate_args(params) chip_args, output_names = _build_chip_task_args(test_args, orch_sig) golden_args = test_args.clone() self.compute_golden(golden_args, params) - worker.run_prepared(0, chip_args, config=config) + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) - # 3) run_prepared cid=1 — different slot, same SO, must also work + # 3) run_prepared secondary cid — different slot, same SO, must also work test_args = self.generate_args(params) chip_args, output_names = _build_chip_task_args(test_args, orch_sig) golden_args = test_args.clone() self.compute_golden(golden_args, params) - worker.run_prepared(1, chip_args, config=config) + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) # 4) unregister both — should not raise - worker.unregister_callable(0) - worker.unregister_callable(1) + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) # ------------------------------------------------------------------ # aicpu_dlopen_count assertions. # - # The L2 worker fixture is shared across tests in this class, so the - # counter can be non-zero on entry from prior tests' leftover prepared - # callables (or from this test class's own test_run). Each test below - # snapshots the counter on entry, asserts the *delta* introduced by the - # scenario, then unregisters everything it staged so the next test sees - # the same baseline (unregister_callable erases the cid, decrementing - # the counter). + # The class-scope L2 worker is shared across test methods in this + # class (see ./conftest.py), so the counter can be non-zero on entry + # from prior methods. Each test below snapshots the counter on entry, + # asserts the *delta* introduced by the scenario, then unregisters + # everything it staged. unregister_callable does NOT decrement the + # counter (the counter is monotonic — see test_dlopen_count_unregister_re_prepare). 
     # ------------------------------------------------------------------
 
     def _setup_dlopen_count_test(self, st_worker, st_platform):
@@ -161,50 +172,50 @@ def _run_one(self, worker, cid, callable_obj, config, case):
         _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL)
 
     def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker):
-        """Case A: prepare(0) + run(0) × 5 → dlopen_count delta == 1."""
+        """Case A: prepare(primary) + run × 5 → dlopen_count delta == 1."""
         callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform)
         baseline = st_worker.aicpu_dlopen_count
         try:
-            st_worker.prepare_callable(0, callable_obj)
+            st_worker.prepare_callable(_CID_PRIMARY, callable_obj)
             for _ in range(5):
-                self._run_one(st_worker, 0, callable_obj, config, case)
+                self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case)
             assert st_worker.aicpu_dlopen_count - baseline == 1, (
-                f"expected exactly 1 new dlopen for 5 runs of cid=0, "
+                f"expected exactly 1 new dlopen for 5 runs of the primary cid, "
                 f"got delta {st_worker.aicpu_dlopen_count - baseline}"
             )
         finally:
-            st_worker.unregister_callable(0)
+            st_worker.unregister_callable(_CID_PRIMARY)
 
     def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker):
-        """Case B: prepare(0)+prepare(1) + (run(0),run(1)) × 5 → delta == 2."""
+        """Case B: prepare(primary)+prepare(secondary) + alternating runs × 5 → delta == 2."""
         callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform)
         baseline = st_worker.aicpu_dlopen_count
         try:
-            st_worker.prepare_callable(0, callable_obj)
-            st_worker.prepare_callable(1, callable_obj)
+            st_worker.prepare_callable(_CID_PRIMARY, callable_obj)
+            st_worker.prepare_callable(_CID_SECONDARY, callable_obj)
             for _ in range(5):
-                self._run_one(st_worker, 0, callable_obj, config, case)
-                self._run_one(st_worker, 1, callable_obj, config, case)
+                self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case)
+                self._run_one(st_worker, _CID_SECONDARY, callable_obj, config, case)
             assert st_worker.aicpu_dlopen_count - baseline == 2, (
-                f"expected exactly 2 new dlopens for cids {{0,1}} interleaved, "
+                f"expected exactly 2 new dlopens for two interleaved cids, "
                 f"got delta {st_worker.aicpu_dlopen_count - baseline}"
             )
         finally:
-            st_worker.unregister_callable(0)
-            st_worker.unregister_callable(1)
+            st_worker.unregister_callable(_CID_PRIMARY)
+            st_worker.unregister_callable(_CID_SECONDARY)
 
     def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker):
-        """Case C: prepare(0) + prepare(0) → second call raises RuntimeError."""
+        """Case C: prepare(primary) + prepare(primary) → second call raises RuntimeError."""
         callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform)
         try:
-            st_worker.prepare_callable(0, callable_obj)
+            st_worker.prepare_callable(_CID_PRIMARY, callable_obj)
             with pytest.raises(RuntimeError):
-                st_worker.prepare_callable(0, callable_obj)
+                st_worker.prepare_callable(_CID_PRIMARY, callable_obj)
         finally:
-            st_worker.unregister_callable(0)
+            st_worker.unregister_callable(_CID_PRIMARY)
 
     def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker):
-        """Case D: prepare(0)+run(0)+unregister(0)+prepare(0)+run(0) → delta == 2.
+        """Case D: prepare+run+unregister+prepare+run on the same cid → delta == 2.
unregister erases the cid from aicpu_seen_callable_ids_, so the second prepare/run pair sets register_new_callable_id_ again and the AICPU @@ -215,26 +226,26 @@ def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): baseline = st_worker.aicpu_dlopen_count registered = False try: - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) registered = True - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.aicpu_dlopen_count - baseline == 1 - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) registered = False after_unreg = st_worker.aicpu_dlopen_count assert after_unreg - baseline == 1, ( f"unregister must NOT decrement the dlopen counter; baseline={baseline}, after_unreg={after_unreg}" ) - st_worker.prepare_callable(0, callable_obj) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) registered = True - self._run_one(st_worker, 0, callable_obj, config, case) + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) assert st_worker.aicpu_dlopen_count - baseline == 2, ( f"after re-prepare expected counter +2 (two distinct AICPU dlopens), " f"got delta {st_worker.aicpu_dlopen_count - baseline}" ) finally: if registered: - st_worker.unregister_callable(0) + st_worker.unregister_callable(_CID_PRIMARY) if __name__ == "__main__":
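
Reviewer note on the four dlopen-count cases: the counter contract they pin
down is small enough to state as executable Python. The sketch below is a
model only; `FakeWorker`, its single-argument `prepare_callable`, and the
`_registered`/`_loaded` sets are inventions of this note (the real Worker
also takes the callable object, and the dlopens happen on the AICPU side),
but the arithmetic matches Cases A-D.

    class FakeWorker:
        """Toy model of the prepared-callable dlopen-count contract."""

        def __init__(self):
            self.aicpu_dlopen_count = 0  # monotonic: never decremented
            self._registered = set()     # live cids (models aicpu_seen_callable_ids_)
            self._loaded = set()         # cids already dlopen'd for the current registration

        def prepare_callable(self, cid):
            if cid in self._registered:
                raise RuntimeError(f"cid {cid} already prepared")  # Case C
            self._registered.add(cid)
            self._loaded.discard(cid)    # fresh registration: next run must dlopen

        def run_prepared(self, cid):
            if cid not in self._registered:
                raise RuntimeError(f"cid {cid} not prepared")
            if cid not in self._loaded:  # first run after (re-)prepare pays one dlopen
                self.aicpu_dlopen_count += 1
                self._loaded.add(cid)

        def unregister_callable(self, cid):
            self._registered.discard(cid)  # erases the cid; counter untouched


    w = FakeWorker()

    # Case A: prepare + 5 runs of one cid → delta 1
    base = w.aicpu_dlopen_count
    w.prepare_callable(0)
    for _ in range(5):
        w.run_prepared(0)
    assert w.aicpu_dlopen_count - base == 1
    w.unregister_callable(0)

    # Case B: two cids, alternating runs → delta 2
    base = w.aicpu_dlopen_count
    w.prepare_callable(0)
    w.prepare_callable(1)
    for _ in range(5):
        w.run_prepared(0)
        w.run_prepared(1)
    assert w.aicpu_dlopen_count - base == 2
    w.unregister_callable(0)
    w.unregister_callable(1)

    # Case C: double prepare of a live cid raises
    w.prepare_callable(0)
    try:
        w.prepare_callable(0)
        raise AssertionError("second prepare should have raised")
    except RuntimeError:
        pass
    w.unregister_callable(0)

    # Case D: prepare+run+unregister+prepare+run on one cid → delta 2
    base = w.aicpu_dlopen_count
    w.prepare_callable(0)
    w.run_prepared(0)
    w.unregister_callable(0)
    w.prepare_callable(0)
    w.run_prepared(0)
    assert w.aicpu_dlopen_count - base == 2

The deliberate choice in this model is that unregister only frees the slot:
a re-registered cid pays for a fresh dlopen rather than resurrecting the old
handle, which is exactly the monotonic-counter behaviour Case D asserts.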