diff --git a/conftest.py b/conftest.py
index bde0b7800..5d029caaf 100644
--- a/conftest.py
+++ b/conftest.py
@@ -920,11 +920,22 @@ def st_worker(request, st_platform, device_pool, _l2_worker_pool):
     # Register SubCallable entries from cls.CALLABLE
     sub_ids = {}
+    chip_cids = {}
     for entry in cls.CALLABLE.get("callables", []):
         if "callable" in entry:
             cid = w.register(entry["callable"])
             sub_ids[entry["name"]] = cid
+        elif "orchestration" in entry:
+            from simpler_setup.scene_test import _compile_chip_callable_from_spec  # noqa: PLC0415
+
+            name = entry["name"]
+            cache_key = (cls.__qualname__, name, st_platform, runtime)
+            chip = _compile_chip_callable_from_spec(entry, st_platform, runtime, cache_key)
+            cid = w.register(chip)
+            chip_cids[name] = cid
+            chip_cids[f"{name}_sig"] = entry["orchestration"].get("signature", [])
     cls._st_sub_ids = sub_ids
+    cls._st_chip_cids = chip_cids
     w.init()
     yield w
diff --git a/docs/getting-started.md b/docs/getting-started.md
index a7232d2bf..14a1d3a3f 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -166,14 +166,21 @@
 worker.init(host_path=str(binaries.host_path),
             aicore_path=str(binaries.aicore_path))
 worker.set_device(device_id=0)
 
-# Execute callable on device
-worker.run(chip_callable, orch_args, block_dim=24)
+# Register the ChipCallable to obtain a callable_id
+cid = worker.register(chip_callable)
+
+# Execute the registered callable on device
+worker.run(cid, orch_args, block_dim=24)
 
 # Cleanup
 worker.reset_device()
 worker.finalize()
 ```
 
+`ChipWorker` follows the same `register → run(cid)` contract as
+`Worker(level=2)`; reach for the high-level `Worker` first and use
+`ChipWorker` only when a low-level handle is required.
+
 ## Configuration
 
 ### Compile-time Configuration (Runtime Limits)
diff --git a/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py b/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
index 9977a3a4b..7461f1b7e 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
@@ -137,6 +137,7 @@ def run(
         chip_bootstrap_configs=cfgs,
         build=build,
     )
+    chip_cid = worker.register(chip_callable)
     try:
         worker.init()
         contexts: list[ChipContext] = worker.chip_contexts
@@ -157,7 +158,7 @@ def orch_fn(orch, _args, cfg):
                     TensorArgType.INPUT,
                 )
                 args.add_scalar(ctx.device_ctx)
-                orch.submit_next_level(chip_callable, args, cfg, worker=rank)
+                orch.submit_next_level(chip_cid, args, cfg, worker=rank)
 
         worker.run(orch_fn, args=None, config=CallConfig())
diff --git a/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py b/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
index 6045efe4d..31cd3c479 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
@@ -159,6 +159,7 @@ def run(
         chip_bootstrap_configs=cfgs,
         build=build,
     )
+    chip_cid = worker.register(chip_callable)
     try:
         worker.init()
         contexts: list[ChipContext] = worker.chip_contexts
@@ -187,7 +188,7 @@ def orch_fn(orch, _args, cfg):
                     TensorArgType.INPUT,
                 )
                 args.add_scalar(ctx.device_ctx)
-                orch.submit_next_level(chip_callable, args, cfg, worker=rank)
+                orch.submit_next_level(chip_cid, args, cfg, worker=rank)
 
         worker.run(orch_fn, args=None, config=CallConfig())
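Every demo touched above follows the same two-line migration: register the `ChipCallable` before `init()`, then pass the returned cid to `submit_next_level` inside the orch fn. A condensed sketch of that pattern, assuming the `Worker` / `TaskArgs` API from `python/simpler` in this patch; `chip_callable` and `host_inputs` are stand-ins, not code from any one demo:

```python
from simpler.task_interface import CallConfig, TaskArgs, TensorArgType, make_tensor_arg
from simpler.worker import Worker

worker = Worker(level=3, device_ids=[0, 1], num_sub_workers=0,
                platform="a2a3", runtime="tensormap_and_ringbuffer")
chip_cid = worker.register(chip_callable)  # must precede init() at L>=3

try:
    worker.init()

    def orch_fn(orch, _args, cfg):
        for rank, tensor in enumerate(host_inputs):  # stand-in host tensors
            args = TaskArgs()
            args.add_tensor(make_tensor_arg(tensor), TensorArgType.INPUT)
            orch.submit_next_level(chip_cid, args, cfg, worker=rank)

    worker.run(orch_fn, args=None, config=CallConfig())
finally:
    worker.close()
```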
diff --git a/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py b/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py
index fd370712d..5033e0a95 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py
@@ -172,6 +172,7 @@ def run(
         chip_bootstrap_configs=cfgs,
         build=build,
     )
+    chip_cid = worker.register(chip_callable)
     try:
         worker.init()
         contexts: list[ChipContext] = worker.chip_contexts
@@ -191,7 +192,7 @@ def orch_fn(orch, _args, cfg):
             args.add_tensor(make_tensor_arg(out[rank]), TensorArgType.OUTPUT_EXISTING)
             args.add_tensor(make_tensor_arg(result[rank]), TensorArgType.OUTPUT_EXISTING)
             args.add_scalar(ctx.device_ctx)
-            orch.submit_next_level(chip_callable, args, cfg, worker=rank)
+            orch.submit_next_level(chip_cid, args, cfg, worker=rank)
 
         worker.run(orch_fn, args=None, config=CallConfig())
diff --git a/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py b/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
index 2bfab2131..c71fe5498 100644
--- a/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
+++ b/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
@@ -131,6 +131,7 @@ def run(platform: str = "a5", device_ids: list[int] | None = None, pto_isa_commi
         num_sub_workers=0,
         chip_bootstrap_configs=cfgs,
     )
+    chip_cid = worker.register(chip_callable)
     try:
         worker.init()
         contexts: list[ChipContext] = worker.chip_contexts
@@ -151,7 +152,7 @@ def orch_fn(orch, _args, cfg):
                     TensorArgType.INPUT,
                 )
                 args.add_scalar(ctx.device_ctx)
-                orch.submit_next_level(chip_callable, args, cfg, worker=rank)
+                orch.submit_next_level(chip_cid, args, cfg, worker=rank)
 
         worker.run(orch_fn, args=None, config=CallConfig())
diff --git a/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py b/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
index e20b2ecec..d05e19b13 100644
--- a/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
+++ b/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
@@ -159,6 +159,7 @@ def run(
         chip_bootstrap_configs=cfgs,
         build=build,
     )
+    chip_cid = worker.register(chip_callable)
     try:
         worker.init()
         contexts: list[ChipContext] = worker.chip_contexts
@@ -187,7 +188,7 @@ def orch_fn(orch, _args, cfg):
                     TensorArgType.INPUT,
                 )
                 args.add_scalar(ctx.device_ctx)
-                orch.submit_next_level(chip_callable, args, cfg, worker=rank)
+                orch.submit_next_level(chip_cid, args, cfg, worker=rank)
 
         worker.run(orch_fn, args=None, config=CallConfig())
diff --git a/examples/workers/l2/README.md b/examples/workers/l2/README.md
index 242ca926f..fb6f29251 100644
--- a/examples/workers/l2/README.md
+++ b/examples/workers/l2/README.md
@@ -23,12 +23,19 @@
 worker = Worker(
     ...
 )
 worker.init()  # load host.so + aicpu.so + aicore.o, set device
 try:
-    # ... allocate device buffers, build ChipCallable, run ...
-    worker.run(chip_callable, task_args, call_config)
+    # ... allocate device buffers, build ChipCallable ...
+    cid = worker.register(chip_callable)  # one-shot: cid is reused across runs
+    worker.run(cid, task_args, call_config)
 finally:
     worker.close()  # release ACL resources and device
 ```
 
+`register()` is the only way to obtain a `cid`; `worker.run` always takes
+that int, never the raw `ChipCallable`. A cid stays valid for the
+lifetime of the worker, so you register once and reuse it across runs —
+this is also why ST cases cache the cid on the test class (see
+`_st_l2_cid` in `simpler_setup/scene_test.py`).
+
 The `try/finally` is important — if anything between `init()` and
 `close()` raises, you still want the device released. The
 [L2 conftest leak issue](https://github.com/hw-native-sys/simpler/issues/604)
diff --git a/examples/workers/l2/vector_add/README.md b/examples/workers/l2/vector_add/README.md
index 7e5776d9e..e294fb2aa 100644
--- a/examples/workers/l2/vector_add/README.md
+++ b/examples/workers/l2/vector_add/README.md
@@ -96,7 +96,7 @@
 args.add_tensor(ContinuousTensor.make(dev_a, shape, DataType.FLOAT32))
 args.add_tensor(ContinuousTensor.make(dev_b, shape, DataType.FLOAT32))
 args.add_tensor(ContinuousTensor.make(dev_out, shape, DataType.FLOAT32))
 
-worker.run(chip_callable, args, CallConfig())
+worker.run(chip_cid, args, CallConfig())  # chip_cid = worker.register(chip_callable) before init()
 ```
 
 The tensor order must match `signature` order on the `ChipCallable`. `run()`
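Putting the two READMEs above together, a minimal end-to-end L2 sketch; buffer sizes, shapes, and the `build_chip_callable` helper are illustrative placeholders, and the H2D/D2H copies are elided:

```python
from simpler.task_interface import CallConfig, ContinuousTensor, DataType, TaskArgs
from simpler.worker import Worker

worker = Worker(level=2, device_id=0, platform="a2a3",
                runtime="tensormap_and_ringbuffer")
chip_cid = worker.register(build_chip_callable("a2a3"))  # placeholder builder
worker.init()
try:
    nbytes = 1024 * 4                     # one fp32 vector
    dev_in = worker.malloc(nbytes)        # copy_to / copy_from elided
    dev_out = worker.malloc(nbytes)
    args = TaskArgs()
    args.add_tensor(ContinuousTensor.make(dev_in, (1024,), DataType.FLOAT32))
    args.add_tensor(ContinuousTensor.make(dev_out, (1024,), DataType.FLOAT32))
    worker.run(chip_cid, args, CallConfig())   # the cid, never the ChipCallable
    worker.run(chip_cid, args, CallConfig())   # cid stays valid across runs
finally:
    worker.close()
```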
diff --git a/examples/workers/l2/vector_add/main.py b/examples/workers/l2/vector_add/main.py
index 94867ca32..6ad1480ad 100644
--- a/examples/workers/l2/vector_add/main.py
+++ b/examples/workers/l2/vector_add/main.py
@@ -19,7 +19,8 @@
     host arrays ──[worker.malloc + copy_to]──► device buffers
                        │
                        ▼
-    worker.run(chip_callable, task_args, cfg)
+    chip_cid = worker.register(chip_callable)   # before init()
+    worker.run(chip_cid, task_args, cfg)
                        │
     device result ──[worker.copy_from]──► host array ──[torch compare]
@@ -126,7 +127,7 @@ def build_chip_callable(platform: str) -> ChipCallable:
     )
 
 
-def _run(worker: Worker, chip_callable: ChipCallable) -> None:
+def _run(worker: Worker, chip_cid: int) -> None:
     """Allocate device memory, copy inputs, execute, copy outputs back, verify."""
     # --- 1. Prepare host arrays ---
     torch.manual_seed(42)
@@ -154,7 +155,7 @@
     # --- 4. Run. CallConfig() defaults are fine for this kernel. ---
     config = CallConfig()
     print("[vector_add] running on device...")
-    worker.run(chip_callable, args, config)
+    worker.run(chip_cid, args, config)
 
     # --- 5. D2H copy back + verify ---
     worker.copy_from(host_out.data_ptr(), dev_out, NBYTES)
@@ -183,10 +184,12 @@ def run(platform: str, device_id: int) -> int:
     chip_callable = build_chip_callable(platform)
     print(f"[vector_add] compiled. binary_size={chip_callable.binary_size} bytes")
 
+    chip_cid = worker.register(chip_callable)
+
     print(f"[vector_add] init worker (device={device_id})...")
     worker.init()
     try:
-        _run(worker, chip_callable)
+        _run(worker, chip_cid)
     finally:
         worker.close()
     return 0
diff --git a/examples/workers/l3/allreduce_distributed/main.py b/examples/workers/l3/allreduce_distributed/main.py
index bd646df82..0dfa3d4de 100644
--- a/examples/workers/l3/allreduce_distributed/main.py
+++ b/examples/workers/l3/allreduce_distributed/main.py
@@ -194,6 +194,7 @@ def run(device_ids: list[int]) -> int:
         num_sub_workers=0,
         chip_bootstrap_configs=cfgs,
     )
+    chip_cid = worker.register(chip_callable)
 
     try:
         print("[allreduce] init worker (forks chip children + bootstraps HCCL)...")
@@ -227,7 +228,7 @@ def orch_fn(orch, _args, cfg):
             )
             chip_args.add_scalar(ctx.nranks)
             chip_args.add_scalar(ctx.device_ctx)
-            orch.submit_next_level(chip_callable, chip_args, cfg, worker=i)
+            orch.submit_next_level(chip_cid, chip_args, cfg, worker=i)
 
         print("[allreduce] running 2-chip allreduce DAG...")
         worker.run(orch_fn, args=None, config=CallConfig())
diff --git a/examples/workers/l3/child_memory/main.py b/examples/workers/l3/child_memory/main.py
index 2dfe3f4e0..b107983fb 100644
--- a/examples/workers/l3/child_memory/main.py
+++ b/examples/workers/l3/child_memory/main.py
@@ -147,6 +147,7 @@ def run(platform: str, device_id: int) -> int:
 
     print(f"[child_memory] compiling kernels for {platform}...")
     chip_callable = build_chip_callable(platform)
+    chip_cid = worker.register(chip_callable)
 
     print("[child_memory] init worker...")
     worker.init()
@@ -172,7 +173,7 @@ def orch_fn(orch, _args, cfg):
             a.add_tensor(make_tensor_arg(host_a), TensorArgType.INPUT)
             a.add_tensor(w_dev, TensorArgType.INPUT)
             a.add_tensor(make_tensor_arg(out), TensorArgType.OUTPUT_EXISTING)
-            orch.submit_next_level(chip_callable, a, cfg, worker=0)
+            orch.submit_next_level(chip_cid, a, cfg, worker=0)
 
         # dev_w is reclaimed by DeviceRunner::finalize on worker.close() —
         # we don't orch.free it here, that's the whole point of child_memory.
diff --git a/examples/workers/l3/ffn_tp_parallel/main.py b/examples/workers/l3/ffn_tp_parallel/main.py
index b41dd561b..aa2bb2d2c 100644
--- a/examples/workers/l3/ffn_tp_parallel/main.py
+++ b/examples/workers/l3/ffn_tp_parallel/main.py
@@ -209,6 +209,8 @@ def run(device_ids: list[int]) -> int:
         num_sub_workers=0,
         chip_bootstrap_configs=cfgs,
     )
+    ffn_cid = worker.register(ffn_local_cc)
+    allreduce_cid = worker.register(allreduce_cc)
 
     try:
         print("[ffn_tp_parallel] init worker (forks chip children + bootstraps HCCL)...")
@@ -231,7 +233,7 @@ def orch_fn(orch, _args, cfg):
             a1.add_tensor(make_tensor_arg(host_x_shards[i]), TensorArgType.INPUT)
             a1.add_tensor(make_tensor_arg(host_w_shards[i]), TensorArgType.INPUT)
             a1.add_tensor(make_tensor_arg(host_partial[i]), TensorArgType.OUTPUT_EXISTING)
-            orch.submit_next_level(ffn_local_cc, a1, cfg, worker=i)
+            orch.submit_next_level(ffn_cid, a1, cfg, worker=i)
 
         # Stage 2: AIV cross-rank sum. Tagging partial_local INPUT
         # with the same buffer.addr makes TensorMap auto-link this
@@ -250,7 +252,7 @@
             )
             a2.add_scalar(ctx.nranks)
             a2.add_scalar(ctx.device_ctx)
-            orch.submit_next_level(allreduce_cc, a2, cfg, worker=i)
+            orch.submit_next_level(allreduce_cid, a2, cfg, worker=i)
 
         print("[ffn_tp_parallel] running 2-chip 2-stage DAG...")
         worker.run(orch_fn, args=None, config=CallConfig())
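The ffn_tp_parallel change above is the clearest picture of cid-based multi-stage submission: stage 2 consumes stage 1's output buffer, and re-tagging that same address as INPUT is what creates the DAG edge. A reduced sketch, with the HCCL scalars and shapes elided and names as stand-ins:

```python
def orch_fn(orch, _args, cfg):
    for i in range(len(device_ids)):          # stand-in rank loop
        a1 = TaskArgs()
        a1.add_tensor(make_tensor_arg(host_x[i]), TensorArgType.INPUT)
        a1.add_tensor(make_tensor_arg(host_partial[i]), TensorArgType.OUTPUT_EXISTING)
        orch.submit_next_level(ffn_cid, a1, cfg, worker=i)

        a2 = TaskArgs()
        # Same buffer address as stage 1's OUTPUT_EXISTING: TensorMap
        # auto-links the tasks, so stage 2 waits for stage 1 on this chip.
        a2.add_tensor(make_tensor_arg(host_partial[i]), TensorArgType.INPUT)
        a2.add_tensor(make_tensor_arg(host_out[i]), TensorArgType.OUTPUT_EXISTING)
        orch.submit_next_level(allreduce_cid, a2, cfg, worker=i)
```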
diff --git a/examples/workers/l3/multi_chip_dispatch/README.md b/examples/workers/l3/multi_chip_dispatch/README.md
index 83ba2f889..7994dcaf5 100644
--- a/examples/workers/l3/multi_chip_dispatch/README.md
+++ b/examples/workers/l3/multi_chip_dispatch/README.md
@@ -10,9 +10,10 @@
 chip outputs. The smallest correct L3 program.
 
 | ------- | ------------------------------ |
 | Shared-memory tensors | `torch.randn(...).share_memory_()` — chip children see the same storage |
 | `TensorArgType` tags | `INPUT` / `OUTPUT_EXISTING` drive DAG dependency tracking |
-| Python SubWorker | `worker.register(fn)` **before** `init()` |
+| ChipCallable id | `chip_cid = worker.register(chip_callable)` **before** `init()` |
+| Python SubWorker | `sub_cid = worker.register(fn)` **before** `init()` |
 | `Worker(level=3)` config | `device_ids=[0, 1]`, `num_sub_workers=1` |
-| Orchestration | `orch.submit_next_level(...)` per chip + `orch.submit_sub(cid, args)` |
+| Orchestration | `orch.submit_next_level(chip_cid, ...)` per chip + `orch.submit_sub(sub_cid, args)` |
 
 ## Layout
 
@@ -66,7 +67,8 @@
 host_b = [torch.randn(...).share_memory_() for _ in device_ids]
 host_out = [torch.zeros(...).share_memory_() for _ in device_ids]
 
 def subworker(sub_args): ...
-sub_cid = worker.register(subworker)  # BEFORE init() — see below
+chip_cid = worker.register(chip_callable)  # ChipCallable: BEFORE init()
+sub_cid = worker.register(subworker)       # Python SubWorker: BEFORE init()
 ```
 
 `share_memory_()` moves the tensor's storage to a `mmap` region. After
@@ -74,9 +76,11 @@
 address, so when the kernel writes to `host_out[i]`, the parent's
 tensor sees it immediately. No explicit copy back.
 
-**`register()` MUST come before `init()`**. `init()` forks child processes;
-the registry is captured by copy-on-write. Anything registered after `init()`
-is invisible to the forked children.
+**`register()` MUST come before `init()`** for *every* callable — both
+the `ChipCallable` dispatched to chips and the Python sub functions.
+`init()` forks child processes; the registry is captured by copy-on-write.
+Anything registered after `init()` is invisible to the forked children,
+and `Worker.register()` at L≥3 raises if called post-init.
 
 ### 2. `init()` — fork + C++ scheduler
 
@@ -93,7 +97,7 @@ def orch_fn(orch, _args, cfg):
         chip_args.add_tensor(make_tensor_arg(host_a[i]), TensorArgType.INPUT)
         chip_args.add_tensor(make_tensor_arg(host_b[i]), TensorArgType.INPUT)
         chip_args.add_tensor(make_tensor_arg(host_out[i]), TensorArgType.OUTPUT_EXISTING)
-        orch.submit_next_level(chip_callable, chip_args, cfg, worker=i)
+        orch.submit_next_level(chip_cid, chip_args, cfg, worker=i)
 
     sub_args = TaskArgs()
     for i in range(len(device_ids)):
diff --git a/examples/workers/l3/multi_chip_dispatch/main.py b/examples/workers/l3/multi_chip_dispatch/main.py
index b92a6fc10..1b5278877 100644
--- a/examples/workers/l3/multi_chip_dispatch/main.py
+++ b/examples/workers/l3/multi_chip_dispatch/main.py
@@ -146,6 +146,9 @@ def subworker(sub_args: TaskArgs) -> None:
     print(f"[multi_chip_dispatch] compiling kernels for {platform}...")
     chip_callable = build_chip_callable(platform)
 
+    # Register the ChipCallable so submit_next_level takes a cid.
+    chip_cid = worker.register(chip_callable)
+
     # --- 5. init() forks chip + sub child processes, starts C++ scheduler.
     print("[multi_chip_dispatch] init worker...")
     worker.init()
@@ -165,7 +168,7 @@ def orch_fn(orch, _args, cfg):
         chip_args.add_tensor(make_tensor_arg(host_a[i]), TensorArgType.INPUT)
         chip_args.add_tensor(make_tensor_arg(host_b[i]), TensorArgType.INPUT)
         chip_args.add_tensor(make_tensor_arg(host_out[i]), TensorArgType.OUTPUT_EXISTING)
-        orch.submit_next_level(chip_callable, chip_args, cfg, worker=i)
+        orch.submit_next_level(chip_cid, chip_args, cfg, worker=i)
 
     # Sub task that depends on both chip outputs. Tagging the two
     # host_out[i] tensors INPUT tells the scheduler to wait for
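The fork/COW contract the README spells out is now enforced in code. A sketch of both sides of it, assuming the `RuntimeError` added to `python/simpler/worker.py` in this patch; callables are stand-ins:

```python
worker = Worker(level=3, device_ids=[0, 1], num_sub_workers=1,
                platform="a2a3", runtime="tensormap_and_ringbuffer")

chip_cid = worker.register(chip_callable)  # visible to forked children (COW)
sub_cid = worker.register(subworker)       # same unified cid space

worker.init()                              # fork point: registry snapshot taken

try:
    worker.register(late_callable)         # children would never see this
except RuntimeError as e:
    print(e)  # "Worker.register() at level >= 3 must be called before init() ..."
```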
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index 344758b78..258e00cec 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -623,34 +623,76 @@ NB_MODULE(_task_interface, m) {
         .def("reset_device", &ChipWorker::reset_device)
         .def("finalize", &ChipWorker::finalize)
         .def(
-            "run",
-            [](ChipWorker &self, const PyChipCallable &callable, ChipStorageTaskArgs &args, const CallConfig &config) {
-                self.run(callable.buffer_.data(), &args, config);
+            "prepare_callable",
+            [](ChipWorker &self, int32_t callable_id, const PyChipCallable &callable) {
+                self.prepare_callable(callable_id, callable.buffer_.data());
             },
-            nb::arg("callable"), nb::arg("args"), nb::arg("config")
+            nb::arg("callable_id"), nb::arg("callable"),
+            "Stage a ChipCallable under callable_id for cheap repeated launches "
+            "via run_prepared. Variants without per-callable_id support raise."
         )
         .def(
-            "run_raw",
-            [](ChipWorker &self, uint64_t callable, uint64_t args, const CallConfig &config) {
-                self.run(reinterpret_cast(callable), reinterpret_cast(args), config);
+            "run_prepared",
+            [](ChipWorker &self, int32_t callable_id, ChipStorageTaskArgs &args, const CallConfig &config) {
+                self.run_prepared(callable_id, &args, config);
             },
-            nb::arg("callable"), nb::arg("args"), nb::arg("config"),
-            "Run with raw pointer arguments (used from forked chip process)."
+            nb::arg("callable_id"), nb::arg("args"), nb::arg("config"),
+            "Launch a callable_id previously staged via prepare_callable."
         )
         .def(
-            "run_from_blob",
-            [](ChipWorker &self, uint64_t callable, uint64_t blob_ptr, const CallConfig &config) {
-                TaskArgsView view = read_blob(reinterpret_cast(blob_ptr), MAILBOX_ARGS_CAPACITY);
-                self.run(callable, view, config);
+            "run_prepared",
+            [](ChipWorker &self, int32_t callable_id, TaskArgs &args, const CallConfig &config) {
+                TaskArgsView view = make_view(args);
+                self.run_prepared(callable_id, view, config);
             },
-            nb::arg("callable"), nb::arg("blob_ptr"), nb::arg("config"),
-            "Decode a length-prefixed TaskArgs blob ([T][S][tensors][scalars]) at "
-            "blob_ptr and dispatch to the runtime. Used from forked chip processes "
-            "reading the WorkerThread mailbox."
+            nb::arg("callable_id"), nb::arg("args"), nb::arg("config"),
+            "Launch a callable_id from a TaskArgs (used for in-process callers)."
+        )
+        .def(
+            "run_prepared_from_blob",
+            [](ChipWorker &self, int32_t callable_id, uint64_t args_blob_ptr, size_t blob_capacity,
+               const CallConfig &config) {
+                // The mailbox region is the on-wire format `write_blob` produced;
+                // `read_blob` is the matching reader that returns a zero-copy
+                // TaskArgsView into the caller-owned bytes. Forwards to the
+                // existing `run_prepared(cid, view, config)` path so chip-child
+                // loops never re-implement the tensor/scalar layout in Python
+                // (where it has historically dropped fields like child_memory).
+                TaskArgsView view = read_blob(reinterpret_cast(args_blob_ptr), blob_capacity);
+                self.run_prepared(callable_id, view, config);
+            },
+            nb::arg("callable_id"), nb::arg("args_blob_ptr"), nb::arg("blob_capacity"), nb::arg("config"),
+            "Launch a callable_id from a raw mailbox-blob pointer + capacity "
+            "(used by chip-child mailbox loops to avoid Python-side re-deserialisation "
+            "of the per-task tensor/scalar layout). The blob must be in the format "
+            "produced by `write_blob`; read_blob enforces capacity bounds against shm corruption."
+        )
+        .def(
+            "unregister_callable",
+            [](ChipWorker &self, int32_t callable_id) {
+                self.unregister_callable(callable_id);
+            },
+            nb::arg("callable_id"),
+            "Drop the prepared state for callable_id; releases the per-id share "
+            "of the device orch SO buffer (kernel binaries stay resident until "
+            "finalize)."
         )
         .def_prop_ro("device_id", &ChipWorker::device_id)
         .def_prop_ro("initialized", &ChipWorker::initialized)
         .def_prop_ro("device_set", &ChipWorker::device_set)
+        .def_prop_ro(
+            "aicpu_dlopen_count", &ChipWorker::aicpu_dlopen_count,
+            "Number of distinct callable_ids the AICPU has dlopened for on the "
+            "bound device. Equals 0 when no device is set or the runtime "
+            "variant lacks per-cid registration. Tests assert this to verify "
+            "prepare_callable + repeated run_prepared do not redundantly dlopen."
+        )
+        .def_prop_ro(
+            "host_dlopen_count", &ChipWorker::host_dlopen_count,
+            "Number of host-side dlopens triggered by prepare_callable on "
+            "host_build_graph variants. Mirrors aicpu_dlopen_count for the "
+            "host-orchestration path; 0 on device-orch variants."
+        )
         .def("malloc", &ChipWorker::malloc, nb::arg("size"))
         .def("free", &ChipWorker::free, nb::arg("ptr"))
         .def("copy_to", &ChipWorker::copy_to, nb::arg("dst"), nb::arg("src"), nb::arg("size"))
diff --git a/python/bindings/worker_bind.h b/python/bindings/worker_bind.h
index f9824980f..00355856a 100644
--- a/python/bindings/worker_bind.h
+++ b/python/bindings/worker_bind.h
@@ -98,20 +98,22 @@ inline void bind_worker(nb::module_ &m) {
     nb::class_<Orchestrator>(m, "_Orchestrator")
         .def(
             "submit_next_level",
-            [](Orchestrator &self, uint64_t callable, const TaskArgs &args, const CallConfig &config, int8_t worker) {
-                return self.submit_next_level(callable, args, config, worker);
+            [](Orchestrator &self, int32_t callable_id, const TaskArgs &args, const CallConfig &config, int8_t worker) {
+                return self.submit_next_level(callable_id, args, config, worker);
             },
-            nb::arg("callable"), nb::arg("args"), nb::arg("config"), nb::arg("worker") = int8_t(-1),
-            "Submit a NEXT_LEVEL (chip) task. worker= pins to a specific next-level worker (-1 = any)."
+            nb::arg("callable_id"), nb::arg("args"), nb::arg("config"), nb::arg("worker") = int8_t(-1),
+            "Submit a NEXT_LEVEL (chip) task by registered callable id. "
+            "worker= pins to a specific next-level worker (-1 = any)."
        )
        .def(
             "submit_next_level_group",
-            [](Orchestrator &self, uint64_t callable, const std::vector<TaskArgs> &args_list, const CallConfig &config,
-               const std::vector<int8_t> &workers) {
-                return self.submit_next_level_group(callable, args_list, config, workers);
+            [](Orchestrator &self, int32_t callable_id, const std::vector<TaskArgs> &args_list,
+               const CallConfig &config, const std::vector<int8_t> &workers) {
+                return self.submit_next_level_group(callable_id, args_list, config, workers);
             },
-            nb::arg("callable"), nb::arg("args_list"), nb::arg("config"), nb::arg("workers") = std::vector<int8_t>{},
-            "Submit a group of NEXT_LEVEL tasks. workers= per-args affinity (empty = any)."
+            nb::arg("callable_id"), nb::arg("args_list"), nb::arg("config"), nb::arg("workers") = std::vector<int8_t>{},
+            "Submit a group of NEXT_LEVEL tasks by registered callable id. "
+            "workers= per-args affinity (empty = any)."
         )
         .def(
             "submit_sub",
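At the `ChipWorker` level the bindings above decompose the old `run` into stage-once / launch-many. A sketch of that lifecycle; construction and binary paths are placeholders (the patch only shows the lifecycle calls), and `chip_callable` / `orch_args` are stand-ins:

```python
from simpler.task_interface import CallConfig, ChipWorker

cw = ChipWorker()  # assumed default construction; not shown in this patch
cw.init(host_path="host.so", aicpu_path="aicpu.so", aicore_path="aicore.o")
cw.set_device(device_id=0)

cw.prepare_callable(0, chip_callable)      # one upload + dlopen, cid in [0, 64)
for _ in range(100):
    cw.run_prepared(0, orch_args, CallConfig(block_dim=24))

assert cw.aicpu_dlopen_count <= 1          # repeated launches, no re-dlopen
cw.unregister_callable(0)                  # release the orch SO share
cw.reset_device()
cw.finalize()
```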
diff --git a/python/simpler/orchestrator.py b/python/simpler/orchestrator.py
index 4a8bec655..29bc84db6 100644
--- a/python/simpler/orchestrator.py
+++ b/python/simpler/orchestrator.py
@@ -17,11 +17,11 @@ def my_orch(orch, args, cfg):
         a = TaskArgs()
         a.add_tensor(make_tensor_arg(input_tensor), TensorArgType.INPUT)
         a.add_tensor(make_tensor_arg(output_tensor), TensorArgType.OUTPUT)
-        orch.submit_next_level(chip_callable, a, cfg)
+        orch.submit_next_level(chip_cid, a, cfg)  # cid from Worker.register(chip_callable)
 
         sub_args = TaskArgs()
         sub_args.add_tensor(make_tensor_arg(output_tensor), TensorArgType.INPUT)
-        orch.submit_sub(cid, sub_args)
+        orch.submit_sub(sub_cid, sub_args)
 
     w.run(my_orch, my_args, my_config)
 
@@ -35,6 +35,7 @@ def my_orch(orch, args, cfg):
 from .task_interface import (
     CallConfig,
+    ChipCallable,
     ContinuousTensor,
     DataType,
     TaskArgs,
@@ -44,11 +45,21 @@ def my_orch(orch, args, cfg):
 )
 
 
-def _resolve_callable_ptr(callable_: Any) -> int:
-    """Accept either a ChipCallable (has buffer_ptr()) or a raw int pointer."""
-    if hasattr(callable_, "buffer_ptr"):
-        return callable_.buffer_ptr()
-    return int(callable_)
+def _require_cid(callable_or_cid: Any, *, kind: str) -> int:
+    """Coerce a submit argument to a registered cid.
+
+    Raises a clear migration error when the caller still passes a
+    ``ChipCallable`` directly — every chip callable must be registered
+    via ``Worker.register(callable)`` *before* ``init()`` so each chip
+    child can pre-warm it on its own device.
+    """
+    if isinstance(callable_or_cid, ChipCallable) or hasattr(callable_or_cid, "buffer_ptr"):
+        raise TypeError(
+            f"{kind} now takes a registered cid, not a ChipCallable. "
+            "Register the callable before init() via "
+            "`cid = worker.register(chip_callable)` and pass `cid` here."
+        )
+    return int(callable_or_cid)
 
 
 class Orchestrator:
@@ -68,18 +79,21 @@ def __init__(self, c_orchestrator: _COrchestrator) -> None:
     # ------------------------------------------------------------------
 
     def submit_next_level(
-        self, callable_: Any, args: TaskArgs, config: Optional[CallConfig] = None, *, worker: int = -1
+        self, callable_id: Any, args: TaskArgs, config: Optional[CallConfig] = None, *, worker: int = -1
     ):
-        """Submit a NEXT_LEVEL (chip) task. Tags inside ``args`` drive deps.
+        """Submit a NEXT_LEVEL (chip) task by registered callable id.
 
+        ``callable_id`` must be the int returned by
+        ``Worker.register(chip_callable)``. Tags inside ``args`` drive deps.
         ``worker``: logical worker id for affinity (-1 = unconstrained).
         """
         cfg = config if config is not None else CallConfig()
-        return self._o.submit_next_level(_resolve_callable_ptr(callable_), args, cfg, int(worker))
+        cid = _require_cid(callable_id, kind="orch.submit_next_level")
+        return self._o.submit_next_level(cid, args, cfg, int(worker))
 
     def submit_next_level_group(
         self,
-        callable_: Any,
+        callable_id: Any,
         args_list: list,
         config: Optional[CallConfig] = None,
         *,
@@ -91,7 +105,8 @@
         """
         cfg = config if config is not None else CallConfig()
         w = [int(x) for x in workers] if workers else []
-        return self._o.submit_next_level_group(_resolve_callable_ptr(callable_), args_list, cfg, w)
+        cid = _require_cid(callable_id, kind="orch.submit_next_level_group")
+        return self._o.submit_next_level_group(cid, args_list, cfg, w)
 
     def submit_sub(self, callable_id: int, args: Optional[TaskArgs] = None):
         """Submit a SUB task by registered callable id.
diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py
index 71ac81122..3dd918b89 100644
--- a/python/simpler/task_interface.py
+++ b/python/simpler/task_interface.py
@@ -240,7 +240,8 @@ class ChipWorker:
                     aicpu_path="build/lib/.../aicpu.so",
                     aicore_path="build/lib/.../aicore.o")
         worker.set_device(device_id=0)
-        worker.run(chip_callable, orch_args, block_dim=24)
+        worker.prepare_callable(callable_id=0, callable=chip_callable)
+        worker.run_prepared(callable_id=0, args=orch_args, config=CallConfig(block_dim=24))
         worker.reset_device()
         worker.finalize()
     """
@@ -305,11 +306,20 @@ def finalize(self):
         """
         self._impl.finalize()
 
-    def run(self, callable, args, config=None, **kwargs):
-        """Execute a callable synchronously.
+    def prepare_callable(self, callable_id, callable):
+        """Stage a ChipCallable under ``callable_id`` for repeated cheap launches.
+
+        Uploads the kernel binaries + the orchestration SO once; subsequent
+        ``run_prepared(callable_id, ...)`` skips that work. ``callable_id``
+        must be in ``[0, 64)``. Requires ``set_device()``.
+        """
+        self._impl.prepare_callable(int(callable_id), callable)
+
+    def run_prepared(self, callable_id, args, config=None, **kwargs):
+        """Launch a ``callable_id`` previously staged via ``prepare_callable``.
 
         Args:
-            callable: ChipCallable built from orchestration + kernel binaries.
+            callable_id: Stable id passed to a prior ``prepare_callable``.
             args: ChipStorageTaskArgs for this invocation.
             config: Optional CallConfig. If None, a default is created.
             **kwargs: Overrides applied to config (e.g. block_dim=24).
@@ -318,16 +328,21 @@
             config = CallConfig()
         for k, v in kwargs.items():
             setattr(config, k, v)
-        self._impl.run(callable, args, config)
+        self._impl.run_prepared(int(callable_id), args, config)
 
-    def run_from_blob(self, callable, blob_ptr, config):
-        """Execute via a serialized args blob in shared memory.
+    def unregister_callable(self, callable_id):
+        """Drop prepared state for ``callable_id`` and release its orch SO share."""
+        self._impl.unregister_callable(int(callable_id))
 
-        Used by `_chip_process_loop` after reading the mailbox: instead of
-        deserializing the args into Python objects, the C++ side parses the
-        POD blob directly at `blob_ptr`.
-        """
-        self._impl.run_from_blob(int(callable), int(blob_ptr), config)
+    @property
+    def aicpu_dlopen_count(self):
+        """Number of distinct callable_ids the AICPU has dlopened for."""
+        return self._impl.aicpu_dlopen_count
+
+    @property
+    def host_dlopen_count(self):
+        """Number of host-side orch SO dlopens (host_build_graph variants)."""
+        return self._impl.host_dlopen_count
 
     def malloc(self, size):
         """Allocate memory. Returns a pointer (uint64)."""
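What the `_require_cid` guard above buys callers: the old calling convention fails fast with a migration message instead of being interpreted as a raw pointer. Illustrative only; `chip_callable`, `chip_cid`, and the empty `TaskArgs` are stand-ins:

```python
def orch_fn(orch, _args, cfg):
    args = TaskArgs()
    try:
        orch.submit_next_level(chip_callable, args, cfg)   # old convention
    except TypeError as e:
        print(e)
        # "orch.submit_next_level now takes a registered cid, not a
        #  ChipCallable. Register the callable before init() via
        #  `cid = worker.register(chip_callable)` and pass `cid` here."
    orch.submit_next_level(chip_cid, args, cfg)            # new convention
```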
diff --git a/python/simpler/worker.py b/python/simpler/worker.py
index 073084dc6..8fc2861ac 100644
--- a/python/simpler/worker.py
+++ b/python/simpler/worker.py
@@ -8,23 +8,31 @@
 # -----------------------------------------------------------------------------------------------------------
 """Worker — unified factory for all hierarchy levels.
 
+Callable identity is a ``cid`` (int), allocated exclusively by
+``Worker.register(callable)``. ``Worker.run`` and the orchestrator's
+``submit_next_level`` / ``submit_sub`` all take this cid — never the raw
+``ChipCallable`` / Python function. L≥3 ``register()`` must run **before**
+``init()`` so forked chip / sub children inherit the registry via COW.
+
 Usage::
 
     # L2: one NPU chip
     w = Worker(level=2, device_id=8, platform="a2a3", runtime="tensormap_and_ringbuffer")
     w.init()
-    w.run(chip_callable, chip_args, config)
+    chip_cid = w.register(chip_callable)  # L2 may register pre or post init()
+    w.run(chip_cid, chip_args, config)
     w.close()
 
     # L3: multiple chips + SubWorkers, auto-discovery in init()
     w = Worker(level=3, device_ids=[8, 9], num_sub_workers=2,
                platform="a2a3", runtime="tensormap_and_ringbuffer")
-    cid = w.register(lambda args: postprocess())
+    chip_cid = w.register(chip_callable)              # ChipCallable, before init()
+    sub_cid = w.register(lambda args: postprocess())  # Python sub, before init()
     w.init()
 
     def my_orch(orch, args, cfg):
-        r = orch.submit_next_level(chip_callable, chip_args_ptr, cfg)
-        orch.submit_sub(cid, sub_args)
+        r = orch.submit_next_level(chip_cid, chip_args_ptr, cfg)
+        orch.submit_sub(sub_cid, sub_args)
 
     w.run(my_orch, my_args, my_config)
     w.close()
@@ -54,7 +62,7 @@ def my_l4_orch(orch, args, config):
 import time
 import traceback
 from multiprocessing.shared_memory import SharedMemory
-from typing import Any, Callable, Optional
+from typing import Any, Optional
 
 from _task_interface import (  # pyright: ignore[reportMissingImports]
     CHIP_BOOTSTRAP_MAILBOX_SIZE,
@@ -62,6 +70,7 @@ def my_l4_orch(orch, args, config):
     ChipBootstrapMailboxState,
     _mailbox_load_i32,
     _mailbox_store_i32,
+    read_args_from_blob,
 )
 
 from . import _log as _simpler_log
@@ -72,10 +81,9 @@ def my_l4_orch(orch, args, config):
     MAILBOX_SIZE,
     CallConfig,
     ChipBootstrapConfig,
+    ChipCallable,
     ChipContext,
     ChipWorker,
-    ContinuousTensor,
-    DataType,
     TaskArgs,
     _Worker,
 )
@@ -86,6 +94,7 @@ def my_l4_orch(orch, args, config):
 _BOOTSTRAP_WAIT_TIMEOUT_S = 120.0
 _BOOTSTRAP_POLL_INTERVAL_S = 0.001
 
+
 # ---------------------------------------------------------------------------
 # Unified mailbox layout (must match worker_manager.h MAILBOX_OFF_*)
 # ---------------------------------------------------------------------------
@@ -127,6 +136,11 @@ def my_l4_orch(orch, args, config):
 _CTRL_FREE = 1
 _CTRL_COPY_TO = 2
 _CTRL_COPY_FROM = 3
+# Pre-warm a chip child for cid=arg0 by calling
+# `prepare_callable(cid, registry[cid])` so the first run_prepared() does
+# not pay the H2D upload cost. Sent from the parent right after init()
+# (or whenever a new ChipCallable cid is registered).
+_CTRL_PREPARE = 4
 
 # Control args layout (reuses task mailbox fields when state == _CONTROL_*):
 #   offset 8 (_OFF_CALLABLE): uint64 sub-command
@@ -190,38 +204,21 @@ def _format_exc(prefix: str, exc: BaseException) -> str:
 def _read_args_from_mailbox(buf) -> TaskArgs:
     """Decode the TaskArgs blob written by C++ write_blob from the mailbox.
 
-    Blob layout at _OFF_ARGS:
-        int32 tensor_count (T), int32 scalar_count (S),
-        ContinuousTensor[T] (40 B each), uint64_t[S] (8 B each).
+    Used by the Python-targeted child loops (sub_worker, nested L4+ child)
+    where the destination of `args` is a Python callable that needs a
+    typed TaskArgs object. The chip-child loops that immediately forward
+    to C++ run_prepared use the zero-copy `run_prepared_from_blob` path
+    instead — see those loops for the matching comment.
+
+    Delegates to the nanobind helper so the ContinuousTensor layout is
+    parsed by C++ `read_blob` (single source of truth) instead of being
+    reimplemented in Python. The Python re-implementation that lived
+    here previously dropped the `child_memory` byte (offset 33), which
+    silently broke any tensor carrying a chip-owned device pointer
+    (HCCL window slots etc.) — now structurally impossible.
     """
-    base = _OFF_ARGS
-    t_count = struct.unpack_from("i", buf, base)[0]
-    s_count = struct.unpack_from("i", buf, base + 4)[0]
-    if t_count < 0 or s_count < 0:
-        raise RuntimeError(f"args blob has negative counts: tensors={t_count}, scalars={s_count}")
-    blob_bytes = 8 + t_count * 40 + s_count * 8
-    if blob_bytes > _MAILBOX_ARGS_CAPACITY:
-        raise RuntimeError(
-            f"args blob ({blob_bytes} bytes) exceeds mailbox capacity ({_MAILBOX_ARGS_CAPACITY} bytes); "
-            f"tensors={t_count}, scalars={s_count} — likely a corrupt header or a writer bug"
-        )
-
-    args = TaskArgs()
-    ct_off = base + 8
-    for i in range(t_count):
-        off = ct_off + i * 40
-        data = struct.unpack_from("Q", buf, off)[0]
-        shapes = struct.unpack_from("5I", buf, off + 8)
-        ndims = struct.unpack_from("I", buf, off + 28)[0]
-        dtype_val = struct.unpack_from("B", buf, off + 32)[0]
-        ct = ContinuousTensor.make(data, tuple(shapes[:ndims]), DataType(dtype_val))
-        args.add_tensor(ct)
-
-    sc_off = ct_off + t_count * 40
-    for i in range(s_count):
-        args.add_scalar(struct.unpack_from("Q", buf, sc_off + i * 8)[0])
-
-    return args
+    mailbox_addr = ctypes.addressof(ctypes.c_char.from_buffer(buf))
+    return read_args_from_blob(mailbox_addr + _OFF_ARGS)
 
 
 def _sub_worker_loop(buf, registry: dict) -> None:
@@ -260,6 +257,7 @@
 def _chip_process_loop(
     buf: memoryview,
     bins,
     device_id: int,
+    registry: dict,
     log_level: int = 1,
     log_info_v: int = 5,
 ) -> None:
@@ -271,6 +269,13 @@
     `log_level` / `log_info_v` are the parent's snapshot of the simpler
     logger (computed via `_log.get_current_config()`); the child cannot
     read the parent's logger after fork, so the values are passed explicitly.
+
+    Per-callable_id dispatch: TASK_READY carries a cid in OFF_CALLABLE; the
+    child looks the cid up in the COW-inherited Python ``registry`` to get
+    the ChipCallable, calls ``cw.prepare_callable(cid, callable)`` once,
+    then ``cw.run_prepared(cid, args, cfg)``. ``_CTRL_PREPARE`` is the
+    explicit pre-warm path (parent pushes after init() to amortise the
+    first H2D upload).
     """
     import traceback as _tb  # noqa: PLC0415
 
@@ -289,20 +294,39 @@
     mailbox_addr = ctypes.addressof(ctypes.c_char.from_buffer(buf))
     state_addr = mailbox_addr + _OFF_STATE
-    args_ptr = mailbox_addr + _OFF_ARGS
 
     sys.stderr.write(f"[chip_process pid={os.getpid()} dev={device_id}] ready\n")
     sys.stderr.flush()
 
+    # Per-child set of cids already prepared on this device. The parent
+    # pre-warms via _CTRL_PREPARE, but TASK_READY also lazy-prepares as a
+    # safety net (e.g. registrations that bypassed the prefetch path).
+    prepared: set[int] = set()
+
+    def _ensure_prepared(cid: int) -> None:
+        if cid in prepared:
+            return
+        callable_obj = registry.get(cid)
+        if callable_obj is None:
+            raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry")
+        cw.prepare_callable(cid, callable_obj)
+        prepared.add(cid)
+
     while True:
         state = _mailbox_load_i32(state_addr)
         if state == _TASK_READY:
-            callable_ptr = struct.unpack_from("Q", buf, _OFF_CALLABLE)[0]
+            cid = int(struct.unpack_from("Q", buf, _OFF_CALLABLE)[0]) & 0xFFFFFFFF
             cfg = _read_config_from_mailbox(buf)
             code = 0
             msg = ""
             try:
-                cw.run_from_blob(callable_ptr, args_ptr, cfg)
+                _ensure_prepared(cid)
+                # Hand the mailbox bytes straight to C++ (zero-copy zero-decode):
+                # the blob layout is what `write_blob` already wrote, so re-parsing
+                # it in Python is N×40B of avoidable work and a permanent
+                # opportunity to drop a field. C++ reinterpret_cast
+                # is the source of truth.
+                cw._impl.run_prepared_from_blob(cid, mailbox_addr + _OFF_ARGS, _MAILBOX_ARGS_CAPACITY, cfg)
             except Exception as e:  # noqa: BLE001
                 code = 1
                 msg = _format_exc(f"chip_process dev={device_id}", e)
@@ -330,6 +354,9 @@
                     src = struct.unpack_from("Q", buf, _CTRL_OFF_ARG1)[0]
                     n = struct.unpack_from("Q", buf, _CTRL_OFF_ARG2)[0]
                     cw.copy_from(dst, src, n)
+                elif sub_cmd == _CTRL_PREPARE:
+                    cid = int(struct.unpack_from("Q", buf, _CTRL_OFF_ARG0)[0]) & 0xFFFFFFFF
+                    _ensure_prepared(cid)
             except Exception as e:  # noqa: BLE001
                 code = 1
                 msg = _format_exc(f"chip_process dev={device_id} ctrl={int(sub_cmd)}", e)
@@ -340,13 +367,14 @@
             break
 
 
-def _chip_process_loop_with_bootstrap(  # noqa: PLR0912
+def _chip_process_loop_with_bootstrap(  # noqa: PLR0912, PLR0915
     buf: memoryview,
     bins,
     device_id: int,
     bootstrap_cfg: ChipBootstrapConfig,
     bootstrap_mailbox_addr: int,
     max_buffer_count: int,
+    registry: dict,
     log_level: int = 1,
     log_info_v: int = 5,
 ) -> None:
@@ -395,21 +423,36 @@
     mailbox_addr = ctypes.addressof(ctypes.c_char.from_buffer(buf))
     state_addr = mailbox_addr + _OFF_STATE
-    args_ptr = mailbox_addr + _OFF_ARGS
 
     sys.stderr.write(f"[chip_process pid={os.getpid()} dev={device_id} bootstrap] ready\n")
     sys.stderr.flush()
 
+    # Per-child set of cids already prepared on this device. Mirrors
+    # `_chip_process_loop`'s `prepared`.
+    prepared: set[int] = set()
+
+    def _ensure_prepared(cid: int) -> None:
+        if cid in prepared:
+            return
+        callable_obj = registry.get(cid)
+        if callable_obj is None:
+            raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry")
+        cw._impl.prepare_callable(cid, callable_obj)
+        prepared.add(cid)
+
     try:
         while True:
             state = _mailbox_load_i32(state_addr)
             if state == _TASK_READY:
-                callable_ptr = struct.unpack_from("Q", buf, _OFF_CALLABLE)[0]
+                cid = int(struct.unpack_from("Q", buf, _OFF_CALLABLE)[0]) & 0xFFFFFFFF
                 cfg = _read_config_from_mailbox(buf)
                 code = 0
                 msg = ""
                 try:
-                    cw._impl.run_from_blob(callable_ptr, args_ptr, cfg)
+                    _ensure_prepared(cid)
+                    # Hand the mailbox bytes straight to C++ (zero-copy zero-decode);
+                    # see the matching comment in `_chip_process_loop`.
+                    cw._impl.run_prepared_from_blob(cid, mailbox_addr + _OFF_ARGS, _MAILBOX_ARGS_CAPACITY, cfg)
                 except Exception as e:  # noqa: BLE001
                     code = 1
                     msg = _format_exc(f"chip_process dev={device_id}", e)
@@ -467,6 +510,9 @@
                         src = struct.unpack_from("Q", buf, _CTRL_OFF_ARG1)[0]
                         n = struct.unpack_from("Q", buf, _CTRL_OFF_ARG2)[0]
                         cw._impl.copy_from(dst, src, n)
+                    elif sub_cmd == _CTRL_PREPARE:
+                        cid = int(struct.unpack_from("Q", buf, _CTRL_OFF_ARG0)[0]) & 0xFFFFFFFF
+                        _ensure_prepared(cid)
                 except Exception as e:  # noqa: BLE001
                     code = 1
                     msg = _format_exc(f"chip_process dev={device_id} ctrl={int(sub_cmd)}", e)
@@ -561,7 +607,7 @@ def __init__(
     ) -> None:
         self.level = level
         self._config = config
-        self._callable_registry: dict[int, Callable] = {}
+        self._callable_registry: dict[int, Any] = {}
         self._initialized = False
 
         # Level-2 internals
@@ -601,14 +647,38 @@
     # ------------------------------------------------------------------
     # Callable registration (before init)
     # ------------------------------------------------------------------
 
-    def register(self, fn: Callable) -> int:
-        """Register a callable (sub or orch fn). Must be called before init()."""
-        if self.level < 3:
-            raise RuntimeError("Worker.register() is only available at level 3+")
-        if self._initialized:
-            raise RuntimeError("Worker.register() must be called before init()")
+    def register(self, target) -> int:
+        """Register a callable. Returns the cid passed to ``run`` / ``submit_*``.
+
+        A unified id space serves Python functions (sub fn / orch fn) and
+        ``ChipCallable`` instances at every level. L2 returns a cid the
+        user passes to ``Worker.run(cid, args, cfg)``; L3+ returns a cid
+        the orch function passes to ``orch.submit_next_level(cid, …)`` /
+        ``orch.submit_sub(cid, …)``.
+
+        Timing constraints:
+        - L3+: must be called **before** ``init()`` so the COW-inherited
+          registry is visible to forked chip / sub children. ChipCallables
+          are pre-warmed by pushing ``_CTRL_PREPARE`` to every chip child
+          during ``init()``.
+        - L2: may be called either before or after ``init()`` (no fork,
+          no COW constraint). When called post-init, ChipCallables are
+          prepared on the device immediately; pre-init registrations are
+          batched and prepared at the end of ``init()``.
+        """
+        if self.level >= 3 and self._initialized:
+            raise RuntimeError(
+                "Worker.register() at level >= 3 must be called before init() "
+                "(forked children inherit the registry via COW)"
+            )
         cid = len(self._callable_registry)
-        self._callable_registry[cid] = fn
+        self._callable_registry[cid] = target
+
+        # L2 post-init: pre-warm immediately so the very first
+        # `Worker.run(cid, …)` is a clean cache hit.
+        if self.level == 2 and self._initialized and isinstance(target, ChipCallable):
+            assert self._chip_worker is not None
+            self._chip_worker.prepare_callable(cid, target)
         return cid
 
     def add_worker(self, worker: "Worker") -> None:
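One consequence of the unified registry worth calling out: cid allocation is positional (`cid = len(self._callable_registry)`), so chip and sub callables share one id space in registration order. A sketch with stand-in callables:

```python
w = Worker(level=3, device_ids=[0, 1], num_sub_workers=1,
           platform="a2a3", runtime="tensormap_and_ringbuffer")
assert w.register(chip_callable) == 0   # ChipCallable -> cid 0
assert w.register(post_fn) == 1         # Python sub fn -> cid 1, same space
w.init()   # every chip child now receives _CTRL_PREPARE for cid 0
```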
@@ -665,6 +735,13 @@ def _init_level2(self) -> None:
         self._chip_worker.init(binaries)
         self._chip_worker.set_device(device_id)
 
+        # Pre-warm any registered ChipCallable so the first run(cid, …)
+        # does not pay the H2D upload cost.
+        assert self._chip_worker is not None
+        for cid, target in self._callable_registry.items():
+            if isinstance(target, ChipCallable):
+                self._chip_worker.prepare_callable(cid, target)
+
     def _init_hierarchical(self) -> None:
         device_ids = self._config.get("device_ids", [])
         n_sub = self._config.get("num_sub_workers", 0)
@@ -778,6 +855,7 @@ def _start_hierarchical(self) -> None:  # noqa: PLR0912 -- three parallel fork l
                     bootstrap_cfg,
                     bootstrap_addr,
                     max_buffer_count,
+                    registry,
                     chip_log_level,
                     chip_log_info_v,
                 )
@@ -786,6 +864,7 @@ def _start_hierarchical(self) -> None:  # noqa: PLR0912 -- three parallel fork l
                     buf,
                     self._l3_bins,
                     dev_id,
+                    registry,
                     chip_log_level,
                     chip_log_info_v,
                 )
@@ -846,6 +925,17 @@ def _start_hierarchical(self) -> None:  # noqa: PLR0912 -- three parallel fork l
 
         self._orch = Orchestrator(dw.get_orchestrator())
 
+        # Pre-warm every chip child: for each registered ChipCallable cid,
+        # send `_CTRL_PREPARE` to all chip children so the first
+        # `submit_next_level` does not pay the H2D upload cost. Sub fns /
+        # orch fns do not need pre-warming — the registry is already
+        # COW-inherited.
+        if device_ids:
+            for cid, target in self._callable_registry.items():
+                if isinstance(target, ChipCallable):
+                    for worker_id in range(len(self._chip_shms)):
+                        self._chip_control(worker_id, _CTRL_PREPARE, arg0=cid)
+
     # ------------------------------------------------------------------
     # Bootstrap plumbing
     # ------------------------------------------------------------------
@@ -1042,16 +1132,21 @@ def copy_from(self, dst: int, src: int, size: int, worker_id: int = 0) -> None:
 
     def run(self, callable, args=None, config=None) -> None:
         """Execute one task (L2) or one DAG (L3+) synchronously.
 
-        callable: ChipCallable (L2) or Python orch fn (L3+)
-        args:     TaskArgs (optional)
-        config:   CallConfig (optional, default-constructed if None)
+        Dispatch:
+        - L2:  ``callable`` is a cid returned by ``Worker.register(chip_callable)``.
+               Routes to ``_chip_worker.run_prepared(cid, args, cfg)``.
+        - L3+: ``callable`` is a Python orch fn invoked with the
+               ``Orchestrator`` handle.
+
+        ``args``  : TaskArgs (optional)
+        ``config``: CallConfig (optional, default-constructed if None)
         """
         assert self._initialized, "Worker not initialized; call init() first"
         cfg = config if config is not None else CallConfig()
 
         if self.level == 2:
             assert self._chip_worker is not None
-            self._chip_worker.run(callable, args, cfg)
+            self._chip_worker.run_prepared(int(callable), args, cfg)
         else:
             self._start_hierarchical()
             assert self._orch is not None
@@ -1075,6 +1170,68 @@ def run(self, callable, args=None, config=None) -> None:
                 self._orch._scope_end()
                 self._orch._drain()
 
+    def prepare_callable(self, callable_id: int, callable) -> None:
+        """L2 only: pre-stage a callable under ``callable_id`` (see
+        ``ChipWorker.prepare_callable``). Subsequent ``run_prepared`` skips
+        per-run kernel/orch SO upload.
+        """
+        assert self._initialized, "Worker not initialized; call init() first"
+        if self.level != 2:
+            raise NotImplementedError("prepare_callable is L2-only")
+        assert self._chip_worker is not None
+        self._chip_worker.prepare_callable(callable_id, callable)
+
+    def run_prepared(self, callable_id: int, args=None, config=None) -> None:
+        """L2 only: launch a callable previously staged via ``prepare_callable``."""
+        assert self._initialized, "Worker not initialized; call init() first"
+        if self.level != 2:
+            raise NotImplementedError("run_prepared is L2-only")
+        assert self._chip_worker is not None
+        cfg = config if config is not None else CallConfig()
+        self._chip_worker.run_prepared(callable_id, args, cfg)
+
+    def unregister_callable(self, callable_id: int) -> None:
+        """L2 only: drop the prepared state for ``callable_id``."""
+        assert self._initialized, "Worker not initialized; call init() first"
+        if self.level != 2:
+            raise NotImplementedError("unregister_callable is L2-only")
+        assert self._chip_worker is not None
+        self._chip_worker.unregister_callable(callable_id)
+
+    @property
+    def aicpu_dlopen_count(self) -> int:
+        """L2 only: number of distinct callable_ids the AICPU has dlopened for.
+
+        Used by tests to assert that ``register`` + repeated ``run(cid)`` calls
+        do not retrigger the AICPU dlopen for an already-seen cid. Returns 0
+        on non-L2 workers (no per-cid registration there).
+        """
+        if self.level != 2 or self._chip_worker is None:
+            return 0
+        return self._chip_worker.aicpu_dlopen_count
+
+    @property
+    def host_dlopen_count(self) -> int:
+        """L2 only: number of host-side orch SO dlopens (hbg variants).
+
+        Mirrors ``aicpu_dlopen_count`` for the host_build_graph path. Returns
+        0 on non-L2 workers or device-orch variants (trb).
+        """
+        if self.level != 2 or self._chip_worker is None:
+            return 0
+        return self._chip_worker.host_dlopen_count
+
+    def _run_as_child(self, cid: int, args, config) -> None:
+        """Called from C++ _Worker::run when this Worker is a THREAD-mode child.
+
+        Looks up the orch function from the callable registry and delegates
+        to ``self.run(orch_fn, args, config)``.
+        """
+        orch_fn = self._callable_registry.get(cid)
+        if orch_fn is None:
+            raise KeyError(f"callable id {cid} not found in registry")
+        self.run(orch_fn, args, config)
+
     # ------------------------------------------------------------------
     # close
     # ------------------------------------------------------------------
diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py
index 54c6519ef..9241bae92 100644
--- a/simpler_setup/scene_test.py
+++ b/simpler_setup/scene_test.py
@@ -918,6 +918,15 @@ def _run_and_validate_l2(
         config_dict = case.get("config", {})
         orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", [])
 
+        # The L2 entry point is `Worker.run(cid, args, cfg)`. Reuse the
+        # cid registered by the st_worker fixture / standalone path; on
+        # first use (one worker is reused across rounds) register here
+        # and cache the cid on the test class so later rounds skip
+        # re-registration.
+        cid = getattr(type(self), "_st_l2_cid", None)
+        if cid is None:
+            cid = worker.register(callable_obj)
+            type(self)._st_l2_cid = cid
+
         # Build args
         test_args = self.generate_args(params)
         chip_args, output_names = _build_chip_task_args(test_args, orch_sig)
@@ -949,7 +958,7 @@ def _run_and_validate_l2(
         )
 
         with _temporary_env(self._resolve_env()):
-            worker.run(callable_obj, chip_args, config=config)
+            worker.run(cid, chip_args, config=config)
 
         if not skip_golden:
             _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL)
@@ -1057,6 +1066,11 @@ def test_run(self, st_platform, st_worker, request):
         cls_name = type(self).__name__
         callable_obj = self.build_callable(st_platform)
         sub_ids = getattr(type(self), "_st_sub_ids", {})
+        # For L3, use pre-registered chip cids instead of raw ChipCallable
+        # objects.
+        chip_cids = getattr(type(self), "_st_chip_cids", {})
+        if self._st_level == 3 and chip_cids:
+            callable_obj = {**chip_cids}
 
         # Primary device id: prefer the one actually allocated by st_worker
         # (each test class can hold a different slot from DevicePool); fall back
@@ -1319,12 +1333,19 @@ def run_module(module_name):  # noqa: PLR0912, PLR0915 -- CLI parsing + dispatch
     ok = True
     for (runtime, level), group in by_rt_level.items():
         print(f"\n=== Runtime: {runtime} Level: {level} ===")
-        worker, per_class_sub_ids = _create_standalone_worker(group, level, args, selected_by_cls)
+        worker, per_class_sub_ids, per_class_chip_cids = _create_standalone_worker(
+            group, level, args, selected_by_cls
+        )
         try:
             for cls in group:
                 inst = cls()
                 callable_obj = inst.build_callable(args.platform)
                 sub_ids = per_class_sub_ids.get(cls, {})
+                chip_cids = per_class_chip_cids.get(cls, {})
+                # For L3: merge chip cids into callable_obj (replacing
+                # ChipCallable objects with their registered cid).
+                if level == 3 and chip_cids:
+                    callable_obj = {**chip_cids}
                 for case in selected_by_cls[cls]:
                     label = f"{cls.__name__}::{case['name']}"
                     print(f"  {label} ... ", end="", flush=True)
@@ -1557,11 +1578,15 @@ def _create_standalone_worker(group, level, args, selected_by_cls):
     ``max_sub_workers`` must be computed from these, not from ``cls.CASES``:
     otherwise a manual case with a larger ``device_count`` inflates the
     allocation even when it isn't scheduled.
+
+    Returns ``(worker, per_class_sub_ids, per_class_chip_cids)`` for both
+    L2 and L3 so the caller can unpack uniformly. L2 has neither sub
+    callables nor pre-registered chip callables, so both dicts are empty.
     """
     first_cls = group[0]
     build = getattr(args, "build", False)
     if level == 2:
-        return first_cls._create_worker(args.platform, args.device, build=build), {}
+        return first_cls._create_worker(args.platform, args.device, build=build), {}, {}
 
     from simpler.worker import Worker  # noqa: PLC0415
 
@@ -1590,12 +1615,24 @@ def _create_standalone_worker(group, level, args, selected_by_cls):
     )
     # Register sub callables per-class to avoid name collisions
     per_class_sub_ids: dict[type, dict] = {}
+    # Also register ChipCallables here (before init) so the chip children
+    # pre-warm them via _CTRL_PREPARE.
+    per_class_chip_cids: dict[type, dict] = {}
     for cls in group:
         cls_sub_ids = {}
+        cls_chip_cids = {}
         for entry in cls.CALLABLE.get("callables", []):
             if "callable" in entry:
                 cid = worker.register(entry["callable"])
                 cls_sub_ids[entry["name"]] = cid
+            elif "orchestration" in entry:
+                name = entry["name"]
+                cache_key = (cls.__qualname__, name, args.platform, cls._st_runtime)
+                chip = _compile_chip_callable_from_spec(entry, args.platform, cls._st_runtime, cache_key)
+                cid = worker.register(chip)
+                cls_chip_cids[name] = cid
+                cls_chip_cids[f"{name}_sig"] = entry["orchestration"].get("signature", [])
         per_class_sub_ids[cls] = cls_sub_ids
+        per_class_chip_cids[cls] = cls_chip_cids
 
     worker.init()
-    return worker, per_class_sub_ids
+    return worker, per_class_sub_ids, per_class_chip_cids
diff --git a/src/a2a3/platform/include/aicpu/orch_so_file.h b/src/a2a3/platform/include/aicpu/orch_so_file.h
index a305ab8fa..29318f5ea 100644
--- a/src/a2a3/platform/include/aicpu/orch_so_file.h
+++ b/src/a2a3/platform/include/aicpu/orch_so_file.h
@@ -39,10 +39,15 @@
  * Caller is expected to try the next candidate directory.
  *
  * @param dir            Candidate directory (e.g. "/tmp")
+ * @param callable_id    Per-callable_id table slot id (>= 0). Required for
+ *                       uniqueness on the onboard path so concurrently-
+ *                       resident orch SOs (one per cid) do not collide on
+ *                       the same on-disk file. Pass -1 for the legacy
+ *                       single-slot dispatch path.
 * @param out_path       Buffer that receives the full file path on success
 * @param out_path_size  Size of `out_path` in bytes
 * @return Open writable fd on success, -1 on failure
 */
-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size);
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size);
 
 #endif  // PLATFORM_AICPU_ORCH_SO_FILE_H_
diff --git a/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp b/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp
index 322cb7dcc..4e7f55232 100644
--- a/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp
+++ b/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp
@@ -15,10 +15,20 @@
 
 #include
 
-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) {
-  // Pid-based naming: AICPU device libc may lack mkstemps, and only one
-  // runtime runs per device process, so pid uniqueness is sufficient.
-  int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) {
+  // Pid + callable_id naming: AICPU device libc may lack mkstemps. With
+  // per-callable_id dispatch, multiple orch SOs can be resident in the
+  // same device process at once (one per cid in `orch_so_table_`), so
+  // the on-disk file name must be unique per cid — otherwise the
+  // second cid's `O_TRUNC` would silently shred the first cid's already
+  // dlopen'd file image and the next launch on cid=0 would SIGBUS.
+  // callable_id < 0 is the legacy single-slot path: pid alone is fine.
+  int32_t written;
+  if (callable_id >= 0) {
+    written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d_%d.so", dir, getpid(), callable_id);
+  } else {
+    written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+  }
   if (written < 0 || static_cast<size_t>(written) >= out_path_size) {
     return -1;
   }
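The same naming rule rendered in Python, purely for illustration (the authoritative implementation is the C++ above):

```python
import os

def orch_so_path(tmp_dir: str, callable_id: int) -> str:
    pid = os.getpid()
    if callable_id >= 0:   # per-cid slot: name must be unique per id
        return f"{tmp_dir}/libdevice_orch_{pid}_{callable_id}.so"
    return f"{tmp_dir}/libdevice_orch_{pid}.so"   # legacy single-slot path

# Two concurrently resident cids never share a path; a shared path would let
# the second open(O_TRUNC) shred the first cid's dlopen'd image.
assert orch_so_path("/tmp", 0) != orch_so_path("/tmp", 1)
```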
- runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); return 0; } @@ -761,7 +801,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) { LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size); - runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/false); + runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size); return 0; } @@ -797,11 +837,174 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { } cached_orch_so_hash_ = new_hash; - runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/true); + runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size); LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size); return 0; } +int DeviceRunner::register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, + std::vector<std::pair<int, uint64_t>> kernel_addrs +) { + // The AICPU executor reserves `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]` + // (declared in src/common/task_interface/callable_protocol.h) and indexes it by + // callable_id; rejecting an out-of-range id here keeps the host and + // AICPU sides in sync and avoids an OOB access at run time. + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (orch_so_data == nullptr || orch_so_size == 0) { + LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id); + return -1; + } + + const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size); + + // Hash dedup: share device buffer across callable_ids that carry the same + // SO bytes. Refcount drops in unregister_prepared_callable; we only free + // when the count hits zero. + auto buf_it = orch_so_dedup_.find(hash); + uint64_t dev_addr = 0; + if (buf_it == orch_so_dedup_.end()) { + void *buf = mem_alloc_.alloc(orch_so_size); + if (buf == nullptr) { + LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size); + return -1; + } + int rc = rtMemcpy(buf, orch_so_size, orch_so_data, orch_so_size, RT_MEMCPY_HOST_TO_DEVICE); + if (rc != 0) { + LOG_ERROR("register_prepared_callable: rtMemcpy failed: %d", rc); + mem_alloc_.free(buf); + return rc; + } + OrchSoBuffer entry; + entry.dev_addr = buf; + entry.capacity = orch_so_size; + entry.refcount = 1; + orch_so_dedup_.emplace(hash, entry); + dev_addr = reinterpret_cast<uint64_t>(buf); + LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size); + } else { + buf_it->second.refcount++; + dev_addr = reinterpret_cast<uint64_t>(buf_it->second.dev_addr); + LOG_INFO_V0( + "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount + ); + } + + PreparedCallableState state; + state.hash = hash; + state.dev_orch_so_addr = dev_addr; + state.dev_orch_so_size = orch_so_size; + state.func_name = (func_name != nullptr) ? func_name : ""; + state.config_name = (config_name != nullptr) ? config_name : ""; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + return 0; +} + +int DeviceRunner::register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector<std::pair<int, uint64_t>> kernel_addrs +) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id, + MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) { + LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id); + return -1; + } + + PreparedCallableState state; + state.host_dlopen_handle = host_dlopen_handle; + state.host_orch_func_ptr = host_orch_func_ptr; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + ++host_dlopen_total_; + LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_); + return 0; +} + +int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + return 0; + } + PreparedCallableState state = std::move(it->second); + prepared_callables_.erase(it); + aicpu_seen_callable_ids_.erase(callable_id); + + if (state.host_dlopen_handle != nullptr) { + // hbg path: no orch SO refcount, just dlclose the host handle. + dlclose(state.host_dlopen_handle); + return 0; + } + + auto buf_it = orch_so_dedup_.find(state.hash); + if (buf_it != orch_so_dedup_.end()) { + if (--buf_it->second.refcount <= 0) { + mem_alloc_.free(buf_it->second.dev_addr); + orch_so_dedup_.erase(buf_it); + } + } + return 0; +} + +bool DeviceRunner::has_prepared_callable(int32_t callable_id) const { + return prepared_callables_.count(callable_id) != 0; +} + +int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id); + return -1; + } + const auto &state = it->second; + + // Replay kernel addresses directly into runtime->func_id_to_addr_ without + // going through set_function_bin_addr. The latter records func_ids in + // registered_kernel_func_ids_, which validate_runtime_impl iterates to + // free kernel binaries — but prepared kernels must survive across runs. + for (const auto &kv : state.kernel_addrs) { + if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); + return -1; + } + runtime.replay_function_bin_addr(kv.first, kv.second); + } + // Replay both paths unconditionally — the runtime carries staging fields + // for both trb (device-side dlopen via entry-symbol names) and hbg (host- + // side dlopen handle + fn ptr). Whichever set was populated by + // register_prepared_callable / register_prepared_callable_host_orch wins; + // the other set stays at its initial value (empty string / nullptr).
+ runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; + runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; + runtime.set_device_orch_func_name(state.func_name.c_str()); + runtime.set_device_orch_config_name(state.config_name.c_str()); + // Stamp callable_id with is_new=false; prepare_orch_so refreshes the flag + // with the authoritative first_sighting answer right before launch. + runtime.set_active_callable_id(callable_id, /*is_new=*/false); + return 0; +} + int DeviceRunner::finalize() { if (device_id_ == -1) { return 0; @@ -821,17 +1024,27 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); - // Kernel binaries should have been removed by validate_runtime_impl() + // Kernel binaries are normally released by validate_runtime_impl on the + // legacy run() path. The prepared-callable path intentionally leaves + // them resident across runs (shared by func_id) and relies on + // finalize() to reclaim them; that is not a leak. Emit at DEBUG so the + // legacy regression signal is preserved for callers that never went + // through prepare_callable. if (!func_id_to_addr_.empty()) { - LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size()); - // Cleanup leaked binaries to prevent memory leaks + const bool prepared_path_used = prepared_callable_path_used_; + if (prepared_path_used) { + LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); + } else { + LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size()); + } for (const auto &pair : func_id_to_addr_) { void *gm_addr = reinterpret_cast(pair.second); mem_alloc_.free(gm_addr); - LOG_DEBUG("Freed leaked kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second); + LOG_DEBUG("Freed kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second); } } func_id_to_addr_.clear(); + func_id_to_hash_.clear(); binaries_loaded_ = false; // Release the cached orchestration SO buffer. @@ -844,6 +1057,29 @@ int DeviceRunner::finalize() { host_orch_so_copy_.clear(); host_orch_so_copy_.shrink_to_fit(); + // Release any prepared-callable orch SO buffers that callers forgot to + // unregister. Refcounts no longer matter at this point — the device is + // about to be reset. + for (auto &kv : orch_so_dedup_) { + if (kv.second.dev_addr != nullptr) { + mem_alloc_.free(kv.second.dev_addr); + } + } + orch_so_dedup_.clear(); + // hbg path: dlclose any host orch handles callers forgot to unregister. + // finalize() is the last chance; Worker.close() does not auto-unregister + // each callable_id, so without this loop the host process leaks one + // dlopen handle per (re)created Worker — observable in long-running + // pytest sessions. + for (auto &kv : prepared_callables_) { + if (kv.second.host_dlopen_handle != nullptr) { + dlclose(kv.second.host_dlopen_handle); + } + } + prepared_callables_.clear(); + aicpu_seen_callable_ids_.clear(); + aicpu_dlopen_total_ = 0; + // Cleanup performance profiling if (l2_perf_collector_.is_initialized()) { auto unregister_cb = [](void *dev_ptr, int device_id) -> int { @@ -1008,11 +1244,24 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data return 0; } - // Return cached callable address if already uploaded + // Return cached callable address if already uploaded *and* the new bytes + // match. 
With the prepared-callable path, multiple ChipCallables share a + // single ChipWorker (and DeviceRunner) and can pick distinct kernel + // binaries for the same func_id. Naively reusing the cached entry hands + // the AICore the previous callable's kernel: dispatch never completes + // the new task and the AICPU spins forever. + const uint64_t new_hash = simpler::common::utils::elf_build_id_64(bin_data, bin_size); auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { - LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id); - return it->second; + auto hash_it = func_id_to_hash_.find(func_id); + if (hash_it != func_id_to_hash_.end() && hash_it->second == new_hash) { + LOG_INFO_V0("Kernel func_id=%d already uploaded (matching hash), returning cached address", func_id); + return it->second; + } + LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id); + mem_alloc_.free(reinterpret_cast(it->second)); + func_id_to_addr_.erase(it); + func_id_to_hash_.erase(func_id); } LOG_DEBUG("Uploading kernel binary: func_id=%d, size=%zu bytes", func_id, bin_size); @@ -1042,6 +1291,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data } func_id_to_addr_[func_id] = callable_addr; + func_id_to_hash_[func_id] = new_hash; LOG_DEBUG(" func_id=%d -> callable_addr=0x%lx, binary_code_addr=0x%lx", func_id, callable_addr, binary_code_addr); @@ -1059,6 +1309,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) { mem_alloc_.free(gm_addr); func_id_to_addr_.erase(it); + func_id_to_hash_.erase(func_id); LOG_DEBUG("Removed kernel binary: func_id=%d, addr=0x%lx", func_id, function_bin_addr); } diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 8dd4dc816..c910e47c5 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include "common/kernel_args.h" @@ -420,6 +422,99 @@ class DeviceRunner { */ void release_run_context(); + /** + * Stage a per-callable_id orchestration SO into device memory and remember + * the supporting metadata (entry/config symbol names, kernel func_id ↔ + * dev_addr table). Identical SO bytes across two callable_ids share one + * device buffer (refcounted by hash) so the worst case for an N-cid pool + * is N distinct device buffers, not N copies of the same SO. + * + * @param callable_id Caller-stable id, must be in [0, MAX_REGISTERED_CALLABLE_IDS). + * @param orch_so_data Host pointer to orchestration SO bytes (owned by caller). + * @param orch_so_size Size of orchestration SO in bytes. + * @param func_name Entry symbol name (copied). + * @param config_name Config symbol name (copied). + * @param kernel_addrs func_id ↔ dev_addr pairs already uploaded by the + * caller. Stored verbatim so run_prepared can replay + * them onto a fresh Runtime without re-uploading. + * @return 0 on success, negative on failure. + */ + int register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, + const char *config_name, std::vector> kernel_addrs + ); + + /** + * Host-orchestration variant of register_prepared_callable: stores a + * dlopen handle + entry-symbol pointer that runtime_maker resolved on the + * host (host_build_graph variant). 
Mutually exclusive with the trb-shaped + * `register_prepared_callable` overload — exactly one is invoked for a + * given callable_id, picked by the C ABI based on which staging fields the + * runtime carries after prepare_callable_impl. dlopen handle is owned by + * DeviceRunner from this call onward and dlclose'd by + * unregister_prepared_callable. Increments `host_dlopen_count_`. + */ + int register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs + ); + + /** + * Drop the prepared state for `callable_id`. trb path: decrement the orch + * SO buffer's hash-keyed refcount and free when it hits zero. hbg path: + * dlclose the host dlopen handle. Kernel binaries are shared across + * callables and only released by finalize(). + * + * @param callable_id Id previously passed to one of the + * register_prepared_callable* overloads. + * @return 0 on success or if the id was not registered. + */ + int unregister_prepared_callable(int32_t callable_id); + + /** + * True iff `callable_id` has prepared state staged via + * register_prepared_callable. Lets the c_api layer reject `run_prepared` + * calls without a matching `prepare_callable`. + */ + bool has_prepared_callable(int32_t callable_id) const; + + /** + * Replay the prepared state for `callable_id` onto a freshly-constructed + * Runtime: restores kernel func_id ↔ dev_addr table, the orch entry/config + * symbol names, and stamps `runtime.set_active_callable_id` so the + * subsequent `run` dispatches via the AICPU per-cid table. The kernel + * addresses are written directly into func_id_to_addr_ (bypassing + * registered_kernel_func_ids_) so validate_runtime_impl will not free them + * — they survive until unregister_prepared_callable / finalize(). + * + * Marks the cid as seen so the upcoming prepare_orch_so resolves + * `register_new_callable_id_` correctly (true exactly on first sighting + * after registration). + * + * @return 0 on success, -1 if the cid is not registered. + */ + int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); + + /** + * Number of distinct callable_ids the AICPU has been asked to dlopen for. + * Monotonically increases on every first-sighting bind; `unregister_callable` + * does NOT decrement it. So a `prepare → run → unregister → re-prepare → run` + * sequence reports 2 (each AICPU dlopen counted once), even though only one + * cid is currently registered. Tests assert this to verify per-cid + * registration eliminates duplicate dlopens across repeated runs. + */ + size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + + /** + * Number of host-side dlopen() invocations triggered by + * `register_prepared_callable_host_orch`. Mirrors `aicpu_dlopen_count` but + * counts the host_build_graph variant's host-side dlopens; it never + * decrements (re-prepare after unregister still counts). Tests assert + * `host_dlopen_count == distinct_registered_cids` to verify the prepared + * path doesn't dlopen on every run. 
+ */ + size_t host_dlopen_count() const { return host_dlopen_total_; } + private: // Internal state int device_id_{-1}; @@ -441,6 +536,10 @@ class DeviceRunner { // Kernel binary management bool binaries_loaded_{false}; // true after AICPU SO loaded std::map<int, uint64_t> func_id_to_addr_; // func_id -> function_bin_addr (device GM) + // Parallel hash map for upload_kernel_binary() to detect when the same + // func_id is re-uploaded with different binary bytes (different + // ChipCallable sharing the same func_id under the per-callable_id path). + std::map<int, uint64_t> func_id_to_hash_; // Orchestration SO cache. `cached_orch_so_hash_ == 0` means "no cache". // The device buffer grows monotonically — cache miss with a larger SO @@ -451,6 +550,50 @@ size_t dev_orch_so_capacity_{0}; std::vector<uint8_t> host_orch_so_copy_; + // Per-callable_id prepared state. + // + // `prepared_callables_` maps the caller-stable callable_id to the orch + // SO slice + symbol names needed to launch it. `orch_so_dedup_` shares + // device buffers across callable_ids whose orch SO bytes have the same + // ELF Build-ID hash (refcounted; freed when the count hits zero). + // `aicpu_seen_callable_ids_` tracks which ids have already been delivered + // to the AICPU at least once so prepare_orch_so can set + // register_new_callable_id_ correctly on first sighting. + struct PreparedCallableState { + // trb path (AICPU dlopens orch SO from device buffer) + uint64_t hash{0}; + uint64_t dev_orch_so_addr{0}; + size_t dev_orch_so_size{0}; + std::string func_name; + std::string config_name; + // common + std::vector<std::pair<int, uint64_t>> kernel_addrs; + // hbg path (host already dlopen'd the orch SO) + void *host_dlopen_handle{nullptr}; + void *host_orch_func_ptr{nullptr}; + }; + struct OrchSoBuffer { + void *dev_addr{nullptr}; + size_t capacity{0}; + int refcount{0}; + }; + std::unordered_map<int32_t, PreparedCallableState> prepared_callables_; + std::unordered_map<uint64_t, OrchSoBuffer> orch_so_dedup_; + std::unordered_set<int32_t> aicpu_seen_callable_ids_; + // Monotonic count of AICPU dlopens triggered (incremented on each + // first-sighting bind; never decremented). Diverges from + // aicpu_seen_callable_ids_.size() once any cid is unregistered and + // re-prepared. Exposed via aicpu_dlopen_count() for tests. + size_t aicpu_dlopen_total_{0}; + // Monotonic count of host-side dlopens triggered (incremented on every + // register_prepared_callable_host_orch call; never decremented). Same + // re-prepare semantics as aicpu_dlopen_total_, but for hbg variants. + size_t host_dlopen_total_{0}; + // Sticky flag: prepare_callable was called at least once. Distinguishes + // legacy-path "kernel still cached at finalize" leaks from prepared-path + // kernels that legitimately live until finalize. + bool prepared_callable_path_used_{false}; + // ACL lifecycle (process-wide). aclInit must run exactly once; ensure_acl_ready // gates it behind this flag. finalize() drives aclFinalize only if we observed // acl_ready_, so runtimes that never ask for ACL (e.g. pure rt-layer) stay unaffected. @@ -498,8 +641,8 @@ class DeviceRunner { ); /** - * Populate runtime.{dev_orch_so_addr_, dev_orch_so_size_, has_new_orch_so_} - * from `runtime.pending_orch_so_data_` / `_size_`. + * Populate runtime.{dev_orch_so_addr_, dev_orch_so_size_} from + * `runtime.pending_orch_so_data_` / `_size_`. * * The host tracks the SO identity via a 64-bit hash derived from the ELF * GNU Build-ID.
When the hash matches the previous run, the device-side diff --git a/src/a2a3/platform/onboard/host/host_regs.cpp b/src/a2a3/platform/onboard/host/host_regs.cpp index 0a90e4b07..f519392e1 100644 --- a/src/a2a3/platform/onboard/host/host_regs.cpp +++ b/src/a2a3/platform/onboard/host/host_regs.cpp @@ -135,8 +135,11 @@ get_aicore_reg_info(std::vector<uint64_t> &aic, std::vector<uint64_t> &aiv, const /** * Get one flat AIC-then-AIV address array for the requested register kind. - * Returns a negative code on HAL failure; does NOT generate placeholder - * addresses (callers must treat failure as fatal for that kind). + * For Ctrl kind, falls back to placeholder addresses on HAL failure to + * preserve historical behavior on hardware where halMemCtl rejects + * ADDR_MAP_TYPE_REG_AIC_CTRL queries (the dispatch path does not actually + * dereference these addresses). For Pmu kind, propagates the HAL error so + * the caller can disable PMU collection cleanly. */ static int get_aicore_regs(std::vector<uint64_t> &regs, uint64_t device_id, AicoreRegKind kind) { std::vector<uint64_t> aic; @@ -144,8 +147,19 @@ static int get_aicore_regs(std::vector<uint64_t> &regs, uint64_t device_id, Aicor int rc = get_aicore_reg_info(aic, aiv, kind_to_addr_type(kind), device_id); if (rc != 0) { - LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d", kind_to_name(kind), rc); - return rc; + if (kind == AicoreRegKind::Ctrl) { + LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d, using placeholder addresses", kind_to_name(kind), rc); + aic.clear(); + aiv.clear(); + for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) { + aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000)); + aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000); + aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000); + } + } else { + LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d", kind_to_name(kind), rc); + return rc; + } } // AIC cores first, then AIV cores diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index e4b7d3b20..c647f4887 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -21,6 +21,8 @@ #include "task_args.h" #include + +#include #include #include "common/unified_log.h" @@ -39,7 +41,8 @@ extern "C" { /* =========================================================================== * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); +int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); +int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); /* =========================================================================== @@ -195,16 +198,137 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, +int finalize_device(DeviceContextHandle ctx) { + if (ctx == NULL) return -1; + try { + return static_cast<DeviceRunner *>(ctx)->finalize(); + } catch (...)
{ + return -1; + } +} + +/* =========================================================================== + * Internal helpers called from runtime_maker.cpp via Runtime.host_api + * =========================================================================== */ + +void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { + if (runtime == NULL) return; + Runtime *r = static_cast<Runtime *>(runtime); + r->record_tensor_pair(host_ptr, dev_ptr, size); +} + +void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { + if (ctx == NULL) return; + + // CANN dlog: derive from simpler logger choice unless ASCEND_GLOBAL_LOG_LEVEL + // is externally configured. + if (std::getenv("ASCEND_GLOBAL_LOG_LEVEL") == NULL) { + dlog_setlevel(-1, log_level, /*enableEvent*/ 0); + } + + HostLogger::get_instance().set_level(static_cast(log_level)); + HostLogger::get_instance().set_info_v(log_info_v); + + DeviceRunner *runner = static_cast<DeviceRunner *>(ctx); + runner->set_log_level(log_level); + runner->set_log_info_v(log_info_v); +} + +/* =========================================================================== + * Per-callable_id preparation + * =========================================================================== */ + +int prepare_callable( + DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, + size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size +) { + if (ctx == NULL || callable == NULL) return -1; + DeviceRunner *runner = static_cast<DeviceRunner *>(ctx); + + // AICPU/AICore executor binaries are only consumed by run()/run_prepared(); + // prepare_callable just uploads kernel + orch SO state. + (void)aicpu_binary; + (void)aicpu_size; + (void)aicore_binary; + (void)aicore_size; + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + auto tsd_guard = RAIIScopeGuard([]() { + pthread_setspecific(g_runner_key, nullptr); + }); + + try { + int rc = runner->prepare_run_context(device_id); + if (rc != 0) return rc; + auto run_context_guard = RAIIScopeGuard([runner]() { + runner->release_run_context(); + }); + + // Heap-allocate: hbg's Runtime carries 131072 Tasks → tens of MB, + // larger than the default thread stack. + std::unique_ptr<Runtime> r_owner = std::make_unique<Runtime>(); + Runtime *r = r_owner.get(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + rc = prepare_callable_impl(r, reinterpret_cast<const ChipCallable *>(callable)); + if (rc != 0) { + return rc; + } + + // Extract kernel func_id ↔ dev_addr pairs uploaded by prepare_callable_impl. + std::vector<std::pair<int, uint64_t>> kernel_addrs; + int kcount = r->get_registered_kernel_count(); + kernel_addrs.reserve(kcount); + for (int i = 0; i < kcount; i++) { + int fid = r->get_registered_kernel_func_id(i); + kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid)); + } + // Clear registered kernels so the Runtime destructor (or any accidental + // validate call) does NOT free the kernel binaries we just uploaded — + // they belong to the prepared state now.
+ r->clear_registered_kernels(); + + // Pick the path by inspecting which staging fields the runtime carries: + // hbg's prepare_callable_impl populates pending_host_dlopen_handle_; + // trb's leaves it null and instead populates pending_orch_so_data_ + + // device_orch_func_name_/config_name_. + if (r->pending_host_dlopen_handle_ != nullptr) { + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; + } else { + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + } + return rc; + } catch (...) { + return -1; + } +} + +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix ) { if (ctx == NULL || runtime == NULL) return -1; - if (aicpu_binary == NULL || aicpu_size == 0 || aicore_binary == NULL || aicore_size == 0) return -1; - DeviceRunner *runner = static_cast(ctx); + if (!runner->has_prepared_callable(callable_id)) { + LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id); + return -1; + } + pthread_once(&g_runner_key_once, create_runner_key); pthread_setspecific(g_runner_key, ctx); auto tsd_guard = RAIIScopeGuard([]() { @@ -226,11 +350,15 @@ int run_runtime( r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; - LOG_DEBUG("About to call init_runtime_impl, r=%p", (void *)r); - rc = init_runtime_impl( - r, reinterpret_cast(callable), reinterpret_cast(args) - ); - LOG_DEBUG("init_runtime_impl returned: %d", rc); + // Restore kernel addrs + orch symbol names + active_callable_id + rc = runner->bind_prepared_callable_to_runtime(*r, callable_id); + if (rc != 0) { + r->~Runtime(); + return rc; + } + + // Per-run binding (tensor args, GM heap, SM alloc) + rc = bind_prepared_to_runtime_impl(r, reinterpret_cast(args)); if (rc != 0) { r->set_gm_sm_ptr(nullptr); validate_runtime_impl(r); @@ -260,40 +388,31 @@ int run_runtime( } } -int finalize_device(DeviceContextHandle ctx) { +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { if (ctx == NULL) return -1; try { - return static_cast(ctx)->finalize(); + return static_cast(ctx)->unregister_prepared_callable(callable_id); } catch (...) { return -1; } } -/* =========================================================================== - * Internal helpers called from runtime_maker.cpp via Runtime.host_api - * =========================================================================== */ - -void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { - if (runtime == NULL) return; - Runtime *r = static_cast(runtime); - r->record_tensor_pair(host_ptr, dev_ptr, size); +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->aicpu_dlopen_count(); + } catch (...) 
{ + return 0; + } } -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { - if (ctx == NULL) return; - - // CANN dlog: derive from simpler logger choice unless ASCEND_GLOBAL_LOG_LEVEL - // is externally configured. - if (std::getenv("ASCEND_GLOBAL_LOG_LEVEL") == NULL) { - dlog_setlevel(-1, log_level, /*enableEvent*/ 0); +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; } - - HostLogger::get_instance().set_level(static_cast(log_level)); - HostLogger::get_instance().set_info_v(log_info_v); - - DeviceRunner *runner = static_cast(ctx); - runner->set_log_level(log_level); - runner->set_log_info_v(log_info_v); } } // extern "C" diff --git a/src/a2a3/platform/sim/aicpu/orch_so_file.cpp b/src/a2a3/platform/sim/aicpu/orch_so_file.cpp index 4da92d7de..114fe4826 100644 --- a/src/a2a3/platform/sim/aicpu/orch_so_file.cpp +++ b/src/a2a3/platform/sim/aicpu/orch_so_file.cpp @@ -24,10 +24,17 @@ #include -int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) { +int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) { // mkstemps: multiple sim workers can share a process, so names must be // unique per call. The "XXXXXX" template is replaced in-place. - int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir); + // callable_id is embedded purely for log readability (mkstemps already + // guarantees uniqueness regardless). + int32_t written; + if (callable_id >= 0) { + written = snprintf(out_path, out_path_size, "%s/libdevice_orch_cid%d_XXXXXX.so", dir, callable_id); + } else { + written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir); + } if (written < 0 || static_cast(written) >= out_path_size) { return -1; } diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 26db1e3d6..8cee9029e 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -36,6 +36,7 @@ #include "aicpu/platform_aicpu_affinity.h" #include "callable.h" +#include "callable_protocol.h" #include "utils/elf_build_id.h" #include "cpu_sim_context.h" #include "host/raii_scope_guard.h" @@ -673,13 +674,46 @@ void DeviceRunner::unload_executor_binaries() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { + // Per-callable_id path: mirror onboard. Bytes were staged at + // register_prepared_callable time; here we only stamp metadata onto + // the runtime and resolve `register_new_callable_id_` from first sighting. + const int32_t cid = runtime.get_active_callable_id(); + if (cid >= 0) { + auto it = prepared_callables_.find(cid); + if (it == prepared_callables_.end()) { + LOG_ERROR("prepare_orch_so: callable_id=%d not registered", cid); + return -1; + } + const auto &state = it->second; + // hbg: orch SO never crosses host/device — clear device-orch metadata + // and skip AICPU bookkeeping. See onboard/device_runner.cpp. 
+ if (state.host_dlopen_handle != nullptr) { + runtime.set_dev_orch_so(0, 0); + runtime.set_active_callable_id(cid, /*is_new=*/false); + return 0; + } + const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; + if (first_sighting) { + ++aicpu_dlopen_total_; + } + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size); + runtime.set_active_callable_id(cid, first_sighting); + runtime.pending_orch_so_data_ = nullptr; + runtime.pending_orch_so_size_ = 0; + LOG_INFO_V0( + "Orch SO prepared cid=%d hash=0x%lx %zu bytes (is_new=%d)", cid, state.hash, state.dev_orch_so_size, + first_sighting ? 1 : 0 + ); + return 0; + } + const void *host_so_data = runtime.pending_orch_so_data_; const size_t host_so_size = runtime.pending_orch_so_size_; runtime.pending_orch_so_data_ = nullptr; runtime.pending_orch_so_size_ = 0; if (host_so_data == nullptr || host_so_size == 0) { - runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); return 0; } @@ -687,7 +721,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) { LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size); - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/false); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); return 0; } @@ -715,11 +749,156 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { std::memcpy(dev_orch_so_buffer_, host_orch_so_copy_.data(), host_so_size); cached_orch_so_hash_ = new_hash; - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/true); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size); return 0; } +int DeviceRunner::register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, + std::vector> kernel_addrs +) { + // The AICPU executor reserves `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]` + // (declared in src/common/task_interface/callable_protocol.h) and indexes it by + // callable_id; rejecting an out-of-range id here keeps the host and + // AICPU sides in sync and avoids an OOB access at run time. + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (orch_so_data == nullptr || orch_so_size == 0) { + LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id); + return -1; + } + + const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size); + + auto buf_it = orch_so_dedup_.find(hash); + uint64_t dev_addr = 0; + if (buf_it == orch_so_dedup_.end()) { + void *buf = mem_alloc_.alloc(orch_so_size); + if (buf == nullptr) { + LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size); + return -1; + } + // Sim shares an address space with the simulated AICPU thread, so a + // plain memcpy is the moral equivalent of rtMemcpy on hardware. 
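Stepping back to the first-sighting bookkeeping above: both `is_new` and the dlopen counter hang off `std::unordered_set::insert(...).second`, which is true exactly once per id between registration and unregistration. A self-contained sketch of the documented semantics, with illustrative stand-ins for `aicpu_seen_callable_ids_` / `aicpu_dlopen_total_`:

```cpp
// The counter rises once per first sighting and never falls; erasing the id
// (unregister) re-arms the first-sighting condition, so re-prepare counts
// again — matching the aicpu_dlopen_count() doc comment.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <unordered_set>

int main() {
  std::unordered_set<int32_t> seen;  // stands in for aicpu_seen_callable_ids_
  size_t dlopen_total = 0;           // stands in for aicpu_dlopen_total_

  auto bind = [&](int32_t cid) {
    if (seen.insert(cid).second) ++dlopen_total;  // first sighting only
  };

  bind(7); bind(7);      // prepare once, run twice: one dlopen
  assert(dlopen_total == 1);
  seen.erase(7);         // unregister_prepared_callable
  bind(7);               // re-prepare + run
  assert(dlopen_total == 2);  // counted again, by design
  return 0;
}
```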
+ std::memcpy(buf, orch_so_data, orch_so_size); + OrchSoBuffer entry; + entry.dev_addr = buf; + entry.capacity = orch_so_size; + entry.refcount = 1; + orch_so_dedup_.emplace(hash, entry); + dev_addr = reinterpret_cast(buf); + LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size); + } else { + buf_it->second.refcount++; + dev_addr = reinterpret_cast(buf_it->second.dev_addr); + LOG_INFO_V0( + "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount + ); + } + + PreparedCallableState state; + state.hash = hash; + state.dev_orch_so_addr = dev_addr; + state.dev_orch_so_size = orch_so_size; + state.func_name = (func_name != nullptr) ? func_name : ""; + state.config_name = (config_name != nullptr) ? config_name : ""; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + return 0; +} + +int DeviceRunner::register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs +) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id, + MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) { + LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id); + return -1; + } + + PreparedCallableState state; + state.host_dlopen_handle = host_dlopen_handle; + state.host_orch_func_ptr = host_orch_func_ptr; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + ++host_dlopen_total_; + LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_); + return 0; +} + +int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + return 0; + } + PreparedCallableState state = std::move(it->second); + prepared_callables_.erase(it); + aicpu_seen_callable_ids_.erase(callable_id); + + if (state.host_dlopen_handle != nullptr) { + // hbg: dlclose the host handle; no orch SO refcount to decrement. 
+ dlclose(state.host_dlopen_handle); + return 0; + } + + auto buf_it = orch_so_dedup_.find(state.hash); + if (buf_it != orch_so_dedup_.end()) { + if (--buf_it->second.refcount <= 0) { + mem_alloc_.free(buf_it->second.dev_addr); + orch_so_dedup_.erase(buf_it); + } + } + return 0; +} + +bool DeviceRunner::has_prepared_callable(int32_t callable_id) const { + return prepared_callables_.count(callable_id) != 0; +} + +int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id); + return -1; + } + const auto &state = it->second; + for (const auto &kv : state.kernel_addrs) { + if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); + return -1; + } + runtime.replay_function_bin_addr(kv.first, kv.second); + } + runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; + runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; + runtime.set_device_orch_func_name(state.func_name.c_str()); + runtime.set_device_orch_config_name(state.config_name.c_str()); + runtime.set_active_callable_id(callable_id, /*is_new=*/false); + return 0; +} + int DeviceRunner::finalize() { // Skip if already finalized if (device_id_ == -1 && aicpu_so_handle_ == nullptr && aicore_so_handle_ == nullptr) { @@ -744,15 +923,22 @@ int DeviceRunner::finalize() { pmu_collector_.finalize(nullptr, free_cb, &mem_alloc_); } - // Kernel binaries should have been removed by validate_runtime_impl() + // Kernel binaries are normally released by validate_runtime_impl on the + // legacy run() path. The prepared-callable path intentionally leaves + // them resident across runs and relies on finalize() to reclaim them; + // that is not a leak. if (!func_id_to_addr_.empty()) { - LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); - // Cleanup leaked handles and host copies + const bool prepared_path_used = prepared_callable_path_used_; + if (prepared_path_used) { + LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); + } else { + LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); + } for (auto &pair : func_id_to_addr_) { MappedKernel &kernel = pair.second; if (kernel.dl_handle != nullptr) { dlclose(kernel.dl_handle); - LOG_DEBUG("Closed leaked kernel: func_id=%d", pair.first); + LOG_DEBUG("Closed kernel: func_id=%d", pair.first); } delete[] kernel.callable_buf; } @@ -769,6 +955,27 @@ int DeviceRunner::finalize() { host_orch_so_copy_.clear(); host_orch_so_copy_.shrink_to_fit(); + // Release any prepared-callable orch SO buffers callers forgot to drop. + for (auto &kv : orch_so_dedup_) { + if (kv.second.dev_addr != nullptr) { + mem_alloc_.free(kv.second.dev_addr); + } + } + orch_so_dedup_.clear(); + // hbg path: dlclose any host orch handles callers forgot to unregister. + // finalize() is the last chance; Worker.close() does not auto-unregister + // each callable_id, so without this loop the host process leaks one + // dlopen handle per (re)created Worker — observable in long-running + // pytest sessions. 
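The unregister path above and the finalize() backstop operate on the same hash-keyed refcount. A minimal model of that contract, with `malloc`/`free` standing in for `mem_alloc_.alloc`/`free` and all names illustrative rather than the real API:

```cpp
// Identical SO bytes share one buffer; release decrements; a finalize-style
// sweep reclaims whatever callers forgot to release.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <unordered_map>

struct Buf { void *dev_addr; int refcount; };
std::unordered_map<uint64_t, Buf> dedup;

void *acquire(uint64_t hash, size_t size) {  // ~register_prepared_callable
  auto it = dedup.find(hash);
  if (it != dedup.end()) { ++it->second.refcount; return it->second.dev_addr; }
  void *p = std::malloc(size);
  dedup.emplace(hash, Buf{p, 1});
  return p;
}

void release(uint64_t hash) {                // ~unregister_prepared_callable
  auto it = dedup.find(hash);
  if (it != dedup.end() && --it->second.refcount <= 0) {
    std::free(it->second.dev_addr);
    dedup.erase(it);
  }
}

int main() {
  void *a = acquire(0xabc, 64);              // cid 0
  void *b = acquire(0xabc, 64);              // cid 1, same SO bytes
  assert(a == b);                            // shared buffer
  release(0xabc);                            // cid 0 unregistered
  assert(dedup.at(0xabc).refcount == 1);     // buffer survives for cid 1
  // cid 1 never unregistered: the finalize-style sweep reclaims it anyway.
  for (auto &kv : dedup) std::free(kv.second.dev_addr);
  dedup.clear();
  return 0;
}
```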
+ for (auto &kv : prepared_callables_) { + if (kv.second.host_dlopen_handle != nullptr) { + dlclose(kv.second.host_dlopen_handle); + } + } + prepared_callables_.clear(); + aicpu_seen_callable_ids_.clear(); + aicpu_dlopen_total_ = 0; + // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); @@ -794,11 +1001,25 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data return 0; } - // Return cached callable address if already uploaded + // Return cached callable address if already uploaded *and* the new bytes + // match. With the prepared-callable path, multiple ChipCallables share a + // single ChipWorker (and hence DeviceRunner) and can pick distinct + // kernel binaries for the same func_id. Naively reusing the cached + // entry hands the AICore the previous callable's kernel and segfaults + // at dispatch. auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { - LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id); - return reinterpret_cast(it->second.callable_buf); + const auto &cached_callable = *reinterpret_cast(it->second.callable_buf); + const auto *new_callable = reinterpret_cast(bin_data); + if (cached_callable.binary_size() == new_callable->binary_size() && + std::memcmp(cached_callable.binary_data(), new_callable->binary_data(), new_callable->binary_size()) == 0) { + LOG_INFO_V0("Kernel func_id=%d already uploaded (matching bytes), returning cached address", func_id); + return reinterpret_cast(it->second.callable_buf); + } + LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id); + if (it->second.dl_handle != nullptr) dlclose(it->second.dl_handle); + delete[] it->second.callable_buf; + func_id_to_addr_.erase(it); } // Extract binary from CoreCallable envelope diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 210aeb9ba..994d92c3b 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include #include "common/core_type.h" @@ -210,6 +212,23 @@ class DeviceRunner { */ void remove_kernel_binary(int func_id); + int register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, + const char *config_name, std::vector> kernel_addrs + ); + // Host-orchestration sibling of register_prepared_callable; see + // src/a2a3/platform/onboard/host/device_runner.h for the contract. Sim + // shares the host-only dlopen path verbatim (no AICPU side effects). + int register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs + ); + int unregister_prepared_callable(int32_t callable_id); + bool has_prepared_callable(int32_t callable_id) const; + int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); + size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + size_t host_dlopen_count() const { return host_dlopen_total_; } + private: // Configuration int device_id_{-1}; @@ -232,6 +251,38 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; + // Per-callable_id prepared state. Mirrors onboard. 
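The `PreparedCallableState` struct declared next carries fields for both paths, and `host_dlopen_handle` doubles as the discriminator. A hedged sketch of that invariant with toy types, not the real declarations:

```cpp
// Non-null host_dlopen_handle means hbg (host-side orchestration); null means
// trb (device-side dlopen of the staged SO). The invariant holds because
// exactly one register_* overload runs for a given callable_id.
#include <cstddef>
#include <cstdint>

struct State {
  uint64_t dev_orch_so_addr{0};       // trb
  size_t dev_orch_so_size{0};         // trb
  void *host_dlopen_handle{nullptr};  // hbg (doubles as the tag)
};

enum class Path { Trb, Hbg };

Path active_path(const State &s) {
  return s.host_dlopen_handle != nullptr ? Path::Hbg : Path::Trb;
}

int main() {
  State trb;
  trb.dev_orch_so_addr = 0x1000;
  trb.dev_orch_so_size = 4096;
  State hbg;
  hbg.host_dlopen_handle = reinterpret_cast<void *>(0x1);  // stand-in handle
  return (active_path(trb) == Path::Trb && active_path(hbg) == Path::Hbg) ? 0 : 1;
}
```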
+ struct PreparedCallableState { + // trb path + uint64_t hash{0}; + uint64_t dev_orch_so_addr{0}; + size_t dev_orch_so_size{0}; + std::string func_name; + std::string config_name; + // common + std::vector<std::pair<int, uint64_t>> kernel_addrs; + // hbg path + void *host_dlopen_handle{nullptr}; + void *host_orch_func_ptr{nullptr}; + }; + struct OrchSoBuffer { + void *dev_addr{nullptr}; + size_t capacity{0}; + int refcount{0}; + }; + std::unordered_map<int32_t, PreparedCallableState> prepared_callables_; + std::unordered_map<uint64_t, OrchSoBuffer> orch_so_dedup_; + std::unordered_set<int32_t> aicpu_seen_callable_ids_; + size_t aicpu_dlopen_total_{0}; + size_t host_dlopen_total_{0}; + // Sticky flag: prepare_callable was called at least once in this + // DeviceRunner's lifetime. unregister_prepared_callable clears the maps + // above, so we cannot use them at finalize() time to decide whether a + // remaining func_id_to_addr_ entry is a legacy-path leak or a kernel + // legitimately staged by prepare_callable (which is owned until finalize + // by design). + bool prepared_callable_path_used_{false}; + // AICPU executor SO: load-once, matching onboard's binaries_loaded_ pattern. // The aicpu_executor g_aicpu_executor static lives inside the dlopen'd DSO; // reloading it destroys orch_so_handle_ and breaks the orch-SO cache-hit path. @@ -279,8 +330,8 @@ * Stage the orchestration SO bytes into a host-resident buffer that * `aicpu_executor` can dlopen. Identical contract to the onboard * version: `runtime.pending_orch_so_data_/size_` are consumed and - * `runtime.{dev_orch_so_addr_, dev_orch_so_size_, has_new_orch_so_}` - * are populated with the cache-aware result. + * `runtime.{dev_orch_so_addr_, dev_orch_so_size_}` are populated with + * the cache-aware result. */ int prepare_orch_so(Runtime &runtime); diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index b8315b31a..79b54bf51 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -22,6 +22,8 @@ #include #include + +#include #include #include "common/unified_log.h" @@ -35,7 +37,8 @@ extern "C" { /* =========================================================================== * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); +int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); +int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); /* =========================================================================== @@ -156,19 +159,149 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, +int finalize_device(DeviceContextHandle ctx) { + if (ctx == NULL) return -1; + try { + int rc = static_cast<DeviceRunner *>(ctx)->finalize(); + int dev = pto_cpu_sim_get_bound_device(); + if (dev >= 0) { + pto_cpu_sim_release_device(dev); + } + return rc; + } catch (...) { + return -1; + } +} + +/* =========================================================================== + * ACL lifecycle stubs.
Sim has no ACL / aclrtStream concept, so these + * no-op to satisfy the uniform host_runtime.so ABI (ChipWorker dlsym's the + * full extension surface unconditionally). The paired comm_init / barrier / + * destroy entry points already live in comm_sim.cpp. + * =========================================================================== */ + +int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) { + (void)ctx; + (void)device_id; + return 0; +} + +void *create_comm_stream_ctx(DeviceContextHandle ctx) { + (void)ctx; + return NULL; +} + +int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { + (void)ctx; + (void)stream; + return 0; +} + +/* =========================================================================== + * Internal helpers called from runtime_maker.cpp via Runtime.host_api + * =========================================================================== */ + +void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { + if (runtime == NULL) return; + Runtime *r = static_cast(runtime); + r->record_tensor_pair(host_ptr, dev_ptr, size); +} + +void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { + if (ctx == NULL) return; + // No CANN dlog on sim. + HostLogger::get_instance().set_level(static_cast(log_level)); + HostLogger::get_instance().set_info_v(log_info_v); + DeviceRunner *runner = static_cast(ctx); + runner->set_log_level(log_level); + runner->set_log_info_v(log_info_v); +} + +/* =========================================================================== + * Per-callable_id preparation + * =========================================================================== */ + +int prepare_callable( + DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, + size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size +) { + if (ctx == NULL || callable == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + (void)aicpu_binary; + (void)aicpu_size; + (void)aicore_binary; + (void)aicore_size; + (void)device_id; + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + + try { + // Heap-allocate the temp Runtime — sizeof(Runtime) is in the tens of MB + // for hbg variants (RUNTIME_MAX_TASKS=131072), well past the stack + // budget. unique_ptr keeps the cleanup symmetric on every exit. 
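The `prepare_callable` body below follows a harvest-then-disarm pattern: copy the kernel table out of the temporary Runtime, then clear its registration so teardown cannot free what was handed off. A toy illustration of just that pattern, with all names hypothetical:

```cpp
// A temporary owner registers resources; the caller copies the table out and
// clears the registration, so the owner's teardown has nothing left to free.
// Mirrors the clear_registered_kernels() step in prepare_callable.
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

struct TempOwner {
  std::vector<std::pair<int, uint64_t>> registered;  // func_id -> dev addr
  ~TempOwner() {
    // Teardown would free whatever is still registered (as
    // validate_runtime_impl does); after clear(), nothing remains.
  }
};

int main() {
  std::vector<std::pair<int, uint64_t>> harvested;
  {
    TempOwner owner;
    owner.registered = {{3, 0x4000}, {5, 0x8000}};
    harvested = owner.registered;  // hand off to the prepared state
    owner.registered.clear();      // disarm before the destructor runs
  }                                // owner destroyed; harvested addrs survive
  assert(harvested.size() == 2);
  return 0;
}
```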
+ std::unique_ptr r_owner = std::make_unique(); + Runtime *r = r_owner.get(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + int rc = prepare_callable_impl(r, reinterpret_cast(callable)); + if (rc != 0) { + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + std::vector> kernel_addrs; + int kcount = r->get_registered_kernel_count(); + kernel_addrs.reserve(kcount); + for (int i = 0; i < kcount; i++) { + int fid = r->get_registered_kernel_func_id(i); + kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid)); + } + r->clear_registered_kernels(); + + if (r->pending_host_dlopen_handle_ != nullptr) { + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; + } else { + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + } + pthread_setspecific(g_runner_key, nullptr); + return rc; + } catch (...) { + pthread_setspecific(g_runner_key, nullptr); + return -1; + } +} + +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix ) { if (ctx == NULL || runtime == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + if (!runner->has_prepared_callable(callable_id)) { + LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id); + return -1; + } pthread_once(&g_runner_key_once, create_runner_key); pthread_setspecific(g_runner_key, ctx); - DeviceRunner *runner = static_cast(ctx); try { - // Phase 1: placement new + build graph Runtime *r = new (runtime) Runtime(); r->host_api.device_malloc = device_malloc; r->host_api.device_free = device_free; @@ -177,9 +310,14 @@ int run_runtime( r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; - int rc = init_runtime_impl( - r, reinterpret_cast(callable), reinterpret_cast(args) - ); + int rc = runner->bind_prepared_callable_to_runtime(*r, callable_id); + if (rc != 0) { + r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + rc = bind_prepared_to_runtime_impl(r, reinterpret_cast(args)); if (rc != 0) { r->set_gm_sm_ptr(nullptr); validate_runtime_impl(r); @@ -188,15 +326,11 @@ int run_runtime( return rc; } - // Phase 2: publish diagnostics enablement to the DeviceRunner so run() - // and its helpers can read the three sub-features uniformly (via - // members, not Runtime / run() args). 
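`run_prepared` above constructs the Runtime with placement new into caller-owned `RuntimeHandle` storage and tears it down with an explicit destructor call on every exit path. A minimal sketch of that contract with a toy type, not the real Runtime:

```cpp
// The C API never owns the allocation: construct with placement new into the
// caller's bytes, destroy with an explicit destructor call, never delete.
#include <new>

struct Toy {
  int tasks{0};
  ~Toy() { /* release per-run resources */ }
};

int run_into(void *storage, bool fail_bind) {
  Toy *t = new (storage) Toy();  // construct into caller-owned storage
  if (fail_bind) {
    t->~Toy();                   // error path: explicit destroy, no delete
    return -1;
  }
  t->tasks = 1;
  t->~Toy();                     // success path: same explicit destroy
  return 0;
}

int main() {
  alignas(Toy) unsigned char storage[sizeof(Toy)];
  return run_into(storage, /*fail_bind=*/false);
}
```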
runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); runner->set_dump_tensor_enabled(enable_dump_tensor != 0); runner->set_pmu_enabled(enable_pmu); runner->set_output_prefix(output_prefix); - // Phase 3: launch std::vector aicpu_vec; std::vector aicore_vec; if (aicpu_binary != NULL && aicpu_size > 0) { @@ -213,7 +347,6 @@ int run_runtime( return rc; } - // Phase 4: finalize (copy results back) rc = validate_runtime_impl(r); r->~Runtime(); pthread_setspecific(g_runner_key, nullptr); @@ -224,62 +357,31 @@ int run_runtime( } } -int finalize_device(DeviceContextHandle ctx) { +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { if (ctx == NULL) return -1; try { - int rc = static_cast(ctx)->finalize(); - int dev = pto_cpu_sim_get_bound_device(); - if (dev >= 0) { - pto_cpu_sim_release_device(dev); - } - return rc; + return static_cast(ctx)->unregister_prepared_callable(callable_id); } catch (...) { return -1; } } -/* =========================================================================== - * ACL lifecycle stubs. Sim has no ACL / aclrtStream concept, so these - * no-op to satisfy the uniform host_runtime.so ABI (ChipWorker dlsym's the - * full extension surface unconditionally). The paired comm_init / barrier / - * destroy entry points already live in comm_sim.cpp. - * =========================================================================== */ - -int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) { - (void)ctx; - (void)device_id; - return 0; -} - -void *create_comm_stream_ctx(DeviceContextHandle ctx) { - (void)ctx; - return NULL; -} - -int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { - (void)ctx; - (void)stream; - return 0; -} - -/* =========================================================================== - * Internal helpers called from runtime_maker.cpp via Runtime.host_api - * =========================================================================== */ - -void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { - if (runtime == NULL) return; - Runtime *r = static_cast(runtime); - r->record_tensor_pair(host_ptr, dev_ptr, size); +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; + } } -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { - if (ctx == NULL) return; - // No CANN dlog on sim. - HostLogger::get_instance().set_level(static_cast(log_level)); - HostLogger::get_instance().set_info_v(log_info_v); - DeviceRunner *runner = static_cast(ctx); - runner->set_log_level(log_level); - runner->set_log_info_v(log_info_v); +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->aicpu_dlopen_count(); + } catch (...) { + return 0; + } } } // extern "C" diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index cf6618170..390ad3d19 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -276,31 +276,27 @@ extern "C" { #endif /** - * Initialize a pre-allocated runtime with dynamic orchestration. - * - * This function loads the orchestration SO from binary data via a temp file, - * resolves the orchestration function via dlsym, then calls it to build the - * task graph. 
- *  - Allocating device memory via device_malloc()
- *  - Copying data to device via copy_to_device()
- *  - Building the task graph
- *  - Recording tensor pairs via record_tensor_pair()
- *
- * @param runtime   Pointer to pre-constructed Runtime
- * @param callable  ChipCallable containing orch binary, func_name, and child kernels
- * @param orch_args Separated tensor/scalar arguments
- * @return 0 on success, -1 on failure
+ * Stage the per-callable resources for the host_build_graph variant: upload
+ * kernel binaries and dlopen the orchestration SO on the host. The dlopen
+ * handle and resolved entry-symbol pointer are parked on the runtime via
+ * `pending_host_dlopen_handle_` / `pending_host_orch_func_ptr_` so the
+ * platform layer can hoist them into PreparedCallableState. Splitting this
+ * out of init_runtime_impl is what the hbg prepare_callable / run_prepared
+ * path rests on — the dlopen runs once per cid instead of every run.
  */
-int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) {
-    // Validate inputs
+int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) {
     if (runtime == nullptr) {
         LOG_ERROR("Runtime pointer is null");
         return -1;
     }
+    if (callable == nullptr) {
+        LOG_ERROR("Callable pointer is null");
+        return -1;
+    }
 
     // Register kernel binaries from ChipCallable children
     if (callable->child_count() > 0) {
-        LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count());
+        LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count());
         for (int32_t i = 0; i < callable->child_count(); i++) {
             int func_id = callable->child_func_id(i);
             if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
@@ -329,7 +325,9 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
         return -1;
     }
 
-    // Load orchestration SO from binary data via temp file
+    // Load orchestration SO from binary data via temp file. Held open across
+    // the lifetime of the prepared callable; closed by
+    // DeviceRunner::unregister_prepared_callable.
     std::string fd_path;
     if (!create_temp_so_file(orch_so_binary, orch_so_size, &fd_path)) {
         LOG_ERROR("Failed to create temp SO file");
@@ -343,7 +341,7 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
         return -1;
     }
 
-    dlerror();  // Clear any existing error
+    dlerror();
     OrchestrationFunc orch_func = reinterpret_cast<OrchestrationFunc>(dlsym(handle, orch_func_name));
     const char *dlsym_error = dlerror();
     if (dlsym_error != nullptr) {
@@ -354,11 +352,42 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
 
     LOG_INFO_V0("Loaded orchestration function: %s", orch_func_name);
 
-    // Clear any previous tensor pairs
+    runtime->pending_host_dlopen_handle_ = handle;
+    runtime->pending_host_orch_func_ptr_ = reinterpret_cast<void *>(orch_func);
+    // hbg never uploads orch SO bytes to the device; clear the trb staging
+    // fields so DeviceRunner::register_prepared_callable cannot mistake this
+    // for a trb-shaped registration.
+    runtime->pending_orch_so_data_ = nullptr;
+    runtime->pending_orch_so_size_ = 0;
+    return 0;
+}
+
+/**
+ * Per-run binding for hbg: invoke the previously-resolved orchestration entry
+ * point against the supplied args, then upload tensor info / allocation
+ * storage. Assumes prepare_callable_impl populated
+ * `pending_host_orch_func_ptr_` (either freshly during prepare_callable, or
+ * via DeviceRunner::bind_prepared_callable_to_runtime when run_prepared
+ * replays a prepared cid onto a fresh Runtime).
+ */
+int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
+    if (orch_args == nullptr) {
+        LOG_ERROR("orch_args pointer is null");
+        return -1;
+    }
+    OrchestrationFunc orch_func = reinterpret_cast<OrchestrationFunc>(runtime->pending_host_orch_func_ptr_);
+    if (orch_func == nullptr) {
+        LOG_ERROR("bind_prepared_to_runtime_impl: host orch_func pointer is null");
+        return -1;
+    }
+
+    runtime->clear_tensor_pairs();
 
     LOG_INFO_V0("=== Calling Orchestration Function ===");
-    LOG_DEBUG(
         "Args count: %d (%d tensors + %d scalars)", orch_args->tensor_count() + orch_args->scalar_count(),
         orch_args->tensor_count(), orch_args->scalar_count()
@@ -370,13 +399,10 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
         &k_orchestration_runtime_ops, runtime, &tensor_info_builder, &tensor_allocation_builder
     };
 
-    // Call orchestration function to build task graph
-    // The orchestration function handles device memory allocation and copy-to-device
     int rc = orch_func(reinterpret_cast(&orchestration_runtime), *orch_args);
     if (rc != 0) {
         LOG_ERROR("Orchestration function failed with code %d", rc);
         runtime->clear_tensor_pairs();
-        dlclose(handle);
         return rc;
     }
 
@@ -384,7 +410,6 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
     if (rc != 0) {
         LOG_ERROR("Failed to upload tensor allocations: %d", rc);
         runtime->clear_tensor_pairs();
-        dlclose(handle);
         return rc;
     }
 
@@ -396,16 +421,10 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
         runtime->clear_tensor_allocation_storage();
     }
     runtime->clear_tensor_pairs();
-    dlclose(handle);
     return rc;
 }
 
     LOG_INFO_V0("Runtime initialized. Ready for execution from Python.");
-
-    // Host orchestration is complete once orch_func returns. The task graph now
-    // lives in Runtime, so the orchestration SO can be closed immediately.
-    dlclose(handle);
-
     return 0;
 }
diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
index 46b673878..25d25dc76 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
@@ -434,6 +434,19 @@ class Runtime {
      */
     void set_function_bin_addr(int func_id, uint64_t addr);
 
+    /**
+     * Replay a previously-uploaded kernel address onto a fresh Runtime
+     * without recording it in registered_kernel_func_ids_. Used by
+     * DeviceRunner::bind_prepared_callable_to_runtime when restoring kernels
+     * across run_prepared invocations: the prepared callable owns the
+     * kernel binaries' device memory until unregister, so
+     * validate_runtime_impl must NOT free them.
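+     *
+     * Editor's illustration (hypothetical values, not part of the patch):
+     * after
+     *   replay_function_bin_addr(5, 0x1000);
+     * get_function_bin_addr(5) returns 0x1000, yet
+     * get_registered_kernel_count() is unchanged, so validate-time kernel
+     * cleanup never sees func_id 5.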
+     */
+    void replay_function_bin_addr(int func_id, uint64_t addr) {
+        if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return;
+        func_id_to_addr_[func_id] = addr;
+    }
+
     int get_registered_kernel_count() const { return registered_kernel_count_; }
 
     int get_registered_kernel_func_id(int index) const {
@@ -451,24 +464,67 @@ class Runtime {
     // NOTE: Placed at end of class to avoid affecting device memory layout
     HostApi host_api;
 
-    // Device orchestration SO metadata: device buffer + dirty flag (host
+    // Device orchestration SO metadata: device buffer pointer + size (host
     // populates these via DeviceRunner::prepare_orch_so before launch).
     // host_build_graph runtime variant currently does not load device
     // orchestration SOs, but DeviceRunner is shared with the other variants
     // and unconditionally writes these fields, so they must exist.
     uint64_t dev_orch_so_addr_{0};
     uint64_t dev_orch_so_size_{0};
-    bool has_new_orch_so_{false};
+
+    // Per-callable_id dispatch. hbg orch runs on host, so AICPU never reads
+    // `active_callable_id_`; the field exists for parity with the
+    // shared platform layer (DeviceRunner stamps it on every run).
+    int32_t active_callable_id_{-1};
+    bool register_new_callable_id_{false};
 
     // Host-only staging fields (mirror tensormap_and_ringbuffer variant).
     const void *pending_orch_so_data_{nullptr};
     size_t pending_orch_so_size_{0};
 
-    void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) {
+    // Host-orchestration staging (hbg path). prepare_callable_impl
+    // dlopens the orch SO on the host and parks the handle + entry-symbol
+    // pointer here so DeviceRunner::register_prepared_callable_host_orch can
+    // claim them; bind_prepared_callable_to_runtime restores them onto a fresh
+    // Runtime so bind_prepared_to_runtime_impl can call orch_func without a
+    // second dlopen. Distinct from `pending_orch_so_data_` (which is unused on
+    // hbg — host orchestration never uploads the SO bytes to the device).
+    void *pending_host_dlopen_handle_{nullptr};
+    void *pending_host_orch_func_ptr_{nullptr};
+
+    // Device-orchestration entry/config symbol names (trb path). Always
+    // empty on this hbg variant — included for API parity so the shared
+    // platform layer can call set_device_orch_func_name unconditionally.
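+    // (Editor's note on the 64-byte buffers below: the strncpy setters
+    // always NUL-terminate and silently truncate, so, for a hypothetical
+    // 100-char symbol name,
+    //   set_device_orch_func_name(std::string(100, 'x').c_str());
+    //   strlen(get_device_orch_func_name()) == 63
+    // holds by construction.)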
+    char device_orch_func_name_[64]{};
+    char device_orch_config_name_[64]{};
+
+    void set_device_orch_func_name(const char *name) {
+        device_orch_func_name_[0] = '\0';
+        if (name) {
+            strncpy(device_orch_func_name_, name, sizeof(device_orch_func_name_) - 1);
+            device_orch_func_name_[sizeof(device_orch_func_name_) - 1] = '\0';
+        }
+    }
+    const char *get_device_orch_func_name() const { return device_orch_func_name_; }
+    void set_device_orch_config_name(const char *name) {
+        device_orch_config_name_[0] = '\0';
+        if (name) {
+            strncpy(device_orch_config_name_, name, sizeof(device_orch_config_name_) - 1);
+            device_orch_config_name_[sizeof(device_orch_config_name_) - 1] = '\0';
+        }
+    }
+    const char *get_device_orch_config_name() const { return device_orch_config_name_; }
+
+    void set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
         dev_orch_so_addr_ = dev_addr;
         dev_orch_so_size_ = size;
-        has_new_orch_so_ = is_new;
     }
+    void set_active_callable_id(int32_t callable_id, bool is_new) {
+        active_callable_id_ = callable_id;
+        register_new_callable_id_ = is_new;
+    }
+    int32_t get_active_callable_id() const { return active_callable_id_; }
+    bool register_new_callable_id() const { return register_new_callable_id_; }
 };
 
 #endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index ab795b6f8..a15584829 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -24,6 +24,7 @@
 #include "aicpu/device_time.h"
 #include "aicpu/orch_so_file.h"
+#include "callable_protocol.h"
 #include "pto2_dispatch_payload.h"
 #include "runtime.h"
 #include "spin_hint.h"
@@ -89,6 +90,23 @@ static int32_t read_pto2_runtime_status(Runtime *runtime) {
 
 static PTO2Runtime *rt{nullptr};
 
+// Per-callable_id orchestration SO table. The executor dispatches
+// `orch_so_table_[active_callable_id_]` (created on first sighting of
+// that callable_id, kept warm across runs).
+// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values
+// (mailbox uint32 callable_id, register() returns small ints) and is shared
+// with the host bounds check in DeviceRunner::register_prepared_callable —
+// see src/common/task_interface/callable_protocol.h.
+
+struct OrchSoEntry {
+    bool in_use{false};
+    void *handle{nullptr};
+    char path[256]{};
+    DeviceOrchestrationFunc func{nullptr};
+    DeviceOrchestrationBindRuntimeFunc bind{nullptr};
+    DeviceOrchestrationConfigFunc config_func{nullptr};
+};
+
 struct AicpuExecutor {
     int32_t sched_thread_num_;
     bool orch_to_sched_{false};
@@ -107,16 +125,15 @@ struct AicpuExecutor {
     std::atomic<int32_t> finished_count_{0};
     std::atomic<bool> runtime_init_ready_{false};
 
-    // Orchestration SO handle - defer dlclose until all tasks complete
-    void *orch_so_handle_{nullptr};
-    char orch_so_path_[256]{};  // Path to orchestration SO file for cleanup
-
-    // Shared orchestration function pointer (loaded by first orch thread, used by all)
-    DeviceOrchestrationFunc orch_func_{nullptr};
-    DeviceOrchestrationBindRuntimeFunc orch_bind_runtime_{nullptr};
-    DeviceOrchestrationConfigFunc orch_config_func_{nullptr};
+    // Cached orch args pointer set by the orchestration thread before scheduler
+    // init; consumed by the (*p_func)(*orch_args_cached_) invocation below.
     const ChipStorageTaskArgs *orch_args_cached_{nullptr};
 
+    // Per-callable_id table. Single orch thread today, so first-write/read
+    // race is not possible; if multiple orch threads are ever introduced,
+    // guard the in_use=false→true transition with a mutex.
+    OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS];
+
     // ===== Scheduler context (owns all dispatch/completion/drain state) =====
     SchedulerContext sched_ctx_;
@@ -126,15 +143,14 @@ struct AicpuExecutor {
     void deinit(Runtime *runtime);
 
     ~AicpuExecutor() {
-        // Process-wide teardown (the single static instance dies here). The
-        // handle is otherwise kept alive across runs for cache-hit reuse.
-        if (orch_so_handle_ != nullptr) {
-            dlclose(orch_so_handle_);
-            orch_so_handle_ = nullptr;
-        }
-        if (orch_so_path_[0] != '\0') {
-            unlink(orch_so_path_);
-            orch_so_path_[0] = '\0';
+        // Process-wide teardown (the single static instance dies here). Every
+        // in-use callable_id slot is dlclose()'d here; each is otherwise kept
+        // alive across runs for cache-hit reuse.
+        for (auto &e : orch_so_table_) {
+            if (!e.in_use) continue;
+            if (e.handle != nullptr) dlclose(e.handle);
+            if (e.path[0] != '\0') unlink(e.path);
+            e = OrchSoEntry{};
         }
     }
 };
@@ -197,29 +213,37 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         if (runtime->get_orch_built_on_host()) {
             LOG_INFO_V0("Thread %d: Host orchestration mode, no-op", thread_idx);
         } else {
-            // Two paths:
-            //   1) has_new_orch_so == true  → host believes the SO identity
-            //      changed, so we drop the cached handle (if any), write the
-            //      new bytes to disk, and dlopen + dlsym a fresh handle.
-            //   2) has_new_orch_so == false → host detected a cache hit, so
-            //      we reuse `orch_so_handle_` / `orch_func_` / `orch_bind_runtime_`
-            //      from the previous run untouched. sm_handle / rt below are
-            //      always recreated because they bind this run's memory.
-            const bool reload_so = runtime->has_new_orch_so();
+            // Per-callable_id dispatch: the orch SO state lives in
+            // `orch_so_table_[callable_id]` keyed by registration order;
+            // reload is governed by `register_new_callable_id_`.
+            const int32_t callable_id = runtime->get_active_callable_id();
+            if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+                LOG_ERROR(
+                    "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS
+                );
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+            void **p_handle = &orch_so_table_[callable_id].handle;
+            char *p_path = orch_so_table_[callable_id].path;
+            DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func;
+            DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind;
+            DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
+            const bool reload_so = runtime->register_new_callable_id();
             if (reload_so) {
-                LOG_INFO_V0("Thread %d: New orch SO detected, (re)loading", thread_idx);
-                if (orch_so_handle_ != nullptr) {
-                    dlclose(orch_so_handle_);
-                    orch_so_handle_ = nullptr;
-                    orch_func_ = nullptr;
-                    orch_bind_runtime_ = nullptr;
-                    if (orch_so_path_[0] != '\0') {
+                LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id);
+                if (*p_handle != nullptr) {
+                    dlclose(*p_handle);
+                    *p_handle = nullptr;
+                    *p_func = nullptr;
+                    *p_bind = nullptr;
+                    if (p_path[0] != '\0') {
                         // Unlink the old file so the new open() lands on a
                         // fresh inode — protects against SIGBUS / ETXTBSY when
                         // the kernel still has the old mapping pinned.
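                        // (Editor's illustration, not part of the patch: if
                        // the reload skipped this unlink and reopened the
                        // same name with O_TRUNC, it would zero pages still
                        // mapped by the previous dlopen image, and faulting
                        // them back in is exactly the SIGBUS case above.)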
-                        unlink(orch_so_path_);
-                        orch_so_path_[0] = '\0';
+                        unlink(p_path);
+                        p_path[0] = '\0';
                     }
                 }
@@ -242,7 +266,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);
 
                 for (int32_t i = 0; i < num_candidates && !file_created; i++) {
-                    int32_t fd = create_orch_so_file(candidate_dirs[i], so_path, sizeof(so_path));
+                    int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path));
                     if (fd < 0) {
                         LOG_INFO_V0(
                             "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
@@ -281,6 +305,14 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 }
                 LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
 
+                // Unlink the on-disk SO immediately: dlopen has already mmap'd
+                // the image, so the kernel keeps the inode alive until the
+                // matching dlclose / process exit. This prevents stale
+                // libdevice_orch_<pid>_<cid>.so files from accumulating in
+                // /tmp when child processes exit via os._exit(0), which skips
+                // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
+                unlink(so_path);
+
                 const char *entry_symbol = runtime->get_device_orch_func_name();
                 if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
                     entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
@@ -333,15 +365,21 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                     bind_runtime_func = nullptr;
                 }
 
-                orch_so_handle_ = handle;
-                orch_func_ = orch_func;
-                orch_bind_runtime_ = bind_runtime_func;
-                orch_config_func_ = config_func;
-                snprintf(orch_so_path_, sizeof(orch_so_path_), "%s", so_path);
+                *p_handle = handle;
+                *p_func = orch_func;
+                *p_bind = bind_runtime_func;
+                *p_config_func = config_func;
+                snprintf(p_path, 256, "%s", so_path);
+                orch_so_table_[callable_id].in_use = true;
             } else {
-                LOG_INFO_V0("Thread %d: Reusing cached orch SO handle=%p", thread_idx, orch_so_handle_);
-                if (orch_so_handle_ == nullptr || orch_func_ == nullptr) {
-                    LOG_ERROR("Thread %d: has_new_orch_so=false but no cached SO handle/func", thread_idx);
+                LOG_INFO_V0(
+                    "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id
+                );
+                if (*p_handle == nullptr || *p_func == nullptr) {
+                    LOG_ERROR(
+                        "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx,
+                        callable_id
+                    );
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -349,8 +387,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             }
 
             // Validate arg count on every run (reload or cache hit).
-            if (orch_config_func_ != nullptr) {
-                PTO2OrchestrationConfig cfg = orch_config_func_(runtime->get_orch_args());
+            if (*p_config_func != nullptr) {
+                PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args());
                 LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count);
                 if (cfg.expected_arg_count > 0) {
                     const ChipStorageTaskArgs &args_validate = runtime->get_orch_args();
@@ -361,17 +399,18 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                         cfg.expected_arg_count
                     );
                     // Clean up cached state so a subsequent run does a full reload.
-                    if (orch_so_handle_ != nullptr) {
-                        dlclose(orch_so_handle_);
-                        orch_so_handle_ = nullptr;
+                    if (*p_handle != nullptr) {
+                        dlclose(*p_handle);
+                        *p_handle = nullptr;
                     }
-                    if (orch_so_path_[0] != '\0') {
-                        unlink(orch_so_path_);
-                        orch_so_path_[0] = '\0';
+                    if (p_path[0] != '\0') {
+                        unlink(p_path);
+                        p_path[0] = '\0';
                     }
-                    orch_func_ = nullptr;
-                    orch_bind_runtime_ = nullptr;
-                    orch_config_func_ = nullptr;
+                    *p_func = nullptr;
+                    *p_bind = nullptr;
+                    *p_config_func = nullptr;
+                    orch_so_table_[callable_id].in_use = false;
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -473,11 +512,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         orch_cycle_start = get_sys_cnt_aicpu();
 #endif
         framework_bind_runtime(rt);
-        if (orch_bind_runtime_ != nullptr) {
-            orch_bind_runtime_(rt);
+        if (*p_bind != nullptr) {
+            (*p_bind)(rt);
         }
         rt_scope_begin(rt);
-        orch_func_(*orch_args_cached_);
+        (*p_func)(*orch_args_cached_);
         rt_scope_end(rt);
 #if PTO2_PROFILING
         uint64_t orch_cycle_end = get_sys_cnt_aicpu();
@@ -633,13 +672,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     if (prev_finished + 1 == thread_num_) {
         finished_.store(true, std::memory_order_release);
         // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
-        // always tear them down here, but we keep orch_so_handle_ alive for
-        // the next run's cache-hit reuse (see run() reload_so branch).
+        // always tear them down here, but we keep the per-cid orch SO entries
+        // alive for the next run's cache-hit reuse (see run() reload_so branch).
         if (!runtime->get_orch_built_on_host() && rt != nullptr) {
             // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt.
+            const int32_t callable_id = runtime->get_active_callable_id();
             framework_bind_runtime(nullptr);
-            if (orch_bind_runtime_ != nullptr) {
-                orch_bind_runtime_(nullptr);
+            if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) {
+                DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind;
+                if (bind != nullptr) {
+                    bind(nullptr);
+                }
             }
             runtime_destroy(rt);
         }
@@ -665,10 +708,9 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     orch_to_sched_ = false;
     orch_args_cached_ = nullptr;
 
-    // orch_so_handle_ / orch_func_ / orch_bind_runtime_ / orch_config_func_ / orch_so_path_ are
-    // intentionally preserved across deinit: the next run reuses them when
-    // has_new_orch_so() == false. The destructor releases them at process
-    // teardown.
+    // orch_so_table_ entries are intentionally preserved across deinit: the
+    // next run reuses cached handles when register_new_callable_id() returns
+    // false. The destructor releases them at process teardown.
 
     // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit)
     rt = nullptr;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 5b1ca640b..b93ac103b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -92,31 +92,29 @@ static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader
 }
 
 /**
- * Initialize a pre-allocated runtime for device orchestration.
+ * Stage the per-callable resources (kernel binaries + orchestration SO) into
+ * the supplied runtime so a subsequent bind_prepared_to_runtime_impl can use
+ * them. This is the cacheable half of init_runtime_impl: nothing here depends
+ * on per-run argument values, so the prepare_callable / run_prepared split
+ * lets us run this once per callable_id and amortize across runs.
  *
- * For rt2 runtime, orchestration runs on AICPU thread 3 (device-side).
- * This function:
- *  - Copies tensor metadata and replaces host pointers with device pointers
- *  - Copies all tensor data to device
- *  - Records all tensors for copy-back
- *  - Copies orchestration SO to device memory
- *  - Sets up runtime state for device orchestration
- *
- * @param runtime   Pointer to pre-constructed Runtime
- * @param callable  ChipCallable containing orch binary, func_name, and child kernels
- * @param orch_args Separated tensor/scalar arguments
+ * @param runtime  Pointer to pre-constructed Runtime (host_api populated)
+ * @param callable ChipCallable carrying the orch SO + child kernel binaries
  * @return 0 on success, -1 on failure
  */
-extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) {
-    // Validate inputs
+extern "C" int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) {
     if (runtime == nullptr) {
         LOG_ERROR("Runtime pointer is null");
         return -1;
     }
+    if (callable == nullptr) {
+        LOG_ERROR("Callable pointer is null");
+        return -1;
+    }
 
     // Register kernel binaries from ChipCallable children
     if (callable->child_count() > 0) {
-        LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count());
+        LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count());
        for (int32_t i = 0; i < callable->child_count(); i++) {
             int func_id = callable->child_func_id(i);
             if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
@@ -146,6 +144,32 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,
         return -1;
     }
 
+    // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume.
+    runtime->pending_orch_so_data_ = orch_so_binary;
+    runtime->pending_orch_so_size_ = orch_so_size;
+    LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size);
+    return 0;
+}
+
+/**
+ * Per-run binding: build device-side argument storage (tensor copy-out, GM
+ * heap, PTO2 shared memory) and publish it to the runtime. Assumes the
+ * callable-side state (kernel binaries, orch SO bytes, func/config names)
+ * is already populated by prepare_callable_impl.
+ *
+ * Splitting this from prepare_callable_impl matches the per-callable_id
+ * design: register/run_prepared invokes this every call, while the prep
+ * half runs only once per callable_id.
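+ *
+ * Editor's sketch of the amortized sequence (hypothetical args; in the real
+ * run_prepared each run first re-binds onto a fresh placement-new Runtime):
+ *
+ *   prepare_callable_impl(r, callable);        // once: kernels + SO staged
+ *   bind_prepared_to_runtime_impl(r, args_a);  // run 1: per-run storage only
+ *   bind_prepared_to_runtime_impl(r, args_b);  // run 2: no SO re-staging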
+ *
+ * @param runtime   Pointer to pre-constructed Runtime (host_api populated)
+ * @param orch_args Separated tensor/scalar arguments for this run
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
     if (orch_args == nullptr) {
         LOG_ERROR("orch_args pointer is null");
         return -1;
@@ -153,7 +177,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,
 
     int tensor_count = orch_args->tensor_count();
     int scalar_count = orch_args->scalar_count();
-    LOG_INFO_V0("RT2 init: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
+    LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
 
     int64_t t_total_start = _now_ms();
 
@@ -196,16 +220,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,
     }
     int64_t t_args_end = _now_ms();
 
-    // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume.
-    // DeviceRunner hashes the bytes, skips the rtMemcpy when the identity is
-    // unchanged, and overwrites dev_orch_so_addr_ / size / has_new_orch_so_
-    // on Runtime before the struct is sent to device.
-    int64_t t_so_start = _now_ms();
-    runtime->pending_orch_so_data_ = orch_so_binary;
-    runtime->pending_orch_so_size_ = orch_so_size;
-    LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size);
-    int64_t t_so_end = _now_ms();
-
     // Read ready queue shard count from environment for AICPU scheduler
     {
         const char *env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS");
@@ -285,7 +299,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,
     int64_t t_total_end = _now_ms();
 
     LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
-    LOG_INFO_V0("TIMING: orch_so_copy = %" PRId64 "ms", t_so_end - t_so_start);
     LOG_INFO_V0("TIMING: gm_heap_alloc(1GB) = %" PRId64 "ms", t_heap_end - t_heap_start);
     LOG_INFO_V0("TIMING: shared_mem_alloc = %" PRId64 "ms", t_sm_end - t_sm_start);
     LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 105f1601f..ad70a259a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -189,12 +189,14 @@ class Runtime {
     // Device orchestration SO (for dlopen on AICPU thread 3).
     // The SO bytes themselves live in a separately-allocated device buffer
     // owned by DeviceRunner; only the metadata below travels inside Runtime.
-    // `has_new_orch_so_` tells AICPU whether the host believes the SO identity
-    // changed since the previous run — when false AICPU reuses its cached
-    // dlopen handle and skips writing the file again.
     uint64_t dev_orch_so_addr_;
     uint64_t dev_orch_so_size_;
-    bool has_new_orch_so_;
+    // Per-callable_id dispatch. AICPU dispatches via
+    // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_`
+    // signals whether the host is delivering a freshly-registered
+    // callable_id (write+dlopen) or reusing an already-loaded one.
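+    // (Editor's sketch of the AICPU-side consumer in aicpu_executor.cpp,
+    // simplified from the real reload branch:
+    //   OrchSoEntry &e = orch_so_table_[runtime->get_active_callable_id()];
+    //   if (runtime->register_new_callable_id()) { /* write file + dlopen into e */ }
+    //   else { /* reuse e.handle / e.func from the previous run */ } )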
+    int32_t active_callable_id_;
+    bool register_new_callable_id_;
 
     char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
     char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
@@ -247,10 +249,16 @@ class Runtime {
     void set_orch_args(const ChipStorageTaskArgs &args);
 
     // Device orchestration SO binary (for dlopen on AICPU thread 3)
-    void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new);
+    void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
     uint64_t get_dev_orch_so_addr() const;
     uint64_t get_dev_orch_so_size() const;
-    bool has_new_orch_so() const;
+    // Per-callable_id dispatch. callable_id must be in
+    // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU
+    // whether to (re)load the orch SO into orch_so_table_[callable_id] or
+    // reuse the cached entry.
+    void set_active_callable_id(int32_t callable_id, bool is_new);
+    int32_t get_active_callable_id() const;
+    bool register_new_callable_id() const;
     void set_device_orch_func_name(const char *name);
     const char *get_device_orch_func_name() const;
     void set_device_orch_config_name(const char *name);
@@ -258,6 +266,13 @@ class Runtime {
     uint64_t get_function_bin_addr(int func_id) const;
     void set_function_bin_addr(int func_id, uint64_t addr);
+    /**
+     * Replay a previously-uploaded kernel address onto a fresh Runtime
+     * without recording it in registered_kernel_func_ids_. Used by
+     * DeviceRunner::bind_prepared_callable_to_runtime so prepared kernel
+     * binaries are not freed by validate_runtime_impl across runs.
+     */
+    void replay_function_bin_addr(int func_id, uint64_t addr);
     int get_registered_kernel_count() const;
     int get_registered_kernel_func_id(int index) const;
@@ -285,11 +300,18 @@ class Runtime {
     // Host-only staging for orchestration SO. runtime_maker publishes the
     // callable-owned pointer here; DeviceRunner consumes it before launching
     // the device-side execution and replaces it with the device-resident
-    // buffer metadata (dev_orch_so_addr_, ..., has_new_orch_so_). The fields
+    // buffer metadata (dev_orch_so_addr_, dev_orch_so_size_). The fields
     // below are zeroed on the device because DeviceRunner clears them before
     // the memcpy, but their values while running on device are irrelevant.
     const void *pending_orch_so_data_{nullptr};
     size_t pending_orch_so_size_{0};
+
+    // Host-orchestration staging (hbg path). Always nullptr on this trb
+    // variant — included for API parity with host_build_graph so the
+    // shared platform layer can branch on `pending_host_dlopen_handle_ !=
+    // nullptr` at runtime instead of via a build-time macro.
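+    // (Editor's illustration, mirroring prepare_callable in the platform
+    // c_api files; the "..." argument lists are elided here, not invented:
+    //   if (r->pending_host_dlopen_handle_ != nullptr)   // hbg: host orch
+    //       runner->register_prepared_callable_host_orch(...);
+    //   else                                             // trb: device orch
+    //       runner->register_prepared_callable(...); )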
+    void *pending_host_dlopen_handle_{nullptr};
+    void *pending_host_orch_func_ptr_{nullptr};
 };
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 68d374e32..98d464549 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -52,7 +52,8 @@ Runtime::Runtime() {
     // Initialize device orchestration SO binary
     dev_orch_so_addr_ = 0;
     dev_orch_so_size_ = 0;
-    has_new_orch_so_ = false;
+    active_callable_id_ = -1;
+    register_new_callable_id_ = false;
 
     device_orch_func_name_[0] = '\0';
     device_orch_config_name_[0] = '\0';
@@ -102,18 +103,24 @@ void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
 
 void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
 
 // Device orchestration SO metadata (bytes live in a separate device buffer
-// owned by DeviceRunner; only the address/size/dirty-flag travels in Runtime).
-void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) {
+// owned by DeviceRunner; only the address/size travels in Runtime).
+void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
     dev_orch_so_addr_ = dev_addr;
     dev_orch_so_size_ = size;
-    has_new_orch_so_ = is_new;
 }
 
 uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; }
 
 uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; }
 
-bool Runtime::has_new_orch_so() const { return has_new_orch_so_; }
+void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) {
+    active_callable_id_ = callable_id;
+    register_new_callable_id_ = is_new;
+}
+
+int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
+
+bool Runtime::register_new_callable_id() const { return register_new_callable_id_; }
 
 void Runtime::set_device_orch_func_name(const char *name) {
     if (name == nullptr) {
@@ -160,6 +167,14 @@ void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
     func_id_to_addr_[func_id] = addr;
 }
 
+void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    func_id_to_addr_[func_id] = addr;
+}
+
 int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
 
 int Runtime::get_registered_kernel_func_id(int index) const {
diff --git a/src/a5/platform/include/aicpu/orch_so_file.h b/src/a5/platform/include/aicpu/orch_so_file.h
index 40bec7411..33862527e 100644
--- a/src/a5/platform/include/aicpu/orch_so_file.h
+++ b/src/a5/platform/include/aicpu/orch_so_file.h
@@ -39,10 +39,15 @@
 * Caller is expected to try the next candidate directory.
 *
 * @param dir           Candidate directory (e.g. "/tmp")
+ * @param callable_id   Per-callable_id table slot id (>= 0). Required for
+ *                      uniqueness on the onboard path so concurrently-
+ *                      resident orch SOs (one per cid) do not collide on
+ *                      the same on-disk file. Pass -1 for the legacy
+ *                      single-slot dispatch path.
 * @param out_path      Buffer that receives the full file path on success
 * @param out_path_size Size of `out_path` in bytes
 * @return Open writable fd on success, -1 on failure
 */
-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size);
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size);
 
 #endif  // PLATFORM_AICPU_ORCH_SO_FILE_H_
diff --git a/src/a5/platform/onboard/aicpu/orch_so_file.cpp b/src/a5/platform/onboard/aicpu/orch_so_file.cpp
index 322cb7dcc..4e7f55232 100644
--- a/src/a5/platform/onboard/aicpu/orch_so_file.cpp
+++ b/src/a5/platform/onboard/aicpu/orch_so_file.cpp
@@ -15,10 +15,20 @@
 
 #include
 
-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) {
-    // Pid-based naming: AICPU device libc may lack mkstemps, and only one
-    // runtime runs per device process, so pid uniqueness is sufficient.
-    int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) {
+    // Pid + callable_id naming: AICPU device libc may lack mkstemps. With
+    // per-callable_id dispatch, multiple orch SOs can be resident in the
+    // same device process at once (one per cid in `orch_so_table_`), so
+    // the on-disk file name must be unique per cid — otherwise the
+    // second cid's `O_TRUNC` would silently shred the first cid's already
+    // dlopen'd file image and the next launch on cid=0 would SIGBUS.
+    // callable_id < 0 is the legacy single-slot path: pid alone is fine.
+    int32_t written;
+    if (callable_id >= 0) {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d_%d.so", dir, getpid(), callable_id);
+    } else {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+    }
     if (written < 0 || static_cast<size_t>(written) >= out_path_size) {
         return -1;
     }
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index 149feb7da..068e3d6bc 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -17,6 +17,8 @@
 
 #include "device_runner.h"
 
+#include <dlfcn.h>
+
 #include
 #include
 #include
@@ -24,6 +26,7 @@
 #include
 
 #include "callable.h"
+#include "callable_protocol.h"
 #include "utils/elf_build_id.h"
 #include "host/host_regs.h"  // Register address retrieval
 #include "host/raii_scope_guard.h"
@@ -599,13 +602,51 @@ void DeviceRunner::print_handshake_results() {
 }
 
 int DeviceRunner::prepare_orch_so(Runtime &runtime) {
+    // Per-callable_id path: when run_prepared bound a known callable_id,
+    // the SO bytes were already H2D'd at prepare_callable time. We just
+    // stamp dev_orch_so on the runtime and mark `is_new` based on whether
+    // the AICPU has seen this id since registration.
+    const int32_t cid = runtime.get_active_callable_id();
+    if (cid >= 0) {
+        auto it = prepared_callables_.find(cid);
+        if (it == prepared_callables_.end()) {
+            LOG_ERROR("prepare_orch_so: callable_id=%d not registered", cid);
+            return -1;
+        }
+        const auto &state = it->second;
+        // hbg variant: orch SO never crosses host/device, so AICPU does no
+        // per-cid dlopen. Skip orch_so_table_ bookkeeping and clear metadata.
+        if (state.host_dlopen_handle != nullptr) {
+            runtime.set_dev_orch_so(0, 0);
+            runtime.set_active_callable_id(cid, /*is_new=*/false);
+            return 0;
+        }
+        const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second;
+        if (first_sighting) {
+            ++aicpu_dlopen_total_;
+        }
+        runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size);
+        // The c_api caller passed is_new=false; refresh with the authoritative
+        // first_sighting flag before AICPU consumes register_new_callable_id_.
+        runtime.set_active_callable_id(cid, first_sighting);
+        // Pending fields must be empty in the prepared path — runtime_maker's
+        // bind_prepared_to_runtime_impl never stages them. Defensive clear:
+        runtime.pending_orch_so_data_ = nullptr;
+        runtime.pending_orch_so_size_ = 0;
+        LOG_INFO_V0(
+            "Orch SO prepared cid=%d hash=0x%lx %zu bytes (is_new=%d)", cid, state.hash, state.dev_orch_so_size,
+            first_sighting ? 1 : 0
+        );
+        return 0;
+    }
+
     const void *host_so_data = runtime.pending_orch_so_data_;
     const size_t host_so_size = runtime.pending_orch_so_size_;
     runtime.pending_orch_so_data_ = nullptr;
     runtime.pending_orch_so_size_ = 0;
 
     if (host_so_data == nullptr || host_so_size == 0) {
-        runtime.set_dev_orch_so(0, 0, false);
+        runtime.set_dev_orch_so(0, 0);
         return 0;
     }
 
@@ -613,7 +654,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
 
     if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) {
         LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size);
-        runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/false);
+        runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size);
         return 0;
     }
 
@@ -645,11 +686,170 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
     }
 
     cached_orch_so_hash_ = new_hash;
-    runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/true);
+    runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size);
     LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size);
     return 0;
 }
 
+int DeviceRunner::register_prepared_callable(
+    int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name,
+    std::vector<std::pair<int, uint64_t>> kernel_addrs
+) {
+    // The AICPU executor reserves `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]`
+    // (declared in src/common/task_interface/callable_protocol.h) and indexes
+    // it by callable_id; rejecting an out-of-range id here keeps host and AICPU
+    // in sync and avoids an OOB access at run time.
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS
+        );
+        return -1;
+    }
+    if (orch_so_data == nullptr || orch_so_size == 0) {
+        LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id);
+        return -1;
+    }
+    if (prepared_callables_.count(callable_id) != 0) {
+        LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id);
+        return -1;
+    }
+
+    const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size);
+
+    // Hash dedup: share device buffer across callable_ids that carry the same
+    // SO bytes. Refcount drops in unregister_prepared_callable; we only free
+    // when the count hits zero.
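+    // (Editor's example with hypothetical ids: cids 3 and 7 registering
+    // byte-identical SO bytes land on one device buffer with refcount 2;
+    // unregistering cid 3 only decrements, and the bytes stay resident
+    // until cid 7 unregisters too.)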
+    auto buf_it = orch_so_dedup_.find(hash);
+    uint64_t dev_addr = 0;
+    if (buf_it == orch_so_dedup_.end()) {
+        void *buf = mem_alloc_.alloc(orch_so_size);
+        if (buf == nullptr) {
+            LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size);
+            return -1;
+        }
+        int rc = rtMemcpy(buf, orch_so_size, orch_so_data, orch_so_size, RT_MEMCPY_HOST_TO_DEVICE);
+        if (rc != 0) {
+            LOG_ERROR("register_prepared_callable: rtMemcpy failed: %d", rc);
+            mem_alloc_.free(buf);
+            return rc;
+        }
+        OrchSoBuffer entry;
+        entry.dev_addr = buf;
+        entry.capacity = orch_so_size;
+        entry.refcount = 1;
+        orch_so_dedup_.emplace(hash, entry);
+        dev_addr = reinterpret_cast<uint64_t>(buf);
+        LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size);
+    } else {
+        buf_it->second.refcount++;
+        dev_addr = reinterpret_cast<uint64_t>(buf_it->second.dev_addr);
+        LOG_INFO_V0(
+            "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount
+        );
+    }
+
+    PreparedCallableState state;
+    state.hash = hash;
+    state.dev_orch_so_addr = dev_addr;
+    state.dev_orch_so_size = orch_so_size;
+    state.func_name = (func_name != nullptr) ? func_name : "";
+    state.config_name = (config_name != nullptr) ? config_name : "";
+    state.kernel_addrs = std::move(kernel_addrs);
+    prepared_callables_.emplace(callable_id, std::move(state));
+    prepared_callable_path_used_ = true;
+    return 0;
+}
+
+int DeviceRunner::register_prepared_callable_host_orch(
+    int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr,
+    std::vector<std::pair<int, uint64_t>> kernel_addrs
+) {
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id,
+            MAX_REGISTERED_CALLABLE_IDS
+        );
+        return -1;
+    }
+    if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) {
+        LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id);
+        return -1;
+    }
+    if (prepared_callables_.count(callable_id) != 0) {
+        LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id);
+        return -1;
+    }
+
+    PreparedCallableState state;
+    state.host_dlopen_handle = host_dlopen_handle;
+    state.host_orch_func_ptr = host_orch_func_ptr;
+    state.kernel_addrs = std::move(kernel_addrs);
+    prepared_callables_.emplace(callable_id, std::move(state));
+    prepared_callable_path_used_ = true;
+    ++host_dlopen_total_;
+    LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_);
+    return 0;
+}
+
+int DeviceRunner::unregister_prepared_callable(int32_t callable_id) {
+    auto it = prepared_callables_.find(callable_id);
+    if (it == prepared_callables_.end()) {
+        return 0;
+    }
+    PreparedCallableState state = std::move(it->second);
+    prepared_callables_.erase(it);
+    aicpu_seen_callable_ids_.erase(callable_id);
+
+    if (state.host_dlopen_handle != nullptr) {
+        // hbg path: dlclose the host handle; no orch SO refcount to decrement.
+        dlclose(state.host_dlopen_handle);
+        return 0;
+    }
+
+    auto buf_it = orch_so_dedup_.find(state.hash);
+    if (buf_it != orch_so_dedup_.end()) {
+        if (--buf_it->second.refcount <= 0) {
+            mem_alloc_.free(buf_it->second.dev_addr);
+            orch_so_dedup_.erase(buf_it);
+        }
+    }
+    return 0;
+}
+
+bool DeviceRunner::has_prepared_callable(int32_t callable_id) const {
+    return prepared_callables_.count(callable_id) != 0;
+}
+
+int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) {
+    auto it = prepared_callables_.find(callable_id);
+    if (it == prepared_callables_.end()) {
+        LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id);
+        return -1;
+    }
+    const auto &state = it->second;
+
+    // Replay kernel addresses directly into runtime.func_id_to_addr_ without
+    // going through set_function_bin_addr — the latter would record func_ids
+    // in registered_kernel_func_ids_, which validate_runtime_impl iterates to
+    // free kernel binaries. Prepared kernels must survive across runs and only
+    // be freed by finalize().
+    for (const auto &kv : state.kernel_addrs) {
+        if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) {
+            LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first);
+            return -1;
+        }
+        runtime.replay_function_bin_addr(kv.first, kv.second);
+    }
+    runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle;
+    runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr;
+    runtime.set_device_orch_func_name(state.func_name.c_str());
+    runtime.set_device_orch_config_name(state.config_name.c_str());
+    // Stamp callable_id with is_new=false; prepare_orch_so refreshes the flag
+    // with the authoritative first_sighting answer right before launch.
+    runtime.set_active_callable_id(callable_id, /*is_new=*/false);
+    return 0;
+}
+
 int DeviceRunner::finalize() {
     if (device_id_ == -1) {
         return 0;
     }
@@ -669,17 +869,27 @@ int DeviceRunner::finalize() {
     // Cleanup AICPU SO
     so_info_.finalize();
 
-    // Kernel binaries should have been removed by validate_runtime_impl()
+    // Kernel binaries are normally released by validate_runtime_impl on the
+    // legacy run() path. The prepared-callable path intentionally leaves
+    // them resident across runs (shared by func_id) and relies on
+    // finalize() to reclaim them; that is not a leak. Emit at DEBUG so the
+    // legacy regression signal is preserved for callers that never went
+    // through prepare_callable.
     if (!func_id_to_addr_.empty()) {
-        LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size());
-        // Cleanup leaked binaries to prevent memory leaks
+        const bool prepared_path_used = prepared_callable_path_used_;
+        if (prepared_path_used) {
+            LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size());
+        } else {
+            LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size());
+        }
         for (const auto &pair : func_id_to_addr_) {
             void *gm_addr = reinterpret_cast<void *>(pair.second);
             mem_alloc_.free(gm_addr);
-            LOG_DEBUG("Freed leaked kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second);
+            LOG_DEBUG("Freed kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second);
         }
     }
     func_id_to_addr_.clear();
+    func_id_to_hash_.clear();
     binaries_loaded_ = false;
 
     if (dev_orch_so_buffer_ != nullptr) {
@@ -691,6 +901,29 @@ int DeviceRunner::finalize() {
     host_orch_so_copy_.clear();
     host_orch_so_copy_.shrink_to_fit();
 
+    // Release any prepared-callable orch SO buffers that callers forgot to
+    // unregister. Refcounts no longer matter at this point — the device is
+    // about to be reset.
+    for (auto &kv : orch_so_dedup_) {
+        if (kv.second.dev_addr != nullptr) {
+            mem_alloc_.free(kv.second.dev_addr);
+        }
+    }
+    orch_so_dedup_.clear();
+    // hbg path: dlclose any host orch handles callers forgot to unregister.
+    // finalize() is the last chance; Worker.close() does not auto-unregister
+    // each callable_id, so without this loop the host process leaks one
+    // dlopen handle per (re)created Worker — observable in long-running
+    // pytest sessions.
+    for (auto &kv : prepared_callables_) {
+        if (kv.second.host_dlopen_handle != nullptr) {
+            dlclose(kv.second.host_dlopen_handle);
+        }
+    }
+    prepared_callables_.clear();
+    aicpu_seen_callable_ids_.clear();
+    aicpu_dlopen_total_ = 0;
+
     // Cleanup performance profiling (frees L2PerfSetupHeader + all per-core/per-thread buffers)
     if (l2_perf_collector_.is_initialized()) {
         auto free_cb = [](void *dev_ptr) -> int {
@@ -817,11 +1050,24 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data
         return 0;
     }
 
-    // Return cached callable address if already uploaded
+    // Return cached callable address if already uploaded *and* the new bytes
+    // match. With the prepared-callable path, multiple ChipCallables share a
+    // single ChipWorker (and DeviceRunner) and can pick distinct kernel
+    // binaries for the same func_id. Naively reusing the cached entry hands
+    // the AICore the previous callable's kernel: dispatch never completes
+    // the new task and the AICPU spins forever.
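+    // (Editor's example with hypothetical ids: callable A uploads func_id=5
+    // with kernel K1; callable B later uploads func_id=5 with different
+    // bytes K2. The hash check below evicts K1's device buffer and uploads
+    // K2 instead of handing B a stale address.)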
+    const uint64_t new_hash = simpler::common::utils::elf_build_id_64(bin_data, bin_size);
     auto it = func_id_to_addr_.find(func_id);
     if (it != func_id_to_addr_.end()) {
-        LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id);
-        return it->second;
+        auto hash_it = func_id_to_hash_.find(func_id);
+        if (hash_it != func_id_to_hash_.end() && hash_it->second == new_hash) {
+            LOG_INFO_V0("Kernel func_id=%d already uploaded (matching hash), returning cached address", func_id);
+            return it->second;
+        }
+        LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id);
+        mem_alloc_.free(reinterpret_cast<void *>(it->second));
+        func_id_to_addr_.erase(it);
+        func_id_to_hash_.erase(func_id);
     }
 
     LOG_DEBUG("Uploading kernel binary: func_id=%d, size=%zu bytes", func_id, bin_size);
@@ -851,6 +1097,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data
     }
 
     func_id_to_addr_[func_id] = callable_addr;
+    func_id_to_hash_[func_id] = new_hash;
 
     LOG_DEBUG("  func_id=%d -> callable_addr=0x%lx, binary_code_addr=0x%lx", func_id, callable_addr, binary_code_addr);
 
@@ -868,6 +1115,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) {
     mem_alloc_.free(gm_addr);
 
     func_id_to_addr_.erase(it);
+    func_id_to_hash_.erase(func_id);
 
     LOG_DEBUG("Removed kernel binary: func_id=%d, addr=0x%lx", func_id, function_bin_addr);
 }
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index 4c5fab748..12c1dab84 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -33,6 +33,8 @@
 #include
 #include
 #include
+#include <unordered_map>
+#include <unordered_set>
 #include
 
 #include "common/kernel_args.h"
@@ -370,6 +372,67 @@ class DeviceRunner {
      */
     void release_run_context();
 
+    /**
+     * Stage a per-callable_id orchestration SO into device memory and remember
+     * the supporting metadata (entry/config symbol names, kernel func_id ↔
+     * dev_addr table). Identical SO bytes across two callable_ids share one
+     * device buffer (refcounted by hash), so an N-cid pool holds at most one
+     * buffer per distinct SO, never N copies of the same bytes.
+     *
+     * @param callable_id  Caller-stable id, must be in [0, MAX_REGISTERED_CALLABLE_IDS).
+     * @param orch_so_data Host pointer to orchestration SO bytes (owned by caller).
+     * @param orch_so_size Size of orchestration SO in bytes.
+     * @param func_name    Entry symbol name (copied).
+     * @param config_name  Config symbol name (copied).
+     * @param kernel_addrs func_id ↔ dev_addr pairs already uploaded by the
+     *                     caller. Stored verbatim so run_prepared can replay
+     *                     them onto a fresh Runtime without re-uploading.
+     * @return 0 on success, negative on failure.
+     */
+    int register_prepared_callable(
+        int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name,
+        const char *config_name, std::vector<std::pair<int, uint64_t>> kernel_addrs
+    );
+
+    /**
+     * Host-orchestration sibling for hbg variants. See a2a3 onboard
+     * device_runner.h for full contract. Mutually exclusive with the
+     * trb-shaped overload.
+     */
+    int register_prepared_callable_host_orch(
+        int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr,
+        std::vector<std::pair<int, uint64_t>> kernel_addrs
+    );
+
+    /**
+     * Drop the prepared state for `callable_id`. trb path: decrement orch SO
+     * refcount, free when zero. hbg path: dlclose the host handle. Kernel
+     * binaries are shared and only released by finalize().
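+     *
+     * (Editor's note: unknown ids return 0, so teardown loops may call this
+     * unconditionally, e.g. `for (int32_t cid : cids) unregister_prepared_callable(cid);`
+     * with a hypothetical `cids` list; finalize() also sweeps anything left
+     * behind.)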
+     */
+    int unregister_prepared_callable(int32_t callable_id);
+
+    /** True iff `callable_id` has prepared state staged. */
+    bool has_prepared_callable(int32_t callable_id) const;
+
+    /**
+     * Replay the prepared state for `callable_id` onto a freshly-constructed
+     * Runtime. See a2a3 onboard documentation for full contract.
+     */
+    int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id);
+
+    /**
+     * Number of distinct callable_ids the AICPU has been asked to dlopen for.
+     * Monotonically increases on first-sighting bind; never decremented.
+     */
+    size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; }
+
+    /**
+     * Number of host-side dlopens triggered by
+     * `register_prepared_callable_host_orch` (hbg variant). Mirrors
+     * `aicpu_dlopen_count` for the host-orchestration path.
+     */
+    size_t host_dlopen_count() const { return host_dlopen_total_; }
+
   private:
     // Internal state
     int device_id_{-1};
@@ -391,6 +454,7 @@ class DeviceRunner {
     // Kernel binary management
     bool binaries_loaded_{false};  // true after AICPU SO loaded
     std::map<int, uint64_t> func_id_to_addr_;  // func_id -> function_bin_addr (device GM)
+    std::map<int, uint64_t> func_id_to_hash_;  // func_id -> elf_build_id_64(bin_data)
 
     // Orchestration SO cache (host-tracked, device-resident).
     uint64_t cached_orch_so_hash_{0};
@@ -398,6 +462,39 @@ class DeviceRunner {
     size_t dev_orch_so_capacity_{0};
     std::vector<uint8_t> host_orch_so_copy_;
 
+    // Per-callable_id prepared state. See a2a3 onboard device_runner.h for
+    // the full design narrative; mirrored here so a5 shares the same
+    // dispatch surface.
+    struct PreparedCallableState {
+        // trb path
+        uint64_t hash{0};
+        uint64_t dev_orch_so_addr{0};
+        size_t dev_orch_so_size{0};
+        std::string func_name;
+        std::string config_name;
+        // common
+        std::vector<std::pair<int, uint64_t>> kernel_addrs;
+        // hbg path
+        void *host_dlopen_handle{nullptr};
+        void *host_orch_func_ptr{nullptr};
+    };
+    struct OrchSoBuffer {
+        void *dev_addr{nullptr};
+        size_t capacity{0};
+        int refcount{0};
+    };
+    std::unordered_map<int32_t, PreparedCallableState> prepared_callables_;
+    std::unordered_map<uint64_t, OrchSoBuffer> orch_so_dedup_;
+    std::unordered_set<int32_t> aicpu_seen_callable_ids_;
+    // Monotonic AICPU dlopen counter (first-sighting bind only; never decremented).
+    size_t aicpu_dlopen_total_{0};
+    // Monotonic host-side dlopen counter for hbg variants.
+    size_t host_dlopen_total_{0};
+    // Sticky flag: prepare_callable was called at least once. Lets finalize()
+    // distinguish legacy-path leaks from prepared-path kernels that legitimately
+    // live until finalize.
+    bool prepared_callable_path_used_{false};
+
     // Performance profiling
     L2PerfCollector l2_perf_collector_;
diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index fa151b1ab..e3d8660be 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -21,6 +21,8 @@
 #include "task_args.h"
 
 #include
+
+#include <memory>
+
 #include
 
 #include "common/unified_log.h"
@@ -39,7 +41,8 @@ extern "C" {
 /* ===========================================================================
 * Runtime Implementation Functions (defined in runtime_maker.cpp)
 * =========================================================================== */
-int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args);
+int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable);
+int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args);
 int validate_runtime_impl(Runtime *runtime);
 
 /* ===========================================================================
@@ -162,71 +165,6 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de
     }
 }
 
-int run_runtime(
-    DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim,
-    int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary,
-    size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix
-) {
-    if (ctx == NULL || runtime == NULL) return -1;
-    if (aicpu_binary == NULL || aicpu_size == 0 || aicore_binary == NULL || aicore_size == 0) return -1;
-
-    DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
-
-    pthread_once(&g_runner_key_once, create_runner_key);
-    pthread_setspecific(g_runner_key, ctx);
-    auto tsd_guard = RAIIScopeGuard([]() {
-        pthread_setspecific(g_runner_key, nullptr);
-    });
-
-    try {
-        int rc = runner->prepare_run_context(device_id);
-        if (rc != 0) return rc;
-        auto run_context_guard = RAIIScopeGuard([runner]() {
-            runner->release_run_context();
-        });
-
-        Runtime *r = new (runtime) Runtime();
-        r->host_api.device_malloc = device_malloc;
-        r->host_api.device_free = device_free;
-        r->host_api.copy_to_device = copy_to_device;
-        r->host_api.copy_from_device = copy_from_device;
-        r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
-        r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;
-
-        LOG_DEBUG("About to call init_runtime_impl, r=%p", (void *)r);
-        rc = init_runtime_impl(
-            r, reinterpret_cast<const ChipCallable *>(callable), reinterpret_cast<const ChipStorageTaskArgs *>(args)
-        );
-        LOG_DEBUG("init_runtime_impl returned: %d", rc);
-        if (rc != 0) {
-            r->set_gm_sm_ptr(nullptr);
-            validate_runtime_impl(r);
-            r->~Runtime();
-            return rc;
-        }
-
-        runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0);
-        runner->set_dump_tensor_enabled(enable_dump_tensor != 0);
-        runner->set_pmu_enabled(enable_pmu);
-        runner->set_output_prefix(output_prefix);
-
-        std::vector<uint8_t> aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size);
-        std::vector<uint8_t> aicore_vec(aicore_binary, aicore_binary + aicore_size);
-        rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num);
-        if (rc != 0) {
-            validate_runtime_impl(r);
-            r->~Runtime();
-            return rc;
-        }
-
-        rc = validate_runtime_impl(r);
-        r->~Runtime();
-        return rc;
-    } catch (...) {
{
-        return -1;
-    }
-}
-
 int finalize_device(DeviceContextHandle ctx) {
     if (ctx == NULL) return -1;
     try {
@@ -330,5 +268,181 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) {
     runner->set_log_level(log_level);
     runner->set_log_info_v(log_info_v);
 }
+/* ===========================================================================
+ * Per-callable_id preparation
+ * =========================================================================== */
+
+int prepare_callable(
+    DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary,
+    size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size
+) {
+    if (ctx == NULL || callable == NULL) return -1;
+    DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
+
+    // AICPU/AICore executor binaries are only consumed by run()/run_prepared();
+    // prepare_callable just uploads kernel + orch SO state.
+    (void)aicpu_binary;
+    (void)aicpu_size;
+    (void)aicore_binary;
+    (void)aicore_size;
+
+    pthread_once(&g_runner_key_once, create_runner_key);
+    pthread_setspecific(g_runner_key, ctx);
+    auto tsd_guard = RAIIScopeGuard([]() {
+        pthread_setspecific(g_runner_key, nullptr);
+    });
+
+    try {
+        int rc = runner->prepare_run_context(device_id);
+        if (rc != 0) return rc;
+        auto run_context_guard = RAIIScopeGuard([runner]() {
+            runner->release_run_context();
+        });
+
+        // Heap-allocate: hbg's Runtime carries 131072 Tasks → tens of MB,
+        // larger than the default thread stack.
+        std::unique_ptr<Runtime> r_owner = std::make_unique<Runtime>();
+        Runtime *r = r_owner.get();
+        r->host_api.device_malloc = device_malloc;
+        r->host_api.device_free = device_free;
+        r->host_api.copy_to_device = copy_to_device;
+        r->host_api.copy_from_device = copy_from_device;
+        r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
+        r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;
+
+        rc = prepare_callable_impl(r, reinterpret_cast<const ChipCallable *>(callable));
+        if (rc != 0) {
+            return rc;
+        }
+
+        // Extract kernel func_id ↔ dev_addr pairs uploaded by prepare_callable_impl.
+        std::vector<std::pair<int, uint64_t>> kernel_addrs;
+        int kcount = r->get_registered_kernel_count();
+        kernel_addrs.reserve(kcount);
+        for (int i = 0; i < kcount; i++) {
+            int fid = r->get_registered_kernel_func_id(i);
+            kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid));
+        }
+        // Clear registered kernels so the Runtime destructor (or any accidental
+        // validate call) does NOT free the kernel binaries we just uploaded —
+        // they belong to the prepared state now.
+        r->clear_registered_kernels();
+
+        if (r->pending_host_dlopen_handle_ != nullptr) {
+            rc = runner->register_prepared_callable_host_orch(
+                callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs)
+            );
+            r->pending_host_dlopen_handle_ = nullptr;
+            r->pending_host_orch_func_ptr_ = nullptr;
+        } else {
+            rc = runner->register_prepared_callable(
+                callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(),
+                r->get_device_orch_config_name(), std::move(kernel_addrs)
+            );
+        }
+        return rc;
+    } catch (...)
{ + return -1; + } +} + +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix +) { + if (ctx == NULL || runtime == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + if (!runner->has_prepared_callable(callable_id)) { + LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id); + return -1; + } + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + auto tsd_guard = RAIIScopeGuard([]() { + pthread_setspecific(g_runner_key, nullptr); + }); + + try { + int rc = runner->prepare_run_context(device_id); + if (rc != 0) return rc; + auto run_context_guard = RAIIScopeGuard([runner]() { + runner->release_run_context(); + }); + + Runtime *r = new (runtime) Runtime(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + // Restore kernel addrs + orch symbol names + active_callable_id + rc = runner->bind_prepared_callable_to_runtime(*r, callable_id); + if (rc != 0) { + r->~Runtime(); + return rc; + } + + // Per-run binding (tensor args, GM heap, SM alloc) + rc = bind_prepared_to_runtime_impl(r, reinterpret_cast(args)); + if (rc != 0) { + r->set_gm_sm_ptr(nullptr); + validate_runtime_impl(r); + r->~Runtime(); + return rc; + } + + runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); + runner->set_dump_tensor_enabled(enable_dump_tensor != 0); + runner->set_pmu_enabled(enable_pmu); + runner->set_output_prefix(output_prefix); + + std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); + std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + if (rc != 0) { + validate_runtime_impl(r); + r->~Runtime(); + return rc; + } + + rc = validate_runtime_impl(r); + r->~Runtime(); + return rc; + } catch (...) { + return -1; + } +} + +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { + if (ctx == NULL) return -1; + try { + return static_cast(ctx)->unregister_prepared_callable(callable_id); + } catch (...) { + return -1; + } +} + +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; + } +} + +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->aicpu_dlopen_count(); + } catch (...) { + return 0; + } +} } // extern "C" diff --git a/src/a5/platform/sim/aicpu/orch_so_file.cpp b/src/a5/platform/sim/aicpu/orch_so_file.cpp index 4da92d7de..114fe4826 100644 --- a/src/a5/platform/sim/aicpu/orch_so_file.cpp +++ b/src/a5/platform/sim/aicpu/orch_so_file.cpp @@ -24,10 +24,17 @@ #include -int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) { +int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) { // mkstemps: multiple sim workers can share a process, so names must be // unique per call. 
The "XXXXXX" template is replaced in-place. - int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir); + // callable_id is embedded purely for log readability (mkstemps already + // guarantees uniqueness regardless). + int32_t written; + if (callable_id >= 0) { + written = snprintf(out_path, out_path_size, "%s/libdevice_orch_cid%d_XXXXXX.so", dir, callable_id); + } else { + written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir); + } if (written < 0 || static_cast(written) >= out_path_size) { return -1; } diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 015419665..ea325c7f9 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -36,6 +36,7 @@ #include "aicpu/platform_aicpu_affinity.h" #include "callable.h" +#include "callable_protocol.h" #include "utils/elf_build_id.h" #include "cpu_sim_context.h" #include "host/raii_scope_guard.h" @@ -653,13 +654,45 @@ void DeviceRunner::unload_executor_binaries() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { + // Per-callable_id path: mirror onboard. Bytes were staged at + // register_prepared_callable time; here we only stamp metadata onto + // the runtime and resolve `register_new_callable_id_` from first sighting. + const int32_t cid = runtime.get_active_callable_id(); + if (cid >= 0) { + auto it = prepared_callables_.find(cid); + if (it == prepared_callables_.end()) { + LOG_ERROR("prepare_orch_so: callable_id=%d not registered", cid); + return -1; + } + const auto &state = it->second; + // hbg variant: orch SO never crosses host/device boundary. + if (state.host_dlopen_handle != nullptr) { + runtime.set_dev_orch_so(0, 0); + runtime.set_active_callable_id(cid, /*is_new=*/false); + return 0; + } + const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; + if (first_sighting) { + ++aicpu_dlopen_total_; + } + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size); + runtime.set_active_callable_id(cid, first_sighting); + runtime.pending_orch_so_data_ = nullptr; + runtime.pending_orch_so_size_ = 0; + LOG_INFO_V0( + "Orch SO prepared cid=%d hash=0x%lx %zu bytes (is_new=%d)", cid, state.hash, state.dev_orch_so_size, + first_sighting ? 
1 : 0
+        );
+        return 0;
+    }
+
     const void *host_so_data = runtime.pending_orch_so_data_;
     const size_t host_so_size = runtime.pending_orch_so_size_;
     runtime.pending_orch_so_data_ = nullptr;
     runtime.pending_orch_so_size_ = 0;

     if (host_so_data == nullptr || host_so_size == 0) {
-        runtime.set_dev_orch_so(0, 0, false);
+        runtime.set_dev_orch_so(0, 0);
         return 0;
     }
@@ -667,7 +700,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
     if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) {
         LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size);
-        runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/false);
+        runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size);
         return 0;
     }
@@ -695,11 +728,152 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
     std::memcpy(dev_orch_so_buffer_, host_orch_so_copy_.data(), host_so_size);
     cached_orch_so_hash_ = new_hash;
-    runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/true);
+    runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size);
     LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size);
     return 0;
 }

+int DeviceRunner::register_prepared_callable(
+    int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name,
+    std::vector<std::pair<int, uint64_t>> kernel_addrs
+) {
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS
+        );
+        return -1;
+    }
+    if (orch_so_data == nullptr || orch_so_size == 0) {
+        LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id);
+        return -1;
+    }
+    if (prepared_callables_.count(callable_id) != 0) {
+        LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id);
+        return -1;
+    }
+
+    const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size);
+
+    auto buf_it = orch_so_dedup_.find(hash);
+    uint64_t dev_addr = 0;
+    if (buf_it == orch_so_dedup_.end()) {
+        void *buf = mem_alloc_.alloc(orch_so_size);
+        if (buf == nullptr) {
+            LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size);
+            return -1;
+        }
+        // Sim shares an address space with the simulated AICPU thread, so a
+        // plain memcpy is the moral equivalent of rtMemcpy on hardware.
+        std::memcpy(buf, orch_so_data, orch_so_size);
+        OrchSoBuffer entry;
+        entry.dev_addr = buf;
+        entry.capacity = orch_so_size;
+        entry.refcount = 1;
+        orch_so_dedup_.emplace(hash, entry);
+        dev_addr = reinterpret_cast<uint64_t>(buf);
+        LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size);
+    } else {
+        buf_it->second.refcount++;
+        dev_addr = reinterpret_cast<uint64_t>(buf_it->second.dev_addr);
+        LOG_INFO_V0(
+            "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount
+        );
+    }
+
+    PreparedCallableState state;
+    state.hash = hash;
+    state.dev_orch_so_addr = dev_addr;
+    state.dev_orch_so_size = orch_so_size;
+    state.func_name = (func_name != nullptr) ? func_name : "";
+    state.config_name = (config_name != nullptr) ? config_name : "";
+    state.kernel_addrs = std::move(kernel_addrs);
+    prepared_callables_.emplace(callable_id, std::move(state));
+    prepared_callable_path_used_ = true;
+    return 0;
+}
+
+int DeviceRunner::register_prepared_callable_host_orch(
+    int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr,
+    std::vector<std::pair<int, uint64_t>> kernel_addrs
+) {
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id,
+            MAX_REGISTERED_CALLABLE_IDS
+        );
+        return -1;
+    }
+    if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) {
+        LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id);
+        return -1;
+    }
+    if (prepared_callables_.count(callable_id) != 0) {
+        LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id);
+        return -1;
+    }
+
+    PreparedCallableState state;
+    state.host_dlopen_handle = host_dlopen_handle;
+    state.host_orch_func_ptr = host_orch_func_ptr;
+    state.kernel_addrs = std::move(kernel_addrs);
+    prepared_callables_.emplace(callable_id, std::move(state));
+    prepared_callable_path_used_ = true;
+    ++host_dlopen_total_;
+    LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_);
+    return 0;
+}
+
+int DeviceRunner::unregister_prepared_callable(int32_t callable_id) {
+    auto it = prepared_callables_.find(callable_id);
+    if (it == prepared_callables_.end()) {
+        return 0;
+    }
+    PreparedCallableState state = std::move(it->second);
+    prepared_callables_.erase(it);
+    aicpu_seen_callable_ids_.erase(callable_id);
+
+    if (state.host_dlopen_handle != nullptr) {
+        // hbg path: dlclose host handle; no orch SO refcount.
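+        // (hbg: just dlclose, handled here. trb: refcounted below; e.g. with
+        //  hypothetical cids 7 and 9 registered from identical SO bytes,
+        //  unregister(7) drops the refcount 2 -> 1 and keeps the buffer,
+        //  unregister(9) drops it 1 -> 0 and frees it.)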
+ dlclose(state.host_dlopen_handle); + return 0; + } + + auto buf_it = orch_so_dedup_.find(state.hash); + if (buf_it != orch_so_dedup_.end()) { + if (--buf_it->second.refcount <= 0) { + mem_alloc_.free(buf_it->second.dev_addr); + orch_so_dedup_.erase(buf_it); + } + } + return 0; +} + +bool DeviceRunner::has_prepared_callable(int32_t callable_id) const { + return prepared_callables_.count(callable_id) != 0; +} + +int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id); + return -1; + } + const auto &state = it->second; + for (const auto &kv : state.kernel_addrs) { + if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); + return -1; + } + runtime.replay_function_bin_addr(kv.first, kv.second); + } + runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; + runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; + runtime.set_device_orch_func_name(state.func_name.c_str()); + runtime.set_device_orch_config_name(state.config_name.c_str()); + runtime.set_active_callable_id(callable_id, /*is_new=*/false); + return 0; +} + int DeviceRunner::finalize() { // Skip if already finalized if (device_id_ == -1 && aicpu_so_handle_ == nullptr && aicore_so_handle_ == nullptr) { @@ -736,15 +910,22 @@ int DeviceRunner::finalize() { pmu_collector_.finalize(nullptr, free_cb, nullptr); } - // Kernel binaries should have been removed by validate_runtime_impl() + // Kernel binaries are normally released by validate_runtime_impl on the + // legacy run() path. The prepared-callable path intentionally leaves + // them resident across runs and relies on finalize() to reclaim them; + // that is not a leak. if (!func_id_to_addr_.empty()) { - LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); - // Cleanup leaked handles and host copies + const bool prepared_path_used = prepared_callable_path_used_; + if (prepared_path_used) { + LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); + } else { + LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); + } for (auto &pair : func_id_to_addr_) { MappedKernel &kernel = pair.second; if (kernel.dl_handle != nullptr) { dlclose(kernel.dl_handle); - LOG_DEBUG("Closed leaked kernel: func_id=%d", pair.first); + LOG_DEBUG("Closed kernel: func_id=%d", pair.first); } delete[] kernel.callable_buf; } @@ -761,6 +942,27 @@ int DeviceRunner::finalize() { host_orch_so_copy_.clear(); host_orch_so_copy_.shrink_to_fit(); + // Release any prepared-callable orch SO buffers callers forgot to drop. + for (auto &kv : orch_so_dedup_) { + if (kv.second.dev_addr != nullptr) { + mem_alloc_.free(kv.second.dev_addr); + } + } + orch_so_dedup_.clear(); + // hbg path: dlclose any host orch handles callers forgot to unregister. + // finalize() is the last chance; Worker.close() does not auto-unregister + // each callable_id, so without this loop the host process leaks one + // dlopen handle per (re)created Worker — observable in long-running + // pytest sessions. 
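+    // Shape of the leak this loop prevents (sketch): every ChipWorker
+    // lifecycle that registers an hbg cid and reaches finalize() without an
+    // unregister_callable() parks one live dlopen handle here; N recreated
+    // Workers in one pytest process would strand N handles.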
+ for (auto &kv : prepared_callables_) { + if (kv.second.host_dlopen_handle != nullptr) { + dlclose(kv.second.host_dlopen_handle); + } + } + prepared_callables_.clear(); + aicpu_seen_callable_ids_.clear(); + aicpu_dlopen_total_ = 0; + // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); @@ -786,11 +988,25 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data return 0; } - // Return cached callable address if already uploaded + // Return cached callable address if already uploaded *and* the new bytes + // match. With the prepared-callable path, multiple ChipCallables share a + // single ChipWorker (and hence DeviceRunner) and can pick distinct + // kernel binaries for the same func_id. Naively reusing the cached + // entry hands the AICore the previous callable's kernel and segfaults + // at dispatch. auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { - LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id); - return reinterpret_cast(it->second.callable_buf); + const auto &cached_callable = *reinterpret_cast(it->second.callable_buf); + const auto *new_callable = reinterpret_cast(bin_data); + if (cached_callable.binary_size() == new_callable->binary_size() && + std::memcmp(cached_callable.binary_data(), new_callable->binary_data(), new_callable->binary_size()) == 0) { + LOG_INFO_V0("Kernel func_id=%d already uploaded (matching bytes), returning cached address", func_id); + return reinterpret_cast(it->second.callable_buf); + } + LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id); + if (it->second.dl_handle != nullptr) dlclose(it->second.dl_handle); + delete[] it->second.callable_buf; + func_id_to_addr_.erase(it); } // Extract binary from CoreCallable envelope diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 636149f18..a153a18a1 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include "common/core_type.h" @@ -208,6 +210,36 @@ class DeviceRunner { */ void remove_kernel_binary(int func_id); + /** + * Stage a per-callable_id orchestration SO and its supporting metadata. + * See a5 onboard or a2a3 device_runner.h for full contract. + */ + int register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, + const char *config_name, std::vector> kernel_addrs + ); + + /** Host-orchestration sibling for hbg variants. See a2a3 onboard. */ + int register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs + ); + + /** Drop prepared state for `callable_id`; trb refcounts SO, hbg dlcloses handle. */ + int unregister_prepared_callable(int32_t callable_id); + + /** True iff `callable_id` has prepared state staged. */ + bool has_prepared_callable(int32_t callable_id) const; + + /** Replay prepared state onto a freshly-constructed Runtime. */ + int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); + + /** Monotonic AICPU dlopen counter (first-sighting only; never decremented). */ + size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + + /** Monotonic host-side dlopen counter for hbg variants. 
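+   * Usage sketch (illustrative): after prepare_callable() of one hbg cid
+   * followed by N run_prepared() calls, this still reads 1; registration
+   * dlopens, runs replay the cached handle.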
*/ + size_t host_dlopen_count() const { return host_dlopen_total_; } + private: // Configuration int device_id_{-1}; @@ -230,6 +262,32 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; + // Per-callable_id prepared state. Mirrors onboard. + struct PreparedCallableState { + // trb path + uint64_t hash{0}; + uint64_t dev_orch_so_addr{0}; + size_t dev_orch_so_size{0}; + std::string func_name; + std::string config_name; + // common + std::vector> kernel_addrs; + // hbg path + void *host_dlopen_handle{nullptr}; + void *host_orch_func_ptr{nullptr}; + }; + struct OrchSoBuffer { + void *dev_addr{nullptr}; + size_t capacity{0}; + int refcount{0}; + }; + std::unordered_map prepared_callables_; + std::unordered_map orch_so_dedup_; + std::unordered_set aicpu_seen_callable_ids_; + size_t aicpu_dlopen_total_{0}; + size_t host_dlopen_total_{0}; + bool prepared_callable_path_used_{false}; + // Runtime pointer for print_handshake_results Runtime *last_runtime_{nullptr}; diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index cd16e3734..db05b3ac1 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -22,6 +22,8 @@ #include #include + +#include #include #include "common/unified_log.h" @@ -35,7 +37,8 @@ extern "C" { /* =========================================================================== * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); +int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); +int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); /* =========================================================================== @@ -156,19 +159,146 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, +int finalize_device(DeviceContextHandle ctx) { + if (ctx == NULL) return -1; + try { + int rc = static_cast(ctx)->finalize(); + int dev = pto_cpu_sim_get_bound_device(); + if (dev >= 0) { + pto_cpu_sim_release_device(dev); + } + return rc; + } catch (...) { + return -1; + } +} + +/* =========================================================================== + * ACL lifecycle stubs. Sim has no ACL / aclrtStream concept, so these no-op + * to satisfy the uniform host_runtime.so ABI that ChipWorker dlsym's. The + * real comm_* entry points come from src/common/platform_comm/comm_sim.cpp, + * which is compiled into this runtime via CMakeLists. 
+ * =========================================================================== */ + +int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) { + (void)ctx; + (void)device_id; + return 0; +} + +void *create_comm_stream_ctx(DeviceContextHandle ctx) { + (void)ctx; + return NULL; +} + +int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { + (void)ctx; + (void)stream; + return 0; +} + +/* =========================================================================== + * Internal helpers called from runtime_maker.cpp via Runtime.host_api + * =========================================================================== */ + +void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { + if (runtime == NULL) return; + Runtime *r = static_cast(runtime); + r->record_tensor_pair(host_ptr, dev_ptr, size); +} + +void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { + if (ctx == NULL) return; + // No CANN dlog on sim; only HostLogger + runner state. + HostLogger::get_instance().set_level(static_cast(log_level)); + HostLogger::get_instance().set_info_v(log_info_v); + DeviceRunner *runner = static_cast(ctx); + runner->set_log_level(log_level); + runner->set_log_info_v(log_info_v); +} +/* =========================================================================== + * Per-callable_id preparation + * =========================================================================== */ + +int prepare_callable( + DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, + size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size +) { + if (ctx == NULL || callable == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + (void)aicpu_binary; + (void)aicpu_size; + (void)aicore_binary; + (void)aicore_size; + (void)device_id; + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + + try { + // Heap-allocate: hbg's Runtime carries 131072 Tasks → tens of MB. + std::unique_ptr r_owner = std::make_unique(); + Runtime *r = r_owner.get(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + int rc = prepare_callable_impl(r, reinterpret_cast(callable)); + if (rc != 0) { + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + std::vector> kernel_addrs; + int kcount = r->get_registered_kernel_count(); + kernel_addrs.reserve(kcount); + for (int i = 0; i < kcount; i++) { + int fid = r->get_registered_kernel_func_id(i); + kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid)); + } + r->clear_registered_kernels(); + + if (r->pending_host_dlopen_handle_ != nullptr) { + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; + } else { + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + } + pthread_setspecific(g_runner_key, nullptr); + return rc; + } catch (...) 
{ + pthread_setspecific(g_runner_key, nullptr); + return -1; + } +} + +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix ) { if (ctx == NULL || runtime == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + if (!runner->has_prepared_callable(callable_id)) { + LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id); + return -1; + } pthread_once(&g_runner_key_once, create_runner_key); pthread_setspecific(g_runner_key, ctx); - DeviceRunner *runner = static_cast(ctx); try { - // Phase 1: placement new + build graph Runtime *r = new (runtime) Runtime(); r->host_api.device_malloc = device_malloc; r->host_api.device_free = device_free; @@ -177,9 +307,14 @@ int run_runtime( r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; - int rc = init_runtime_impl( - r, reinterpret_cast(callable), reinterpret_cast(args) - ); + int rc = runner->bind_prepared_callable_to_runtime(*r, callable_id); + if (rc != 0) { + r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + rc = bind_prepared_to_runtime_impl(r, reinterpret_cast(args)); if (rc != 0) { r->set_gm_sm_ptr(nullptr); validate_runtime_impl(r); @@ -188,15 +323,11 @@ int run_runtime( return rc; } - // Phase 2: publish diagnostics enablement to the DeviceRunner so run() - // and its helpers can read the three sub-features uniformly (via - // members, not Runtime / run() args). runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); runner->set_dump_tensor_enabled(enable_dump_tensor != 0); runner->set_pmu_enabled(enable_pmu); runner->set_output_prefix(output_prefix); - // Phase 3: launch std::vector aicpu_vec; std::vector aicore_vec; if (aicpu_binary != NULL && aicpu_size > 0) { @@ -213,7 +344,6 @@ int run_runtime( return rc; } - // Phase 4: finalize (copy results back) rc = validate_runtime_impl(r); r->~Runtime(); pthread_setspecific(g_runner_key, nullptr); @@ -224,62 +354,31 @@ int run_runtime( } } -int finalize_device(DeviceContextHandle ctx) { +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { if (ctx == NULL) return -1; try { - int rc = static_cast(ctx)->finalize(); - int dev = pto_cpu_sim_get_bound_device(); - if (dev >= 0) { - pto_cpu_sim_release_device(dev); - } - return rc; + return static_cast(ctx)->unregister_prepared_callable(callable_id); } catch (...) { return -1; } } -/* =========================================================================== - * ACL lifecycle stubs. Sim has no ACL / aclrtStream concept, so these no-op - * to satisfy the uniform host_runtime.so ABI that ChipWorker dlsym's. The - * real comm_* entry points come from src/common/platform_comm/comm_sim.cpp, - * which is compiled into this runtime via CMakeLists. 
- * =========================================================================== */ - -int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) { - (void)ctx; - (void)device_id; - return 0; -} - -void *create_comm_stream_ctx(DeviceContextHandle ctx) { - (void)ctx; - return NULL; -} - -int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { - (void)ctx; - (void)stream; - return 0; -} - -/* =========================================================================== - * Internal helpers called from runtime_maker.cpp via Runtime.host_api - * =========================================================================== */ - -void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { - if (runtime == NULL) return; - Runtime *r = static_cast(runtime); - r->record_tensor_pair(host_ptr, dev_ptr, size); +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->aicpu_dlopen_count(); + } catch (...) { + return 0; + } } -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { - if (ctx == NULL) return; - // No CANN dlog on sim; only HostLogger + runner state. - HostLogger::get_instance().set_level(static_cast(log_level)); - HostLogger::get_instance().set_info_v(log_info_v); - DeviceRunner *runner = static_cast(ctx); - runner->set_log_level(log_level); - runner->set_log_info_v(log_info_v); +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; + } } } // extern "C" diff --git a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp index cf6618170..390ad3d19 100644 --- a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp @@ -276,31 +276,27 @@ extern "C" { #endif /** - * Initialize a pre-allocated runtime with dynamic orchestration. - * - * This function loads the orchestration SO from binary data via a temp file, - * resolves the orchestration function via dlsym, then calls it to build the - * task graph. The orchestration function is responsible for: - * - Allocating device memory via device_malloc() - * - Copying data to device via copy_to_device() - * - Building the task graph - * - Recording tensor pairs via record_tensor_pair() - * - * @param runtime Pointer to pre-constructed Runtime - * @param callable ChipCallable containing orch binary, func_name, and child kernels - * @param orch_args Separated tensor/scalar arguments - * @return 0 on success, -1 on failure + * Stage the per-callable resources for the host_build_graph variant: upload + * kernel binaries and dlopen the orchestration SO on the host. The dlopen + * handle and resolved entry-symbol pointer are parked on the runtime via + * `pending_host_dlopen_handle_` / `pending_host_orch_func_ptr_` so the + * platform layer can hoist them into PreparedCallableState. Splitting this + * out of init_runtime_impl is what the hbg prepare_callable / run_prepared + * path rests on — the dlopen runs once per cid instead of every run. 
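+ * Illustrative call order under the split (both entry points are defined
+ * below in this file; a sketch, not a verbatim caller):
+ *   prepare_callable_impl(rt, callable);      // once per callable_id: dlopen + dlsym
+ *   bind_prepared_to_runtime_impl(rt, args);  // every run: invoke orch_func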
*/ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - // Validate inputs +int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) { if (runtime == nullptr) { LOG_ERROR("Runtime pointer is null"); return -1; } + if (callable == nullptr) { + LOG_ERROR("Callable pointer is null"); + return -1; + } // Register kernel binaries from ChipCallable children if (callable->child_count() > 0) { - LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count()); + LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count()); for (int32_t i = 0; i < callable->child_count(); i++) { int func_id = callable->child_func_id(i); if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { @@ -329,7 +325,9 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip return -1; } - // Load orchestration SO from binary data via temp file + // Load orchestration SO from binary data via temp file. Held open across + // the lifetime of the prepared callable; closed by + // DeviceRunner::unregister_prepared_callable. std::string fd_path; if (!create_temp_so_file(orch_so_binary, orch_so_size, &fd_path)) { LOG_ERROR("Failed to create temp SO file"); @@ -343,7 +341,7 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip return -1; } - dlerror(); // Clear any existing error + dlerror(); OrchestrationFunc orch_func = reinterpret_cast(dlsym(handle, orch_func_name)); const char *dlsym_error = dlerror(); if (dlsym_error != nullptr) { @@ -354,11 +352,42 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip LOG_INFO_V0("Loaded orchestration function: %s", orch_func_name); - // Clear any previous tensor pairs + runtime->pending_host_dlopen_handle_ = handle; + runtime->pending_host_orch_func_ptr_ = reinterpret_cast(orch_func); + // hbg never uploads orch SO bytes to the device; clear the trb staging + // fields so DeviceRunner::register_prepared_callable cannot mistake this + // for a trb-shaped registration. + runtime->pending_orch_so_data_ = nullptr; + runtime->pending_orch_so_size_ = 0; + return 0; +} + +/** + * Per-run binding for hbg: invoke the previously-resolved orchestration entry + * point against the supplied args, then upload tensor info / allocation + * storage. Assumes prepare_callable_impl populated + * `pending_host_orch_func_ptr_` (either freshly during prepare_callable, or + * via DeviceRunner::bind_prepared_callable_to_runtime when run_prepared + * replays a prepared cid onto a fresh Runtime). 
+ */ +int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + if (orch_args == nullptr) { + LOG_ERROR("orch_args pointer is null"); + return -1; + } + OrchestrationFunc orch_func = reinterpret_cast(runtime->pending_host_orch_func_ptr_); + if (orch_func == nullptr) { + LOG_ERROR("bind_prepared_to_runtime_impl: host orch_func pointer is null"); + return -1; + } + runtime->clear_tensor_pairs(); LOG_INFO_V0("=== Calling Orchestration Function ==="); - LOG_DEBUG( "Args count: %d (%d tensors + %d scalars)", orch_args->tensor_count() + orch_args->scalar_count(), orch_args->tensor_count(), orch_args->scalar_count() @@ -370,13 +399,10 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip &k_orchestration_runtime_ops, runtime, &tensor_info_builder, &tensor_allocation_builder }; - // Call orchestration function to build task graph - // The orchestration function handles device memory allocation and copy-to-device int rc = orch_func(reinterpret_cast(&orchestration_runtime), *orch_args); if (rc != 0) { LOG_ERROR("Orchestration function failed with code %d", rc); runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } @@ -384,7 +410,6 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip if (rc != 0) { LOG_ERROR("Failed to upload tensor allocations: %d", rc); runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } @@ -396,16 +421,10 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip runtime->clear_tensor_allocation_storage(); } runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } LOG_INFO_V0("Runtime initialized. Ready for execution from Python."); - - // Host orchestration is complete once orch_func returns. The task graph now - // lives in Runtime, so the orchestration SO can be closed immediately. - dlclose(handle); - return 0; } diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index 607783733..73e201494 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -448,6 +448,16 @@ class Runtime { */ void set_function_bin_addr(int func_id, uint64_t addr); + /** + * Replay a previously-uploaded kernel address onto a fresh Runtime + * without recording it in registered_kernel_func_ids_. See a2a3 hbg + * runtime.h for the full contract. + */ + void replay_function_bin_addr(int func_id, uint64_t addr) { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return; + func_id_to_addr_[func_id] = addr; + } + int get_registered_kernel_count() const { return registered_kernel_count_; } int get_registered_kernel_func_id(int index) const { @@ -468,15 +478,56 @@ class Runtime { // Device orchestration SO metadata (see a2a3 host_build_graph runtime.h). uint64_t dev_orch_so_addr_{0}; uint64_t dev_orch_so_size_{0}; - bool has_new_orch_so_{false}; + // Per-callable_id dispatch. hbg orch runs on host, so AICPU never reads + // `active_callable_id_`; the field exists for parity with the + // shared platform layer (DeviceRunner stamps it on every run). + int32_t active_callable_id_{-1}; + bool register_new_callable_id_{false}; const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; - void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { + // Host-orchestration staging (hbg path). 
prepare_callable_impl + // dlopens the orch SO on the host and parks the handle + entry-symbol + // pointer here so DeviceRunner::register_prepared_callable_host_orch can + // claim them; bind_prepared_callable_to_runtime restores them onto a fresh + // Runtime so bind_prepared_to_runtime_impl can call orch_func without a + // second dlopen. + void *pending_host_dlopen_handle_{nullptr}; + void *pending_host_orch_func_ptr_{nullptr}; + + // Device-orchestration entry/config symbol names (trb path). Always + // empty on this hbg variant — included for API parity so the shared + // platform layer can call set_device_orch_func_name unconditionally. + char device_orch_func_name_[64]{}; + char device_orch_config_name_[64]{}; + + void set_device_orch_func_name(const char *name) { + device_orch_func_name_[0] = '\0'; + if (name) { + strncpy(device_orch_func_name_, name, sizeof(device_orch_func_name_) - 1); + device_orch_func_name_[sizeof(device_orch_func_name_) - 1] = '\0'; + } + } + const char *get_device_orch_func_name() const { return device_orch_func_name_; } + void set_device_orch_config_name(const char *name) { + device_orch_config_name_[0] = '\0'; + if (name) { + strncpy(device_orch_config_name_, name, sizeof(device_orch_config_name_) - 1); + device_orch_config_name_[sizeof(device_orch_config_name_) - 1] = '\0'; + } + } + const char *get_device_orch_config_name() const { return device_orch_config_name_; } + + void set_dev_orch_so(uint64_t dev_addr, uint64_t size) { dev_orch_so_addr_ = dev_addr; dev_orch_so_size_ = size; - has_new_orch_so_ = is_new; } + void set_active_callable_id(int32_t callable_id, bool is_new) { + active_callable_id_ = callable_id; + register_new_callable_id_ = is_new; + } + int32_t get_active_callable_id() const { return active_callable_id_; } + bool register_new_callable_id() const { return register_new_callable_id_; } }; #endif // SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index f1936d467..e9b97d5ff 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -24,6 +24,7 @@ #include "aicpu/device_time.h" #include "aicpu/orch_so_file.h" +#include "callable_protocol.h" #include "pto2_dispatch_payload.h" #include "runtime.h" #include "spin_hint.h" @@ -89,6 +90,23 @@ static int32_t read_runtime_status(Runtime *runtime) { static PTO2Runtime *rt{nullptr}; +// Per-callable_id orchestration SO table. The executor dispatches +// `orch_so_table_[active_callable_id_]` (created on first sighting of +// that callable_id, kept warm across runs). +// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values +// (mailbox uint32 callable_id, register() returns small ints) and is shared +// with the host bounds check in DeviceRunner::register_prepared_callable — +// see src/common/task_interface/callable_protocol.h. 
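+// Dispatch sketch (illustrative; condensed from run() below):
+//   OrchSoEntry &e = orch_so_table_[runtime->get_active_callable_id()];
+//   if (runtime->register_new_callable_id()) { /* write SO file, dlopen, dlsym into e */ }
+//   e.func(*orch_args_cached_);  // cache-hit runs skip straight here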
+
+struct OrchSoEntry {
+    bool in_use{false};
+    void *handle{nullptr};
+    char path[256]{};
+    DeviceOrchestrationFunc func{nullptr};
+    DeviceOrchestrationBindRuntimeFunc bind{nullptr};
+    DeviceOrchestrationConfigFunc config_func{nullptr};
+};
+
 struct AicpuExecutor {
     int32_t sched_thread_num_;
     bool orch_to_sched_{false};
@@ -107,16 +125,15 @@ struct AicpuExecutor {
     std::atomic<int32_t> finished_count_{0};
     std::atomic<bool> runtime_init_ready_{false};
-    // Orchestration SO handle - defer dlclose until all tasks complete
-    void *orch_so_handle_{nullptr};
-    char orch_so_path_[256]{};  // Path to orchestration SO file for cleanup
-
-    // Shared orchestration function pointer (loaded by first orch thread, used by all)
-    DeviceOrchestrationFunc orch_func_{nullptr};
-    DeviceOrchestrationBindRuntimeFunc orch_bind_runtime_{nullptr};
-    DeviceOrchestrationConfigFunc orch_config_func_{nullptr};
+    // Cached orch args pointer set by the orchestration thread before scheduler
+    // init; consumed by the (*p_func)(*orch_args_cached_) invocation below.
     const ChipStorageTaskArgs *orch_args_cached_{nullptr};

+    // Per-callable_id table. Single orch thread today, so a first-write/read
+    // race is not possible; if multiple orch threads are ever introduced,
+    // guard the in_use=false→true transition with a mutex.
+    OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS];
+
     // ===== Scheduler context (owns all dispatch/completion/drain state) =====
     SchedulerContext sched_ctx_;
@@ -126,15 +143,14 @@ struct AicpuExecutor {
     void deinit(Runtime *runtime);

     ~AicpuExecutor() {
-        // Process-wide teardown (the single static instance dies here). The
-        // handle is otherwise kept alive across runs for cache-hit reuse.
-        if (orch_so_handle_ != nullptr) {
-            dlclose(orch_so_handle_);
-            orch_so_handle_ = nullptr;
-        }
-        if (orch_so_path_[0] != '\0') {
-            unlink(orch_so_path_);
-            orch_so_path_[0] = '\0';
+        // Process-wide teardown (the single static instance dies here). Every
+        // in-use callable_id slot is dlclose()'d here; each is otherwise kept
+        // alive across runs for cache-hit reuse.
+        for (auto &e : orch_so_table_) {
+            if (!e.in_use) continue;
+            if (e.handle != nullptr) dlclose(e.handle);
+            if (e.path[0] != '\0') unlink(e.path);
+            e = OrchSoEntry{};
         }
     }
 };
@@ -197,29 +213,37 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     if (runtime->get_orch_built_on_host()) {
         LOG_INFO_V0("Thread %d: Host orchestration mode, no-op", thread_idx);
     } else {
-        // Two paths:
-        //   1) has_new_orch_so == true  → host believes the SO identity
-        //      changed, so we drop the cached handle (if any), write the
-        //      new bytes to disk, and dlopen + dlsym a fresh handle.
-        //   2) has_new_orch_so == false → host detected a cache hit, so
-        //      we reuse `orch_so_handle_` / `orch_func_` / `orch_bind_runtime_`
-        //      from the previous run untouched. sm_handle / rt below are
-        //      always recreated because they bind this run's memory.
-        const bool reload_so = runtime->has_new_orch_so();
+        // Per-callable_id dispatch: the orch SO state lives in
+        // `orch_so_table_[callable_id]` keyed by registration order;
+        // reload is governed by `register_new_callable_id_`.
+        const int32_t callable_id = runtime->get_active_callable_id();
+        if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+            LOG_ERROR(
+                "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS
+            );
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
+        }
+        void **p_handle = &orch_so_table_[callable_id].handle;
+        char *p_path = orch_so_table_[callable_id].path;
+        DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func;
+        DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind;
+        DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
+        const bool reload_so = runtime->register_new_callable_id();
         if (reload_so) {
-            LOG_INFO_V0("Thread %d: New orch SO detected, (re)loading", thread_idx);
-            if (orch_so_handle_ != nullptr) {
-                dlclose(orch_so_handle_);
-                orch_so_handle_ = nullptr;
-                orch_func_ = nullptr;
-                orch_bind_runtime_ = nullptr;
-                if (orch_so_path_[0] != '\0') {
+            LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id);
+            if (*p_handle != nullptr) {
+                dlclose(*p_handle);
+                *p_handle = nullptr;
+                *p_func = nullptr;
+                *p_bind = nullptr;
+                if (p_path[0] != '\0') {
                     // Unlink the old file so the new open() lands on a
                     // fresh inode — protects against SIGBUS / ETXTBSY when
                     // the kernel still has the old mapping pinned.
-                    unlink(orch_so_path_);
-                    orch_so_path_[0] = '\0';
+                    unlink(p_path);
+                    p_path[0] = '\0';
                 }
             }
@@ -242,7 +266,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);
             for (int32_t i = 0; i < num_candidates && !file_created; i++) {
-                int32_t fd = create_orch_so_file(candidate_dirs[i], so_path, sizeof(so_path));
+                int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path));
                 if (fd < 0) {
                     LOG_INFO_V0(
                         "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
@@ -281,6 +305,14 @@
            }
            LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
+            // Unlink the on-disk SO immediately: dlopen has already mmap'd
+            // the image, so the kernel keeps the inode alive until the
+            // matching dlclose / process exit. This prevents stale
+            // libdevice_orch_cid<N>_XXXXXX.so files from accumulating in
+            // /tmp when child processes exit via os._exit(0), which skips
+            // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
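+            // Standard POSIX behavior, shown schematically (not project code):
+            //   fd = mkstemps(path, 3);      // ".so" suffix, 3 chars
+            //   write(fd, so_bytes, n); close(fd);
+            //   h = dlopen(path, RTLD_NOW);  // image mmap'd, inode pinned
+            //   unlink(path);                // name gone; mapping valid until dlclose(h)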
+ unlink(so_path); + const char *entry_symbol = runtime->get_device_orch_func_name(); if (entry_symbol == nullptr || entry_symbol[0] == '\0') { entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL; @@ -333,15 +365,21 @@ int32_t AicpuExecutor::run(Runtime *runtime) { bind_runtime_func = nullptr; } - orch_so_handle_ = handle; - orch_func_ = orch_func; - orch_bind_runtime_ = bind_runtime_func; - orch_config_func_ = config_func; - snprintf(orch_so_path_, sizeof(orch_so_path_), "%s", so_path); + *p_handle = handle; + *p_func = orch_func; + *p_bind = bind_runtime_func; + *p_config_func = config_func; + snprintf(p_path, 256, "%s", so_path); + orch_so_table_[callable_id].in_use = true; } else { - LOG_INFO_V0("Thread %d: Reusing cached orch SO handle=%p", thread_idx, orch_so_handle_); - if (orch_so_handle_ == nullptr || orch_func_ == nullptr) { - LOG_ERROR("Thread %d: has_new_orch_so=false but no cached SO handle/func", thread_idx); + LOG_INFO_V0( + "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id + ); + if (*p_handle == nullptr || *p_func == nullptr) { + LOG_ERROR( + "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx, + callable_id + ); // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -349,8 +387,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } // Validate arg count on every run (reload or cache hit). - if (orch_config_func_ != nullptr) { - PTO2OrchestrationConfig cfg = orch_config_func_(runtime->get_orch_args()); + if (*p_config_func != nullptr) { + PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args()); LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count); if (cfg.expected_arg_count > 0) { const ChipStorageTaskArgs &args_validate = runtime->get_orch_args(); @@ -361,17 +399,18 @@ int32_t AicpuExecutor::run(Runtime *runtime) { cfg.expected_arg_count ); // Clean up cached state so a subsequent run does a full reload. - if (orch_so_handle_ != nullptr) { - dlclose(orch_so_handle_); - orch_so_handle_ = nullptr; + if (*p_handle != nullptr) { + dlclose(*p_handle); + *p_handle = nullptr; } - if (orch_so_path_[0] != '\0') { - unlink(orch_so_path_); - orch_so_path_[0] = '\0'; + if (p_path[0] != '\0') { + unlink(p_path); + p_path[0] = '\0'; } - orch_func_ = nullptr; - orch_bind_runtime_ = nullptr; - orch_config_func_ = nullptr; + *p_func = nullptr; + *p_bind = nullptr; + *p_config_func = nullptr; + orch_so_table_[callable_id].in_use = false; // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -473,11 +512,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) { orch_cycle_start = get_sys_cnt_aicpu(); #endif framework_bind_runtime(rt); - if (orch_bind_runtime_ != nullptr) { - orch_bind_runtime_(rt); + if (*p_bind != nullptr) { + (*p_bind)(rt); } rt_scope_begin(rt); - orch_func_(*orch_args_cached_); + (*p_func)(*orch_args_cached_); rt_scope_end(rt); #if PTO2_PROFILING uint64_t orch_cycle_end = get_sys_cnt_aicpu(); @@ -637,13 +676,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (prev_finished + 1 == thread_num_) { finished_.store(true, std::memory_order_release); // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we - // always tear them down here, but we keep orch_so_handle_ alive for - // the next run's cache-hit reuse (see run() reload_so branch). 
+ // always tear them down here, but we keep the per-cid orch SO entries + // alive for the next run's cache-hit reuse (see run() reload_so branch). if (!runtime->get_orch_built_on_host() && rt != nullptr) { // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt. + const int32_t callable_id = runtime->get_active_callable_id(); framework_bind_runtime(nullptr); - if (orch_bind_runtime_ != nullptr) { - orch_bind_runtime_(nullptr); + if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) { + DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind; + if (bind != nullptr) { + bind(nullptr); + } } runtime_destroy(rt); } @@ -669,10 +712,9 @@ void AicpuExecutor::deinit(Runtime *runtime) { orch_to_sched_ = false; orch_args_cached_ = nullptr; - // orch_so_handle_ / orch_func_ / orch_bind_runtime_ / orch_config_func_ / orch_so_path_ are - // intentionally preserved across deinit: the next run reuses them when - // has_new_orch_so() == false. The destructor releases them at process - // teardown. + // orch_so_table_ entries are intentionally preserved across deinit: the + // next run reuses cached handles when register_new_callable_id() returns + // false. The destructor releases them at process teardown. // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit) rt = nullptr; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 4c4e8dd9c..e70f9a309 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -92,31 +92,29 @@ static int32_t read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *hos } /** - * Initialize a pre-allocated runtime for device orchestration. + * Stage the per-callable resources (kernel binaries + orchestration SO) into + * the supplied runtime so a subsequent bind_prepared_to_runtime_impl can use + * them. This is the cacheable half of init_runtime_impl: nothing here depends + * on per-run argument values, so the prepare_callable / run_prepared split + * lets us run this once per callable_id and amortize across runs. * - * For rt2 runtime, orchestration runs on AICPU thread 3 (device-side). 
- * This function: - * - Copies tensor metadata and replaces host pointers with device pointers - * - Copies all tensor data to device - * - Records all tensors for copy-back - * - Copies orchestration SO to device memory - * - Sets up runtime state for device orchestration - * - * @param runtime Pointer to pre-constructed Runtime - * @param callable ChipCallable containing orch binary, func_name, and child kernels - * @param orch_args Separated tensor/scalar arguments + * @param runtime Pointer to pre-constructed Runtime (host_api populated) + * @param callable ChipCallable carrying the orch SO + child kernel binaries * @return 0 on success, -1 on failure */ -extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - // Validate inputs +extern "C" int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) { if (runtime == nullptr) { LOG_ERROR("Runtime pointer is null"); return -1; } + if (callable == nullptr) { + LOG_ERROR("Callable pointer is null"); + return -1; + } // Register kernel binaries from ChipCallable children if (callable->child_count() > 0) { - LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count()); + LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count()); for (int32_t i = 0; i < callable->child_count(); i++) { int func_id = callable->child_func_id(i); if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { @@ -146,6 +144,32 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, return -1; } + // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume. + runtime->pending_orch_so_data_ = orch_so_binary; + runtime->pending_orch_so_size_ = orch_so_size; + LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size); + return 0; +} + +/** + * Per-run binding: build device-side argument storage (tensor copy-out, GM + * heap, PTO2 shared memory) and publish it to the runtime. Assumes the + * callable-side state (kernel binaries, orch SO bytes, func/config names) + * is already populated by prepare_callable_impl. + * + * Splitting this from prepare_callable_impl matches the per-callable_id + * design: register/run_prepared invokes this every call, while the prep + * half runs only once per callable_id. + * + * @param runtime Pointer to pre-constructed Runtime (host_api populated) + * @param orch_args Separated tensor/scalar arguments for this run + * @return 0 on success, -1 on failure + */ +extern "C" int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } if (orch_args == nullptr) { LOG_ERROR("orch_args pointer is null"); return -1; @@ -153,7 +177,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, int tensor_count = orch_args->tensor_count(); int scalar_count = orch_args->scalar_count(); - LOG_INFO_V0("RT2 init: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); + LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); int64_t t_total_start = _now_ms(); @@ -196,13 +220,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, } int64_t t_args_end = _now_ms(); - // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume. 
- int64_t t_so_start = _now_ms(); - runtime->pending_orch_so_data_ = orch_so_binary; - runtime->pending_orch_so_size_ = orch_so_size; - LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size); - int64_t t_so_end = _now_ms(); - // Read ready queue shard count from environment for AICPU scheduler { const char *env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS"); @@ -282,7 +299,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, int64_t t_total_end = _now_ms(); LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start); - LOG_INFO_V0("TIMING: orch_so_copy = %" PRId64 "ms", t_so_end - t_so_start); LOG_INFO_V0("TIMING: gm_heap_alloc(1GB) = %" PRId64 "ms", t_heap_end - t_heap_start); LOG_INFO_V0("TIMING: shared_mem_alloc = %" PRId64 "ms", t_sm_end - t_sm_start); LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index e8bd2ff85..48e3c82b6 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -203,12 +203,14 @@ class Runtime { // Device orchestration SO (for dlopen on AICPU thread 3). // The SO bytes themselves live in a separately-allocated device buffer // owned by DeviceRunner; only the metadata below travels inside Runtime. - // `has_new_orch_so_` tells AICPU whether the host believes the SO identity - // changed since the previous run — when false AICPU reuses its cached - // dlopen handle and skips writing the file again. uint64_t dev_orch_so_addr_; uint64_t dev_orch_so_size_; - bool has_new_orch_so_; + // Per-callable_id dispatch. AICPU dispatches via + // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_` + // signals whether the host is delivering a freshly-registered + // callable_id (write+dlopen) or reusing an already-loaded one. + int32_t active_callable_id_; + bool register_new_callable_id_; char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; @@ -261,10 +263,16 @@ class Runtime { void set_orch_args(const ChipStorageTaskArgs &args); // Device orchestration SO binary (for dlopen on AICPU thread 3) - void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new); + void set_dev_orch_so(uint64_t dev_addr, uint64_t size); uint64_t get_dev_orch_so_addr() const; uint64_t get_dev_orch_so_size() const; - bool has_new_orch_so() const; + // Per-callable_id dispatch. callable_id must be in + // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU + // whether to (re)load the orch SO into orch_so_table_[callable_id] or + // reuse the cached entry. + void set_active_callable_id(int32_t callable_id, bool is_new); + int32_t get_active_callable_id() const; + bool register_new_callable_id() const; void set_device_orch_func_name(const char *name); const char *get_device_orch_func_name() const; void set_device_orch_config_name(const char *name); @@ -272,6 +280,13 @@ class Runtime { uint64_t get_function_bin_addr(int func_id) const; void set_function_bin_addr(int func_id, uint64_t addr); + /** + * Replay a previously-uploaded kernel address onto a fresh Runtime + * without recording it in registered_kernel_func_ids_. Used by + * DeviceRunner::bind_prepared_callable_to_runtime so prepared kernel + * binaries are not freed by validate_runtime_impl across runs. 
+ */ + void replay_function_bin_addr(int func_id, uint64_t addr); int get_registered_kernel_count() const; int get_registered_kernel_func_id(int index) const; @@ -299,9 +314,16 @@ class Runtime { // Host-only staging for orchestration SO. runtime_maker publishes the // callable-owned pointer here; DeviceRunner consumes it before launching // the device-side execution and replaces it with the device-resident - // buffer metadata (dev_orch_so_addr_, ..., has_new_orch_so_). + // buffer metadata (dev_orch_so_addr_, dev_orch_so_size_). const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; + + // Host-orchestration staging (hbg path). Always nullptr on this trb + // variant — included for API parity with host_build_graph so the + // shared platform layer can branch on `pending_host_dlopen_handle_ != + // nullptr` at runtime instead of via a build-time macro. + void *pending_host_dlopen_handle_{nullptr}; + void *pending_host_orch_func_ptr_{nullptr}; }; #endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 8f595e1a3..714ba3955 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -54,7 +54,8 @@ Runtime::Runtime() { // Initialize device orchestration SO binary dev_orch_so_addr_ = 0; dev_orch_so_size_ = 0; - has_new_orch_so_ = false; + active_callable_id_ = -1; + register_new_callable_id_ = false; device_orch_func_name_[0] = '\0'; device_orch_config_name_[0] = '\0'; @@ -104,18 +105,24 @@ void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; } void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } // Device orchestration SO metadata (bytes live in a separate device buffer -// owned by DeviceRunner; only the address/size/dirty-flag travels in Runtime). -void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { +// owned by DeviceRunner; only the address/size travels in Runtime). 
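The distinction `replay_function_bin_addr` draws is about ownership, not the address write itself. A hypothetical Python model of the bookkeeping (the real state lives inside Runtime; names mirror the members documented above, and the registering helper is a stand-in):

```python
# Hypothetical model: both paths write the same func_id -> addr mapping,
# but only the registering path records the func_id in the set that
# validate_runtime_impl frees at the end of a run.
func_id_to_addr = {}
registered_kernel_func_ids = []   # entries here are freed at run teardown

def register_function_bin(func_id, addr):
    func_id_to_addr[func_id] = addr
    registered_kernel_func_ids.append(func_id)   # Runtime owns it, freed later

def replay_function_bin_addr(func_id, addr):
    func_id_to_addr[func_id] = addr              # mapping only, no ownership
```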
+void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
     dev_orch_so_addr_ = dev_addr;
     dev_orch_so_size_ = size;
-    has_new_orch_so_ = is_new;
 }
 
 uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; }
 
 uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; }
 
-bool Runtime::has_new_orch_so() const { return has_new_orch_so_; }
+void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) {
+    active_callable_id_ = callable_id;
+    register_new_callable_id_ = is_new;
+}
+
+int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
+
+bool Runtime::register_new_callable_id() const { return register_new_callable_id_; }
 
 void Runtime::set_device_orch_func_name(const char *name) {
     if (name == nullptr) {
@@ -162,6 +169,14 @@ void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
     func_id_to_addr_[func_id] = addr;
 }
 
+void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    func_id_to_addr_[func_id] = addr;
+}
+
 int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
 
 int Runtime::get_registered_kernel_func_id(int index) const {
diff --git a/src/common/hierarchical/orchestrator.cpp b/src/common/hierarchical/orchestrator.cpp
index c5912a5b9..5a6e710f9 100644
--- a/src/common/hierarchical/orchestrator.cpp
+++ b/src/common/hierarchical/orchestrator.cpp
@@ -137,25 +137,25 @@ ContinuousTensor Orchestrator::alloc(const std::vector<int64_t> &shape, DataTyp
 // =============================================================================
 
 SubmitResult
-Orchestrator::submit_next_level(uint64_t callable, const TaskArgs &args, const CallConfig &config, int8_t worker) {
+Orchestrator::submit_next_level(int32_t callable_id, const TaskArgs &args, const CallConfig &config, int8_t worker) {
     std::vector<int8_t> affinities;
     if (worker >= 0) affinities = {worker};
-    return submit_impl(WorkerType::NEXT_LEVEL, callable, /*callable_id=*/-1, config, {args}, std::move(affinities));
+    return submit_impl(WorkerType::NEXT_LEVEL, callable_id, config, {args}, std::move(affinities));
 }
 
 SubmitResult Orchestrator::submit_next_level_group(
-    uint64_t callable, const std::vector<TaskArgs> &args_list, const CallConfig &config,
+    int32_t callable_id, const std::vector<TaskArgs> &args_list, const CallConfig &config,
     const std::vector<int8_t> &workers
 ) {
-    return submit_impl(WorkerType::NEXT_LEVEL, callable, /*callable_id=*/-1, config, args_list, workers);
+    return submit_impl(WorkerType::NEXT_LEVEL, callable_id, config, args_list, workers);
 }
 
 SubmitResult Orchestrator::submit_sub(int32_t callable_id, const TaskArgs &args) {
-    return submit_impl(WorkerType::SUB, /*callable_ptr=*/0, callable_id, CallConfig{}, {args});
+    return submit_impl(WorkerType::SUB, callable_id, CallConfig{}, {args});
 }
 
 SubmitResult Orchestrator::submit_sub_group(int32_t callable_id, const std::vector<TaskArgs> &args_list) {
-    return submit_impl(WorkerType::SUB, /*callable_ptr=*/0, callable_id, CallConfig{}, args_list);
+    return submit_impl(WorkerType::SUB, callable_id, CallConfig{}, args_list);
 }
 
 // =============================================================================
@@ -163,8 +163,8 @@ SubmitResult Orchestrator::submit_sub_group(int32_t callable_id, const std::vect
 // =============================================================================
 
 SubmitResult Orchestrator::submit_impl(
-    WorkerType worker_type, uint64_t callable_ptr, int32_t callable_id, const CallConfig &config,
-    std::vector<TaskArgs> args_list, std::vector<int8_t> affinities
+    WorkerType worker_type, int32_t callable_id, const CallConfig &config, std::vector<TaskArgs> args_list,
+    std::vector<int8_t> affinities
 ) {
     if (args_list.empty()) throw std::invalid_argument("Orchestrator: args_list must not be empty");
     config.validate();
@@ -198,7 +198,6 @@ SubmitResult Orchestrator::submit_impl(
 
     s.reset();
     s.worker_type = worker_type;
-    s.callable = callable_ptr;
     s.callable_id = callable_id;
     s.config = config;
 
diff --git a/src/common/hierarchical/orchestrator.h b/src/common/hierarchical/orchestrator.h
index b6880d3c1..f8abdb424 100644
--- a/src/common/hierarchical/orchestrator.h
+++ b/src/common/hierarchical/orchestrator.h
@@ -92,18 +92,19 @@ class Orchestrator {
     void copy_to(int worker_id, uint64_t dst, uint64_t src, size_t size);
     void copy_from(int worker_id, uint64_t dst, uint64_t src, size_t size);
 
-    // Submit a NEXT_LEVEL task. `callable` is the chip callable buffer pointer
-    // (uint64_t handle from Python — typically ChipCallable.buffer_ptr()).
-    // Tags inside `args` drive dependency inference; OUTPUT tensors with null
-    // data are auto-allocated from the HeapRing.
+    // Submit a NEXT_LEVEL task. `callable_id` is a cid registered via
+    // Worker.register(): the chip child looks it up in its COW-inherited
+    // Python registry to get the actual ChipCallable.
+    // Tags inside `args` drive dependency inference; OUTPUT tensors with
+    // null data are auto-allocated from the HeapRing.
     // `worker`: logical worker id for affinity (-1 = unconstrained).
     SubmitResult
-    submit_next_level(uint64_t callable, const TaskArgs &args, const CallConfig &config, int8_t worker = -1);
+    submit_next_level(int32_t callable_id, const TaskArgs &args, const CallConfig &config, int8_t worker = -1);
 
     // Submit a group of NEXT_LEVEL tasks: N args -> N workers, 1 DAG node.
     // `workers`: per-args affinity (empty = all unconstrained).
     SubmitResult submit_next_level_group(
-        uint64_t callable, const std::vector<TaskArgs> &args_list, const CallConfig &config,
+        int32_t callable_id, const std::vector<TaskArgs> &args_list, const CallConfig &config,
         const std::vector<int8_t> &workers = {}
     );
 
@@ -178,8 +179,8 @@ class Orchestrator {
     // Shared submit machinery. Takes `args_list` by value so the Orchestrator
     // can patch `tensor.data` on OUTPUT tensors flagged for auto-allocation.
     SubmitResult submit_impl(
-        WorkerType worker_type, uint64_t callable_ptr, int32_t callable_id, const CallConfig &config,
-        std::vector<TaskArgs> args_list, std::vector<int8_t> affinities = {}
+        WorkerType worker_type, int32_t callable_id, const CallConfig &config, std::vector<TaskArgs> args_list,
+        std::vector<int8_t> affinities = {}
     );
 
     // Size, in aligned bytes, an OUTPUT tensor should occupy in the HeapRing.
diff --git a/src/common/hierarchical/types.cpp b/src/common/hierarchical/types.cpp
index e04f883f9..882a630c6 100644
--- a/src/common/hierarchical/types.cpp
+++ b/src/common/hierarchical/types.cpp
@@ -28,7 +28,6 @@ void TaskSlotState::reset() {
     output_keys.clear();
     fanin_producers.clear();
     worker_type = WorkerType::NEXT_LEVEL;
-    callable = 0;
     callable_id = -1;
     config = CallConfig{};
     task_args.clear();
diff --git a/src/common/hierarchical/types.h b/src/common/hierarchical/types.h
index dbd91659e..f67fa6028 100644
--- a/src/common/hierarchical/types.h
+++ b/src/common/hierarchical/types.h
@@ -145,9 +145,12 @@ struct TaskSlotState {
     // --- Task data (stored on parent heap, lives until slot CONSUMED) ---
     WorkerType worker_type{WorkerType::NEXT_LEVEL};
-    uint64_t callable{0};     // NEXT_LEVEL: ChipCallable buffer ptr; SUB: unused
-    int32_t callable_id{-1};  // SUB: registered callable id
-    CallConfig config{};      // NEXT_LEVEL config (block_dim, aicpu_thread_num, diagnostics sub-features)
+    // Unified callable id: NEXT_LEVEL chip callables and SUB fns share the
+    // same Worker.register() id space. The mailbox wire format writes this
+    // as a uint64 with the cid in the low 32 bits; dispatch_process reads
+    // it identically for both worker types.
+    int32_t callable_id{-1};
+    CallConfig config{};  // NEXT_LEVEL config (block_dim, aicpu_thread_num, diagnostics sub-features)
 
     // Unified task-args storage: `task_args` is the single-task builder;
     // when `is_group_` is true, `task_args_list` carries one TaskArgs per
diff --git a/src/common/hierarchical/worker_manager.cpp b/src/common/hierarchical/worker_manager.cpp
index cb2f31b6e..2d0c40017 100644
--- a/src/common/hierarchical/worker_manager.cpp
+++ b/src/common/hierarchical/worker_manager.cpp
@@ -139,7 +139,7 @@ void WorkerThread::loop() {
 }
 
 void WorkerThread::dispatch_process(TaskSlotState &s, int32_t group_index) {
-    uint64_t callable = (s.worker_type == WorkerType::SUB) ? static_cast<uint64_t>(s.callable_id) : s.callable;
+    uint64_t callable = static_cast<uint64_t>(static_cast<uint32_t>(s.callable_id));
     TaskArgsView view = s.args_view(group_index);
 
     // Hold mailbox_mu_ for the entire round trip (write payload + state +
diff --git a/src/common/task_interface/callable_protocol.h b/src/common/task_interface/callable_protocol.h
new file mode 100644
index 000000000..4e3898804
--- /dev/null
+++ b/src/common/task_interface/callable_protocol.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Per-callable_id protocol constants
+ *
+ * Single source of truth for the host↔AICPU per-callable_id dispatch protocol.
+ * Kept separate from callable.h so the AICPU side can include it without
+ * pulling in the heavier callable.h dependencies.
+ *
+ * Both sides must agree on these bounds:
+ * - Host: DeviceRunner::register_prepared_callable rejects out-of-range ids.
+ * - AICPU: AicpuExecutor::run guards `orch_so_table_[callable_id]` access.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+// Hard cap on the number of distinct callable_ids that can be registered
+// via Worker.register / DeviceRunner::register_prepared_callable. The AICPU
+// executor reserves a fixed-size `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]`
+// keyed by callable_id, so this bound is part of the host↔AICPU protocol.
+constexpr int32_t MAX_REGISTERED_CALLABLE_IDS = 64;
diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp
index 38680e77a..7e8fc72b6 100644
--- a/src/common/worker/chip_worker.cpp
+++ b/src/common/worker/chip_worker.cpp
@@ -148,8 +148,12 @@ void ChipWorker::init(
     copy_to_device_ctx_fn_ = load_symbol<CopyToDeviceCtxFn>(handle, "copy_to_device_ctx");
     copy_from_device_ctx_fn_ = load_symbol<CopyFromDeviceCtxFn>(handle, "copy_from_device_ctx");
     get_runtime_size_fn_ = load_symbol<GetRuntimeSizeFn>(handle, "get_runtime_size");
-    run_runtime_fn_ = load_symbol<RunRuntimeFn>(handle, "run_runtime");
     simpler_init_fn_ = load_symbol<SimplerInitFn>(handle, "simpler_init");
+    prepare_callable_fn_ = load_symbol<PrepareCallableFn>(handle, "prepare_callable");
+    run_prepared_fn_ = load_symbol<RunPreparedFn>(handle, "run_prepared");
+    unregister_callable_fn_ = load_symbol<UnregisterCallableFn>(handle, "unregister_callable");
+    get_aicpu_dlopen_count_fn_ = load_symbol<GetAicpuDlopenCountFn>(handle, "get_aicpu_dlopen_count");
+    get_host_dlopen_count_fn_ = load_symbol<GetAicpuDlopenCountFn>(handle, "get_host_dlopen_count");
     finalize_device_fn_ = load_symbol<FinalizeDeviceFn>(handle, "finalize_device");
     // ACL lifecycle + comm_* are part of the uniform host_runtime.so ABI.
     // Every platform runtime exports all of them — runtimes that do not
@@ -242,7 +246,11 @@ void ChipWorker::finalize() {
     copy_to_device_ctx_fn_ = nullptr;
     copy_from_device_ctx_fn_ = nullptr;
     get_runtime_size_fn_ = nullptr;
-    run_runtime_fn_ = nullptr;
+    prepare_callable_fn_ = nullptr;
+    run_prepared_fn_ = nullptr;
+    unregister_callable_fn_ = nullptr;
+    get_aicpu_dlopen_count_fn_ = nullptr;
+    get_host_dlopen_count_fn_ = nullptr;
     finalize_device_fn_ = nullptr;
     ensure_acl_ready_fn_ = nullptr;
     create_comm_stream_fn_ = nullptr;
@@ -261,15 +269,35 @@
 }
 
 void ChipWorker::run(uint64_t callable, TaskArgsView args, const CallConfig &config) {
-    // L2 ABI edge: assemble the fixed-size ChipStorageTaskArgs POD from the
-    // view and hand it to the runtime. This conversion used to happen at
-    // submit time (stored on the slot); it now runs lazily in the worker so
-    // the slot can carry a single TaskArgs irrespective of the destination.
+    // The hierarchical layer (worker_manager.cpp) packs the cid produced by
+    // Worker.register() into this uint64. ChipWorker treats it as such — it
+    // must already have been prepared via prepare_callable. The legacy
+    // "callable buffer ptr → run_runtime" path is gone.
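The packing contract this comment describes can be checked in isolation. An illustrative round trip in Python, pure arithmetic only, no project code assumed:

```python
# The hierarchical layer widens the int32 cid into a uint64 (low 32 bits);
# ChipWorker::run recovers the signed cid from those same low 32 bits.
def pack_cid(cid: int) -> int:
    return cid & 0xFFFFFFFF              # static_cast<uint64_t>(static_cast<uint32_t>(cid))

def unpack_cid(packed: int) -> int:
    u32 = packed & 0xFFFFFFFF
    return u32 - (1 << 32) if u32 >= (1 << 31) else u32  # reinterpret as int32

assert unpack_cid(pack_cid(7)) == 7
assert unpack_cid(pack_cid(-1)) == -1    # a sentinel cid survives the round trip
```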
+    run_prepared(static_cast<int32_t>(static_cast<uint32_t>(callable)), args, config);
+}
+
+void ChipWorker::prepare_callable(int32_t callable_id, const void *callable) {
+    if (!device_set_) {
+        throw std::runtime_error("ChipWorker device not set; call set_device() first");
+    }
+    if (callable == nullptr) {
+        throw std::runtime_error("prepare_callable: callable must not be null");
+    }
+    int rc = prepare_callable_fn_(
+        device_ctx_, callable_id, callable, device_id_, aicpu_binary_.data(), aicpu_binary_.size(),
+        aicore_binary_.data(), aicore_binary_.size()
+    );
+    if (rc != 0) {
+        throw std::runtime_error("prepare_callable failed with code " + std::to_string(rc));
+    }
+}
+
+void ChipWorker::run_prepared(int32_t callable_id, TaskArgsView args, const CallConfig &config) {
     ChipStorageTaskArgs chip_storage = view_to_chip_storage(args);
-    run(reinterpret_cast<const void *>(callable), &chip_storage, config);
+    run_prepared(callable_id, &chip_storage, config);
 }
 
-void ChipWorker::run(const void *callable, const void *args, const CallConfig &config) {
+void ChipWorker::run_prepared(int32_t callable_id, const void *args, const CallConfig &config) {
     config.validate();
     if (!device_set_) {
         throw std::runtime_error("ChipWorker device not set; call set_device() first");
@@ -277,14 +305,38 @@ void ChipWorker::run(const void *callable, const void *args, const CallConfig &c
 
     void *rt = runtime_buf_.data();
 
-    int rc = run_runtime_fn_(
-        device_ctx_, rt, callable, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(),
+    int rc = run_prepared_fn_(
+        device_ctx_, rt, callable_id, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(),
         aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_l2_swimlane,
         config.enable_dump_tensor, config.enable_pmu, config.output_prefix
     );
     if (rc != 0) {
-        throw std::runtime_error("run_runtime failed with code " + std::to_string(rc));
+        throw std::runtime_error("run_prepared failed with code " + std::to_string(rc));
+    }
+}
+
+void ChipWorker::unregister_callable(int32_t callable_id) {
+    if (!device_set_) {
+        throw std::runtime_error("ChipWorker device not set; call set_device() first");
+    }
+    int rc = unregister_callable_fn_(device_ctx_, callable_id);
+    if (rc != 0) {
+        throw std::runtime_error("unregister_callable failed with code " + std::to_string(rc));
+    }
+}
+
+size_t ChipWorker::aicpu_dlopen_count() const {
+    if (!device_set_) {
+        return 0;
+    }
+    return get_aicpu_dlopen_count_fn_(device_ctx_);
+}
+
+size_t ChipWorker::host_dlopen_count() const {
+    if (!device_set_) {
+        return 0;
     }
+    return get_host_dlopen_count_fn_(device_ctx_);
 }
 
 uint64_t ChipWorker::malloc(size_t size) {
diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h
index 3e529a511..c08b1c618 100644
--- a/src/common/worker/chip_worker.h
+++ b/src/common/worker/chip_worker.h
@@ -52,14 +52,28 @@ class ChipWorker : public IWorker {
     /// Terminal — the object cannot be reused after this.
     void finalize();
 
-    // IWorker: build a ChipStorageTaskArgs POD from `args` and execute the
-    // runtime synchronously. `callable` is a ChipCallable buffer pointer
-    // cast to uint64.
+    // IWorker: dispatch the cid `callable` (packed into uint64 by the
+    // hierarchical layer) by delegating to run_prepared. The cid must
+    // already have been prepared via prepare_callable.
void run(uint64_t callable, TaskArgsView args, const CallConfig &config) override; - // Direct invocation (used by Python wrapper and internal tests) — bypasses - // the TaskArgsView path and takes a ready-made ChipStorageTaskArgs POD. - void run(const void *callable, const void *args, const CallConfig &config); + // Per-callable_id preparation. Requires set_device() and a callable_id + // in [0, MAX_REGISTERED_CALLABLE_IDS) (cap 64). + void prepare_callable(int32_t callable_id, const void *callable); + void run_prepared(int32_t callable_id, TaskArgsView args, const CallConfig &config); + void run_prepared(int32_t callable_id, const void *args, const CallConfig &config); + void unregister_callable(int32_t callable_id); + + /// Number of distinct callable_ids the AICPU has been asked to dlopen for + /// on the bound device. Returns 0 when no device is set or the runtime + /// variant has no per-cid registration support. Used by tests to assert + /// that prepare_callable + repeated run_prepared do not trigger redundant + /// AICPU dlopens. + size_t aicpu_dlopen_count() const; + + /// Number of host-side dlopens (host_build_graph variant). Mirrors + /// `aicpu_dlopen_count` for the trb path; returns 0 on device-orch variants. + size_t host_dlopen_count() const; uint64_t malloc(size_t size); void free(uint64_t ptr); @@ -102,11 +116,15 @@ class ChipWorker : public IWorker { using CopyToDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using CopyFromDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using GetRuntimeSizeFn = size_t (*)(); - using RunRuntimeFn = int (*)( - void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, - int, int, int, const char * - ); using SimplerInitFn = void (*)(void *, int, int); + using PrepareCallableFn = + int (*)(void *, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t); + using RunPreparedFn = int (*)( + void *, void *, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, + int, int, const char * + ); + using UnregisterCallableFn = int (*)(void *, int32_t); + using GetAicpuDlopenCountFn = size_t (*)(void *); using FinalizeDeviceFn = int (*)(void *); using EnsureAclReadyFn = int (*)(void *, int); using CreateCommStreamFn = void *(*)(void *); @@ -127,8 +145,12 @@ class ChipWorker : public IWorker { CopyToDeviceCtxFn copy_to_device_ctx_fn_ = nullptr; CopyFromDeviceCtxFn copy_from_device_ctx_fn_ = nullptr; GetRuntimeSizeFn get_runtime_size_fn_ = nullptr; - RunRuntimeFn run_runtime_fn_ = nullptr; SimplerInitFn simpler_init_fn_ = nullptr; + PrepareCallableFn prepare_callable_fn_ = nullptr; + RunPreparedFn run_prepared_fn_ = nullptr; + UnregisterCallableFn unregister_callable_fn_ = nullptr; + GetAicpuDlopenCountFn get_aicpu_dlopen_count_fn_ = nullptr; + GetAicpuDlopenCountFn get_host_dlopen_count_fn_ = nullptr; FinalizeDeviceFn finalize_device_fn_ = nullptr; EnsureAclReadyFn ensure_acl_ready_fn_ = nullptr; CreateCommStreamFn create_comm_stream_fn_ = nullptr; diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index b6588dc45..0ef16a13c 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -17,11 +17,12 @@ * * Public API — resolved by ChipWorker via dlsym: * create_device_context, destroy_device_context, - * get_runtime_size, set_device, run_runtime, finalize_device, - * device_malloc_ctx, device_free_ctx, copy_to_device_ctx, copy_from_device_ctx 
+ * get_runtime_size, set_device, finalize_device, + * device_malloc_ctx, device_free_ctx, copy_to_device_ctx, copy_from_device_ctx, + * prepare_callable, run_prepared, unregister_callable * * Memory management: caller allocates a buffer of get_runtime_size() bytes - * and passes it to run_runtime(). Error codes: 0 = success, negative = error. + * and passes it to run_prepared(). Error codes: 0 = success, negative = error. */ #ifndef SRC_COMMON_WORKER_PTO_RUNTIME_C_API_H_ @@ -57,7 +58,7 @@ void destroy_device_context(DeviceContextHandle ctx); /** Return sizeof(Runtime) for caller buffer allocation. */ size_t get_runtime_size(void); -/** Set the target device. Must be called before the first run_runtime(). */ +/** Set the target device. Must be called before the first run_prepared(). */ int set_device(DeviceContextHandle ctx, int device_id); /** Allocate device memory in the given device context. */ @@ -72,42 +73,10 @@ int copy_to_device_ctx(DeviceContextHandle ctx, void *dev_ptr, const void *host_ /** Copy device memory to a host pointer within the given device context. */ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *dev_ptr, size_t size); -/** - * Build the task graph, execute on device, copy results back, and clean up. - * - * @param ctx Device context from create_device_context() - * @param runtime Caller-allocated buffer (size from get_runtime_size()) - * @param callable Opaque ChipCallable pointer (orchestration + kernel binaries) - * @param args Opaque ChipStorageTaskArgs pointer (tensor/scalar arguments) - * @param block_dim Number of AICore blocks - * @param aicpu_thread_num Number of AICPU scheduler threads - * @param device_id Target device - * @param aicpu_binary AICPU executor binary blob - * @param aicpu_size Size of AICPU binary - * @param aicore_binary AICore executor binary blob - * @param aicore_size Size of AICore binary - * @param enable_l2_swimlane 1 to enable perf swimlane collection, 0 to disable - * @param enable_dump_tensor 1 to enable tensor dump, 0 to disable - * @param enable_pmu 0 = PMU disabled; >0 = enabled, value selects event type - * @param output_prefix NUL-terminated directory path under which diagnostic - * artifacts (l2_perf_records.json / tensor_dump/ / - * pmu.csv) are written. Required (non-empty) whenever - * any diagnostic flag is enabled; ignored otherwise. - * - * Log configuration is applied separately via simpler_init() at ChipWorker - * init time and read from runner state when populating KernelArgs. - * @return 0 on success, negative on error - */ -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, - int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix -); - /** * One-shot platform-side log init. Called once by ChipWorker::init() right * after dlopen, before any other entry. Pushes the user's chosen severity + - * INFO verbosity into HostLogger and into runner state (which run_runtime + * INFO verbosity into HostLogger and into runner state (which run_prepared * later forwards to AICPU via KernelArgs). 
* * On onboard, also calls dlog_setlevel(-1, log_level, 0) so CANN's runtime @@ -127,6 +96,84 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v); */ int finalize_device(DeviceContextHandle ctx); +/* =========================================================================== + * Per-callable_id preparation + * + * The triplet below decouples the one-shot prep work (kernel upload + orch SO + * H2D + caching keyed by `callable_id`) from each `run_prepared` invocation, + * so the per-run cost shrinks to "rebuild Runtime args + launch". Callers + * keep a stable small-int `callable_id` per ChipCallable; the platform side + * caches the prepared state in a fixed-size table (cap 64, see + * MAX_REGISTERED_CALLABLE_IDS in the AICPU executor) and rejects ids outside + * `[0, 64)`. Lifetime: caller must `unregister_callable` before + * `finalize_device` to release the device-side orch SO buffer; kernels stay + * resident until finalize regardless. + * =========================================================================== */ + +/** + * Stage a callable for repeated cheap launches under the given `callable_id`. + * + * Uploads child kernels into the DeviceRunner's func_id-keyed cache and + * copies the orchestration SO bytes into a device-resident buffer keyed by + * the SO's ELF Build-ID hash (so two callable_ids with identical SO share + * one buffer). Subsequent `run_prepared(callable_id, ...)` calls reuse this + * state. + * + * @return 0 on success, negative on error (NULL ctx, callable_id out of + * range, or upload/copy failure). + */ +int prepare_callable( + DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, + size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size +); + +/** + * Launch a callable previously staged via `prepare_callable`. + * + * Looks up the prepared state by `callable_id`, restores the kernel func_id ↔ + * dev_addr table onto a fresh Runtime, and dispatches without re-uploading + * kernels or re-copying the orch SO. The AICPU side dispatches via + * `orch_so_table_[callable_id]` (see runtime.h::set_active_callable_id). The + * first run for a given callable_id sets `register_new_callable_id_` so the + * AICPU does its one-time dlopen. + * + * @return 0 on success, negative on error (no prep state, NULL ctx, etc.). + */ +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix +); + +/** + * Drop the prepared state for `callable_id` and release the per-id share of + * the device orch SO buffer. The buffer itself is freed only when its + * hash-keyed refcount drops to zero (different callable_ids with identical + * SO share one allocation). + * + * Kernel binaries uploaded by `prepare_callable` remain resident — they are + * shared across callables by func_id and only released by `finalize_device`. + * + * @return 0 on success or if callable_id was not registered, negative on error. + */ +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id); + +/** + * Number of distinct callable_ids the AICPU has been asked to dlopen for on + * the device bound to `ctx`. Returns 0 on runtime variants without per-cid + * registration support. 
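The lifetime rules above (one shared device buffer per distinct SO, hash-keyed refcount, kernels outliving cids) can be modeled in a few lines. Illustrative Python only; `build_id_hash`, `copy_to_device` and `free_device` are stand-ins for the real ELF Build-ID hashing and device memory calls:

```python
# Model of the orch SO dedup contract: one device buffer per distinct SO
# binary, refcounted by the callable_ids that reference it.
so_buffers = {}    # so_hash -> [refcount, device_buffer]
cid_to_hash = {}   # callable_id -> so_hash

def prepare(cid, so_bytes):
    h = build_id_hash(so_bytes)                       # stand-in for Build-ID hash
    if h not in so_buffers:
        so_buffers[h] = [0, copy_to_device(so_bytes)]  # single H2D per distinct SO
    so_buffers[h][0] += 1
    cid_to_hash[cid] = h

def unregister(cid):
    h = cid_to_hash.pop(cid)
    so_buffers[h][0] -= 1
    if so_buffers[h][0] == 0:                          # last cid sharing this SO
        free_device(so_buffers[h][1])
        del so_buffers[h]
```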
Used by tests to assert that `prepare_callable` + + * repeated `run_prepared` calls do not trigger redundant AICPU dlopens. + */ +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx); + +/** + * Number of host-side dlopens triggered by `prepare_callable` on the host + * orchestration variants (host_build_graph). Mirrors `get_aicpu_dlopen_count` + * for the trb path. Returns 0 on runtime variants whose orchestration runs on + * the device. + */ +size_t get_host_dlopen_count(DeviceContextHandle ctx); + #ifdef __cplusplus } #endif diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/conftest.py b/tests/st/a2a3/host_build_graph/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a2a3/host_build_graph/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. +""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py new file mode 100644 index 000000000..00a658cc6 --- /dev/null +++ b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""End-to-end test for ChipWorker.prepare_callable / run_prepared on host_build_graph. + +Mirrors tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable for the hbg +variant: instead of the AICPU dlopening the orch SO once per cid, hbg dlopens +on the host inside prepare_callable and replays the cached handle/fn pointer +on every run_prepared. The dlopen counter to assert is `host_dlopen_count`, +not `aicpu_dlopen_count` (which stays 0 — AICPU never sees the orch SO). +""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +_VECTOR_KERNELS = "../vector_example/kernels" + +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + + +@scene_test(level=2, runtime="host_build_graph") +class TestPreparedCallableHbg(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable on hbg. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. 
+ """ + + CALLABLE = { + "orchestration": { + "source": f"{_VECTOR_KERNELS}/orchestration/example_orch.cpp", + "function_name": "build_example_graph", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 3, "block_dim": 3} + _PLATFORMS = ["a2a3sim", "a2a3"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + # vector_example orchestration computes (a + b + 1) * (a + b + 2) + a, b = args.a, args.b + args.f[:] = (a + b + 1) * (a + b + 2) + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) + + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) + + # ------------------------------------------------------------------ + # host_dlopen_count assertions (hbg path). + # + # hbg increments host_dlopen_count on every register_prepared_callable_host_orch + # invocation (i.e. each `prepare_callable` call), independent of how many + # times run_prepared is invoked afterwards. AICPU never dlopens the orch + # SO on this variant, so aicpu_dlopen_count stays at 0. 
+ # ------------------------------------------------------------------ + + def _setup_dlopen_count_test(self, st_worker, st_platform): + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + """prepare(primary) + run × 5 → host_dlopen delta == 1, aicpu == 0.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1, ( + f"expected exactly 1 new host dlopen for 5 runs of primary cid, " + f"got delta {st_worker.host_dlopen_count - baseline}" + ) + assert st_worker.aicpu_dlopen_count == baseline_aicpu, "hbg must not trigger any AICPU orch SO dlopens" + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + """prepare(primary)+prepare(secondary) + alternating runs × 5 → host_dlopen delta == 2.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2, ( + f"expected exactly 2 new host dlopens for two cids interleaved, " + f"got delta {st_worker.host_dlopen_count - baseline}" + ) + assert st_worker.aicpu_dlopen_count == baseline_aicpu + finally: + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + """prepare(primary) twice → second call raises RuntimeError.""" + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + """prepare+run+unregister+prepare+run on the same cid → host_dlopen delta == 2. + + Counter is monotonic — re-prepare always counts a fresh dlopen. 
+ """ + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + registered = False + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1 + st_worker.unregister_callable(_CID_PRIMARY) + registered = False + after_unreg = st_worker.host_dlopen_count + assert after_unreg - baseline == 1, ( + f"unregister must NOT decrement the host dlopen counter; baseline={baseline}, after_unreg={after_unreg}" + ) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2, ( + f"after re-prepare expected counter +2 (two distinct host dlopens), " + f"got delta {st_worker.host_dlopen_count - baseline}" + ) + finally: + if registered: + st_worker.unregister_callable(_CID_PRIMARY) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/conftest.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. 
+""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py new file mode 100644 index 000000000..62ced849b --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""End-to-end test for ChipWorker.prepare_callable / run_prepared / unregister_callable. + +Reuses the vector_example orchestration + AIV kernels. Exercises: + - prepare_callable once, then run_prepared twice (second run proves the + AICPU-side dlopen cache / host-side orch SO dedup is working — no re-upload). + - Two distinct callable_ids sharing the same orch SO binary: verifies both + produce correct output independently. + - unregister_callable after runs complete: should not raise. + - aicpu_dlopen_count assertions covering: same-cid repeat, multi-cid + interleaving, double-prepare rejection, and unregister + re-prepare. +""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +_VECTOR_KERNELS = "../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels" + +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestPreparedCallable(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable ABI. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. 
+ """ + + CALLABLE = { + "orchestration": { + "source": f"{_VECTOR_KERNELS}/orchestration/example_orchestration.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3} + _PLATFORMS = ["a2a3sim", "a2a3"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + # 1) prepare two callable_ids with the SAME callable (shared orch SO) + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) + + # 2) run_prepared primary cid twice (second run proves dedup/cache hit) + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 3) run_prepared secondary cid — different slot, same SO, must also work + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 4) unregister both — should not raise + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) + + # ------------------------------------------------------------------ + # aicpu_dlopen_count assertions. + # + # The class-scope L2 worker is shared across test methods in this + # class (see ./conftest.py), so the counter can be non-zero on entry + # from prior methods. Each test below snapshots the counter on entry, + # asserts the *delta* introduced by the scenario, then unregisters + # everything it staged. unregister_callable does NOT decrement the + # counter (the counter is monotonic — see test_dlopen_count_unregister_re_prepare). 
+ # ------------------------------------------------------------------ + + def _setup_dlopen_count_test(self, st_worker, st_platform): + """Common fixture: build callable + config, return (callable, config, case).""" + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + """Case A: prepare(primary) + run × 5 → dlopen_count delta == 1.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1, ( + f"expected exactly 1 new dlopen for 5 runs of primary cid, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + """Case B: prepare(primary)+prepare(secondary) + alternating runs × 5 → delta == 2.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"expected exactly 2 new dlopens for two cids interleaved, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + """Case C: prepare(primary) + prepare(primary) → second call raises RuntimeError.""" + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + """Case D: prepare+run+unregister+prepare+run on the same cid → delta == 2. + + unregister erases the cid from aicpu_seen_callable_ids_, so the second + prepare/run pair sets register_new_callable_id_ again and the AICPU + does a fresh dlopen. The counter is monotonic (does NOT decrement on + unregister), so the delta after the second cycle is 2. 
+ """ + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + registered = False + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1 + st_worker.unregister_callable(_CID_PRIMARY) + registered = False + after_unreg = st_worker.aicpu_dlopen_count + assert after_unreg - baseline == 1, ( + f"unregister must NOT decrement the dlopen counter; baseline={baseline}, after_unreg={after_unreg}" + ) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"after re-prepare expected counter +2 (two distinct AICPU dlopens), " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + if registered: + st_worker.unregister_callable(_CID_PRIMARY) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a5/host_build_graph/prepared_callable/conftest.py b/tests/st/a5/host_build_graph/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a5/host_build_graph/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. 
+""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add.cpp b/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add.cpp new file mode 100644 index 000000000..8e2094807 --- /dev/null +++ b/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ float *src0 = reinterpret_cast<__gm__ float *>(args[0]); + __gm__ float *src1 = reinterpret_cast<__gm__ float *>(args[1]); + __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]); + + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = pto::Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(vRows, vCols); + TileData src1Tile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + + pipe_sync(); +} diff --git a/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add_scalar_inplace.cpp b/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add_scalar_inplace.cpp new file mode 100644 index 000000000..056442e21 --- /dev/null +++ 
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include
+#include
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    __gm__ float *inout = reinterpret_cast<__gm__ float *>(args[0]);
+
+    union {
+        uint64_t u64;
+        float f32;
+    } converter;
+    converter.u64 = args[1];
+    float scalar = converter.f32;
+
+    constexpr int kTRows_ = 128;
+    constexpr int kTCols_ = 128;
+    constexpr int vRows = 128;
+    constexpr int vCols = 128;
+
+    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
+    using DynStridDim5 = pto::Stride<1, 1, 1, kTCols_, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_>;
+
+    TileData srcTile(vRows, vCols);
+    TileData dstTile(vRows, vCols);
+    TASSIGN(srcTile, 0x0);
+    TASSIGN(dstTile, 0x10000);
+
+    GlobalData inoutGlobal(inout);
+
+    TLOAD(srcTile, inoutGlobal);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADDS(dstTile, srcTile, scalar);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(inoutGlobal, dstTile);
+
+    pipe_sync();
+}
diff --git a/tests/st/a5/host_build_graph/prepared_callable/kernels/orchestration/dump_tensor_orch.cpp b/tests/st/a5/host_build_graph/prepared_callable/kernels/orchestration/dump_tensor_orch.cpp
new file mode 100644
index 000000000..8c8d807c4
--- /dev/null
+++ b/tests/st/a5/host_build_graph/prepared_callable/kernels/orchestration/dump_tensor_orch.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Dump-tensor interface demo for host_build_graph.
+ *
+ * Demonstrates the two ways to register tensor metadata for dump:
+ *   Task 0 (add): add_task() + set_tensor_info_to_task()
+ *   Task 1 (add_scalar_inplace): add_task_with_tensor_info()
+ *
+ * Computation: f = (a + b) + 1 (a=2, b=3 → f=6)
+ */
+
+#include "orchestration_api.h"  // NOLINT(build/include_subdir)
+
+extern "C" {
+
+int build_dump_tensor_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
+    void *host_a = orch_args.tensor(0).data_as<void>();
+    void *host_b = orch_args.tensor(1).data_as<void>();
+    void *host_f = orch_args.tensor(2).data_as<void>();
+    size_t size_a = orch_args.tensor(0).nbytes();
+    size_t size_b = orch_args.tensor(1).nbytes();
+    size_t size_f = orch_args.tensor(2).nbytes();
+    uint32_t size = orch_args.tensor(0).shapes[0];
+
+    TensorInfo ext_a_info = make_tensor_info_from_tensor_arg(orch_args.tensor(0));
+    TensorInfo ext_b_info = make_tensor_info_from_tensor_arg(orch_args.tensor(1));
+    TensorInfo ext_f_info = make_tensor_info_from_tensor_arg(orch_args.tensor(2));
+
+    void *dev_a = device_malloc(runtime, size_a);
+    copy_to_device(runtime, dev_a, host_a, size_a);
+
+    void *dev_b = device_malloc(runtime, size_b);
+    copy_to_device(runtime, dev_b, host_b, size_b);
+
+    void *dev_f = device_malloc(runtime, size_f);
+    record_tensor_pair(runtime, host_f, dev_f, size_f);
+
+    // Task 0: a + b → f (add_task + set_tensor_info_to_task)
+    uint64_t args_t0[4] = {
+        reinterpret_cast<uint64_t>(dev_a),
+        reinterpret_cast<uint64_t>(dev_b),
+        reinterpret_cast<uint64_t>(dev_f),
+        size,
+    };
+    int t0 = add_task(runtime, args_t0, 4, 0, CoreType::AIV);
+    TensorInfo t0_info[] = {ext_a_info, ext_b_info, ext_f_info};
+    set_tensor_info_to_task(runtime, t0, t0_info, 3);
+
+    // Task 1: f += 1.0 (add_task_with_tensor_info)
+    union {
+        float f32;
+        uint64_t u64;
+    } sc;
+    sc.f32 = 1.0f;
+    uint64_t args_t1[3] = {reinterpret_cast<uint64_t>(dev_f), sc.u64, size};
+    TensorInfo t1_info[] = {ext_f_info};
+    int t1 = add_task_with_tensor_info(runtime, args_t1, 3, 1, CoreType::AIV, t1_info, 1);
+
+    add_successor(runtime, t0, t1);
+
+    return 0;
+}
+
+}  // extern "C"
diff --git a/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py
new file mode 100644
index 000000000..1efd00806
--- /dev/null
+++ b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""End-to-end test for ChipWorker.prepare_callable / run_prepared on a5/host_build_graph.
+
+Mirrors tests/st/a2a3/host_build_graph/prepared_callable for the a5 variant.
+Reuses the dump_tensor example kernels (a + b + 1) since a5/hbg has no
+vector_example today and dump_tensor already runs cleanly on a5sim.
+""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + + +@scene_test(level=2, runtime="host_build_graph") +class TestPreparedCallableHbgA5(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable on a5/hbg. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. + """ + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/dump_tensor_orch.cpp", + "function_name": "build_dump_tensor_graph", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/kernel_add_scalar_inplace.cpp", + "core_type": "aiv", + "signature": [D.INOUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 3, "block_dim": 3} + _PLATFORMS = ["a5sim", "a5"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + # dump_tensor orchestration computes f = (a + b) + 1 + args.f[:] = (args.a + args.b) + 1 + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) + + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) + + def _setup_dlopen_count_test(self, st_worker, st_platform): + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, 
config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1 + assert st_worker.aicpu_dlopen_count == baseline_aicpu + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2 + assert st_worker.aicpu_dlopen_count == baseline_aicpu + finally: + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + registered = False + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1 + st_worker.unregister_callable(_CID_PRIMARY) + registered = False + assert st_worker.host_dlopen_count - baseline == 1, "unregister must NOT decrement the host dlopen counter" + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2 + finally: + if registered: + st_worker.unregister_callable(_CID_PRIMARY) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/conftest.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. +""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py new file mode 100644 index 000000000..a8a7cedf2 --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""End-to-end test for ChipWorker.prepare_callable / run_prepared / unregister_callable on a5/trb. 
+ +Mirrors tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable. Reuses the +vector_example orchestration + AIV kernels. Exercises: + - prepare_callable once, then run_prepared twice (second run proves the + AICPU-side dlopen cache / host-side orch SO dedup is working — no re-upload). + - Two distinct callable_ids sharing the same orch SO binary: verifies both + produce correct output independently. + - unregister_callable after runs complete: should not raise. + - aicpu_dlopen_count assertions covering: same-cid repeat, multi-cid + interleaving, double-prepare rejection, and unregister + re-prepare. +""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +_VECTOR_KERNELS = "../../../../../examples/a5/tensormap_and_ringbuffer/vector_example/kernels" + +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestPreparedCallable(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable ABI on a5/trb. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. + """ + + CALLABLE = { + "orchestration": { + "source": f"{_VECTOR_KERNELS}/orchestration/example_orchestration.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3} + _PLATFORMS = ["a5sim", "a5"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + # 1) prepare two callable_ids with the SAME callable (shared orch SO) + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) + + # 2) run_prepared primary cid twice 
(second run proves dedup/cache hit) + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 3) run_prepared secondary cid — different slot, same SO, must also work + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 4) unregister both — should not raise + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) + + # ------------------------------------------------------------------ + # aicpu_dlopen_count assertions. + # + # The class-scope L2 worker is shared across test methods in this + # class (see ./conftest.py), so the counter can be non-zero on entry + # from prior methods. Each test below snapshots the counter on entry, + # asserts the *delta* introduced by the scenario, then unregisters + # everything it staged. unregister_callable does NOT decrement the + # counter (the counter is monotonic — see test_dlopen_count_unregister_re_prepare). + # ------------------------------------------------------------------ + + def _setup_dlopen_count_test(self, st_worker, st_platform): + """Common fixture: build callable + config, return (callable, config, case).""" + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + """Case A: prepare(primary) + run × 5 → dlopen_count delta == 1.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1, ( + f"expected exactly 1 new dlopen for 5 runs of primary cid, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + """Case B: prepare(primary)+prepare(secondary) + alternating runs × 5 → delta == 2.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, 
config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"expected exactly 2 new dlopens for two cids interleaved, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + """Case C: prepare(primary) + prepare(primary) → second call raises RuntimeError.""" + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + """Case D: prepare+run+unregister+prepare+run on the same cid → delta == 2. + + unregister erases the cid from aicpu_seen_callable_ids_, so the second + prepare/run pair sets register_new_callable_id_ again and the AICPU + does a fresh dlopen. The counter is monotonic (does NOT decrement on + unregister), so the delta after the second cycle is 2. + """ + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + registered = False + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1 + st_worker.unregister_callable(_CID_PRIMARY) + registered = False + after_unreg = st_worker.aicpu_dlopen_count + assert after_unreg - baseline == 1, ( + f"unregister must NOT decrement the dlopen counter; baseline={baseline}, after_unreg={after_unreg}" + ) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"after re-prepare expected counter +2 (two distinct AICPU dlopens), " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + if registered: + st_worker.unregister_callable(_CID_PRIMARY) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/explicit_fatal/test_explicit_fatal.py b/tests/st/explicit_fatal/test_explicit_fatal.py index 8a88f0f41..f6c8a34c5 100644 --- a/tests/st/explicit_fatal/test_explicit_fatal.py +++ b/tests/st/explicit_fatal/test_explicit_fatal.py @@ -42,12 +42,13 @@ def test_explicit_fatal_reports(st_platform, st_device_ids): chip_callable = _build_chip_callable(st_platform) worker = Worker(level=2, platform=st_platform, runtime=RUNTIME, device_id=int(st_device_ids[0])) + cid = worker.register(chip_callable) worker.init() try: config = CallConfig() config.block_dim = 24 config.aicpu_thread_num = 4 - with pytest.raises(RuntimeError, match=r"run_runtime failed with code -9"): - worker.run(chip_callable, ChipStorageTaskArgs(), config) + with pytest.raises(RuntimeError, match=r"(run_runtime|run_prepared) failed with code -9"): + worker.run(cid, ChipStorageTaskArgs(), config) finally: worker.close() diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index b3caacd97..5ad49cd52 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -246,6 +246,26 @@ add_task_interface_test(test_child_memory types/test_child_memory.cpp) add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp) 
add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp) +# Per-callable_id orch SO file naming regression (see rtStreamSynchronize +# 507018 root cause). Compiles the a2a3 onboard `create_orch_so_file` +# against the test source so it runs on no-hw runners too. +add_executable(test_orch_so_file + common/test_orch_so_file.cpp + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/onboard/aicpu/orch_so_file.cpp +) +target_include_directories(test_orch_so_file PRIVATE + ${GTEST_INCLUDE_DIRS} + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include +) +target_compile_options(test_orch_so_file PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0) +target_link_libraries(test_orch_so_file PRIVATE + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread +) +add_test(NAME test_orch_so_file COMMAND test_orch_so_file) +set_tests_properties(test_orch_so_file PROPERTIES LABELS "no_hardware") + # --------------------------------------------------------------------------- # A2A3 tests (src/a2a3/runtime/tensormap_and_ringbuffer/) # --------------------------------------------------------------------------- diff --git a/tests/ut/cpp/common/test_orch_so_file.cpp b/tests/ut/cpp/common/test_orch_so_file.cpp new file mode 100644 index 000000000..6e1b32bd6 --- /dev/null +++ b/tests/ut/cpp/common/test_orch_so_file.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Regression test for the per-callable_id orch SO file naming contract. +// +// The onboard variants of `create_orch_so_file` (src/{a2a3,a5}/platform/ +// onboard/aicpu/orch_so_file.cpp) historically used pid-only naming, which +// silently broke once multi-callable dispatch was introduced on the same +// device process: the second cid's `O_TRUNC` open +// shredded the first cid's already-dlopen'd SO image and the next launch +// on cid=0 SIGBUS'd inside the AICPU executor (manifesting as +// `rtStreamSynchronize (AICPU) failed: 507018` on the host). +// +// The fix is to embed `callable_id` in the file name when cid >= 0. This +// test exercises the contract directly: distinct cids must produce distinct +// paths, and the legacy cid=-1 path must remain pid-only (no behavioural +// change for variants that never adopt per-cid dispatch). 
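+//
+// A minimal sketch of the contract under test (the per-cid file name shown
+// is illustrative only; the tests assert just distinctness for cid >= 0 and
+// the exact pid-only format for cid == -1):
+//
+//   create_orch_so_file(dir, /*callable_id=*/0,  p0, n);  // e.g. <dir>/libdevice_orch_<pid>_0.so
+//   create_orch_so_file(dir, /*callable_id=*/1,  p1, n);  // must differ from p0
+//   create_orch_so_file(dir, /*callable_id=*/-1, p,  n);  // <dir>/libdevice_orch_<pid>.so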
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include <unistd.h>
+
+#include "aicpu/orch_so_file.h"
+
+namespace {
+
+std::string mkscratch_dir() {
+    char templ[] = "/tmp/orch_so_file_ut_XXXXXX";
+    const char *dir = mkdtemp(templ);
+    if (dir == nullptr) {
+        std::abort();
+    }
+    return std::string(dir);
+}
+
+void rmtree(const std::string &dir) {
+    std::string cmd = "rm -rf '" + dir + "'";
+    (void)std::system(cmd.c_str());
+}
+
+}  // namespace
+
+TEST(OrchSoFile, DistinctCallableIdsProduceDistinctPaths) {
+    // Repro for the 507018 SIGBUS bug: with pid-only naming, cid=0 and
+    // cid=1 collide on `libdevice_orch_<pid>.so` and the second
+    // O_TRUNC open silently shreds the first cid's already-dlopen'd
+    // image. Embedding the cid restores per-callable file isolation.
+    const std::string dir = mkscratch_dir();
+    char path0[256] = {};
+    char path1[256] = {};
+
+    int32_t fd0 = create_orch_so_file(dir.c_str(), /*callable_id=*/0, path0, sizeof(path0));
+    ASSERT_GE(fd0, 0) << "create_orch_so_file(cid=0) failed";
+    close(fd0);
+
+    int32_t fd1 = create_orch_so_file(dir.c_str(), /*callable_id=*/1, path1, sizeof(path1));
+    ASSERT_GE(fd1, 0) << "create_orch_so_file(cid=1) failed";
+    close(fd1);
+
+    EXPECT_STRNE(path0, path1) << "Distinct cids must yield distinct file paths "
+                                  "(otherwise O_TRUNC would corrupt the first SO).";
+
+    rmtree(dir);
+}
+
+TEST(OrchSoFile, LegacySentinelKeepsPidOnlyNaming) {
+    // Variants that never adopt per-cid dispatch pass cid=-1; the file
+    // name must remain pid-only so existing callers see no change.
+    const std::string dir = mkscratch_dir();
+    char path[256] = {};
+
+    int32_t fd = create_orch_so_file(dir.c_str(), /*callable_id=*/-1, path, sizeof(path));
+    ASSERT_GE(fd, 0);
+    close(fd);
+
+    char expected[256];
+    std::snprintf(expected, sizeof(expected), "%s/libdevice_orch_%d.so", dir.c_str(), getpid());
+    EXPECT_STREQ(path, expected) << "Legacy (cid=-1) path must remain pid-only";
+
+    rmtree(dir);
+}
diff --git a/tests/ut/cpp/hierarchical/test_orchestrator.cpp b/tests/ut/cpp/hierarchical/test_orchestrator.cpp
index 7c0d45978..59371c6da 100644
--- a/tests/ut/cpp/hierarchical/test_orchestrator.cpp
+++ b/tests/ut/cpp/hierarchical/test_orchestrator.cpp
@@ -70,7 +70,7 @@ struct OrchestratorFixture : public ::testing::Test {
 TEST_F(OrchestratorFixture, IndependentTaskIsImmediatelyReady) {
   auto a = single_tensor_args(0xCAFE, TensorArgType::OUTPUT);
-  auto res = orch.submit_next_level(/*callable=*/0xDEAD, a, cfg);
+  auto res = orch.submit_next_level(/*callable_id=*/42, a, cfg);
   EXPECT_NE(res.task_slot, INVALID_SLOT);
   TaskSlot slot;
@@ -82,13 +82,13 @@ TEST_F(OrchestratorFixture, IndependentTaskIsImmediatelyReady) {
 TEST_F(OrchestratorFixture, DependentTaskIsPending) {
   // Task A produces an OUTPUT at key 0xBEEF
   auto args_a = single_tensor_args(0xBEEF, TensorArgType::OUTPUT);
-  auto a = orch.submit_next_level(0xDEAD, args_a, cfg);
+  auto a = orch.submit_next_level(42, args_a, cfg);
   TaskSlot a_slot;
   rq.try_pop(a_slot);
   // Task B reads INPUT at the same key -- depends on A
   auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT);
-  auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
+  auto b = orch.submit_next_level(42, args_b, cfg);
   EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING);
   EXPECT_EQ(S(b.task_slot).fanin_count, 1);
@@ -98,7 +98,7 @@ TEST_F(OrchestratorFixture, TensorMapTracksProducer) {
   auto args_a = single_tensor_args(0x1234, TensorArgType::OUTPUT);
-  auto a = orch.submit_next_level(0xDEAD, args_a,
cfg); + auto a = orch.submit_next_level(42, args_a, cfg); TaskSlot drain_slot; rq.try_pop(drain_slot); @@ -107,7 +107,7 @@ TEST_F(OrchestratorFixture, TensorMapTracksProducer) { TEST_F(OrchestratorFixture, OnConsumedCleansUpTensorMap) { auto args_a = single_tensor_args(0x42, TensorArgType::OUTPUT); - auto a = orch.submit_next_level(0xDEAD, args_a, cfg); + auto a = orch.submit_next_level(42, args_a, cfg); TaskSlot slot; rq.try_pop(slot); @@ -123,7 +123,7 @@ TEST_F(OrchestratorFixture, OnConsumedCleansUpTensorMap) { TEST_F(OrchestratorFixture, ScopeRegistersAndReleasesRef) { orch.scope_begin(); auto args_a = single_tensor_args(0x77, TensorArgType::OUTPUT); - auto a = orch.submit_next_level(0xDEAD, args_a, cfg); + auto a = orch.submit_next_level(42, args_a, cfg); TaskSlot slot; rq.try_pop(slot); @@ -147,13 +147,13 @@ TEST_F(OrchestratorFixture, ScopeRegistersAndReleasesRef) { TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) { // OUTPUT-tagged input registers a producer auto args_a = single_tensor_args(0xAAAA, TensorArgType::OUTPUT); - auto a = orch.submit_next_level(0xDEAD, args_a, cfg); + auto a = orch.submit_next_level(42, args_a, cfg); TaskSlot drain_slot; rq.try_pop(drain_slot); // Second task references same key but tagged NO_DEP -- should be independent auto args_b = single_tensor_args(0xAAAA, TensorArgType::NO_DEP); - auto b = orch.submit_next_level(0xDEAD, args_b, cfg); + auto b = orch.submit_next_level(42, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::READY); EXPECT_EQ(S(b.task_slot).fanin_count, 0); } @@ -161,7 +161,7 @@ TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) { TEST_F(OrchestratorFixture, GroupTaskStoresArgsListPerMember) { TaskArgs a0 = single_tensor_args(0xA0, TensorArgType::OUTPUT); TaskArgs a1 = single_tensor_args(0xA1, TensorArgType::OUTPUT); - auto res = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg); + auto res = orch.submit_next_level_group(42, {a0, a1}, cfg); EXPECT_NE(res.task_slot, INVALID_SLOT); EXPECT_TRUE(S(res.task_slot).is_group()); @@ -179,7 +179,7 @@ TEST_F(OrchestratorFixture, GroupTaskStoresArgsListPerMember) { TEST_F(OrchestratorFixture, SingleTaskStoresTaskArgsDirectly) { TaskArgs a0 = single_tensor_args(0xC0, TensorArgType::OUTPUT); - auto res = orch.submit_next_level(0xDEAD, a0, cfg); + auto res = orch.submit_next_level(42, a0, cfg); ASSERT_NE(res.task_slot, INVALID_SLOT); EXPECT_FALSE(S(res.task_slot).is_group()); EXPECT_EQ(S(res.task_slot).group_size(), 1); @@ -200,7 +200,7 @@ TEST_F(OrchestratorFixture, OutputAutoAllocsFromHeapRing) { t.dtype = DataType::UINT8; args.add_tensor(t, TensorArgType::OUTPUT); - auto res = orch.submit_next_level(0xDEAD, args, cfg); + auto res = orch.submit_next_level(42, args, cfg); ASSERT_NE(res.task_slot, INVALID_SLOT); uint64_t data = S(res.task_slot).task_args.tensor(0).data; @@ -220,7 +220,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) { // the alloc-slot (so its HeapRing slab stays live while they write) // must tag the buffer INOUT. 
auto creator_args = single_tensor_args(0xFEED, TensorArgType::OUTPUT); - auto creator = orch.submit_next_level(0xDEAD, creator_args, cfg); + auto creator = orch.submit_next_level(42, creator_args, cfg); TaskSlot drain; rq.try_pop(drain); // Mark the creator COMPLETED so the new submit mimics the alloc-slot @@ -228,7 +228,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) { S(creator.task_slot).state.store(TaskState::COMPLETED, std::memory_order_relaxed); auto writer_args = single_tensor_args(0xFEED, TensorArgType::INOUT); - auto writer = orch.submit_next_level(0xDEAD, writer_args, cfg); + auto writer = orch.submit_next_level(42, writer_args, cfg); TaskSlot writer_slot; rq.try_pop(writer_slot); @@ -259,13 +259,13 @@ TEST_F(OrchestratorFixture, OutputAndOutputExistingAreInsertOnly) { }; for (Case c : {Case{0xABCD, TensorArgType::OUTPUT}, Case{0xBEEF, TensorArgType::OUTPUT_EXISTING}}) { auto prior_args = single_tensor_args(c.key, TensorArgType::OUTPUT); - auto prior = orch.submit_next_level(0xDEAD, prior_args, cfg); + auto prior = orch.submit_next_level(42, prior_args, cfg); TaskSlot drain; rq.try_pop(drain); S(prior.task_slot).state.store(TaskState::COMPLETED, std::memory_order_relaxed); auto writer_args = single_tensor_args(c.key, c.tag); - auto writer = orch.submit_next_level(0xDEAD, writer_args, cfg); + auto writer = orch.submit_next_level(42, writer_args, cfg); EXPECT_EQ(tm.lookup(TensorKey{c.key, -1}), writer.task_slot); EXPECT_EQ(S(writer.task_slot).fanin_count, 0); diff --git a/tests/ut/cpp/hierarchical/test_scheduler.cpp b/tests/ut/cpp/hierarchical/test_scheduler.cpp index a66dcfd27..2fc7ba8c1 100644 --- a/tests/ut/cpp/hierarchical/test_scheduler.cpp +++ b/tests/ut/cpp/hierarchical/test_scheduler.cpp @@ -267,13 +267,13 @@ struct SchedulerFixture : public ::testing::Test { TEST_F(SchedulerFixture, IndependentTaskDispatchedAndConsumed) { auto args_a = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); - auto res = orch.submit_next_level(0xDEAD, args_a, cfg); + auto res = orch.submit_next_level(42, args_a, cfg); TaskSlot slot = res.task_slot; mock_worker.wait_running(); ASSERT_GE(mock_worker.dispatched_count(), 1); EXPECT_EQ(mock_worker.dispatched[0].tensor_key, 0xCAFEu); - EXPECT_EQ(mock_worker.dispatched[0].callable, 0xDEADu); + EXPECT_EQ(mock_worker.dispatched[0].callable, 42u); mock_worker.complete(); wait_consumed(slot); @@ -281,14 +281,14 @@ TEST_F(SchedulerFixture, IndependentTaskDispatchedAndConsumed) { TEST_F(SchedulerFixture, DependentTaskDispatchedAfterProducerCompletes) { auto args_a = single_tensor_args(0xBEEF, TensorArgType::OUTPUT); - auto a = orch.submit_next_level(0xAA, args_a, cfg); + auto a = orch.submit_next_level(10, args_a, cfg); auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT); - auto b = orch.submit_next_level(0xBB, args_b, cfg); + auto b = orch.submit_next_level(11, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING); mock_worker.wait_running(); - EXPECT_EQ(mock_worker.dispatched[0].callable, 0xAAu); + EXPECT_EQ(mock_worker.dispatched[0].callable, 10u); mock_worker.complete(); // A done auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(300); @@ -296,7 +296,7 @@ TEST_F(SchedulerFixture, DependentTaskDispatchedAfterProducerCompletes) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } ASSERT_GE(mock_worker.dispatched_count(), 2); - EXPECT_EQ(mock_worker.dispatched[1].callable, 0xBBu); + EXPECT_EQ(mock_worker.dispatched[1].callable, 11u); mock_worker.complete(); // B done 
wait_consumed(b.task_slot); @@ -375,7 +375,7 @@ TEST_F(GroupSchedulerFixture, GroupDispatchesToNWorkers) { TaskArgs a0 = single_tensor_args(0xA0, TensorArgType::OUTPUT); TaskArgs a1 = single_tensor_args(0xA1, TensorArgType::OUTPUT); - auto res = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg); + auto res = orch.submit_next_level_group(42, {a0, a1}, cfg); TaskSlot slot = res.task_slot; worker_a.wait_running(); @@ -400,7 +400,7 @@ TEST_F(GroupSchedulerFixture, GroupDispatchesToNWorkers) { TEST_F(GroupSchedulerFixture, GroupCompletesOnlyWhenAllDone) { TaskArgs a0 = single_tensor_args(0xB0, TensorArgType::OUTPUT); TaskArgs a1 = single_tensor_args(0xB1, TensorArgType::OUTPUT); - auto res = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg); + auto res = orch.submit_next_level_group(42, {a0, a1}, cfg); TaskSlot slot = res.task_slot; worker_a.wait_running(); @@ -491,7 +491,7 @@ TEST_F(MixedTypeSchedulerFixture, SubTaskDispatchesWhileNextLevelPoolSaturated) // Submit a next-level task; the only chip worker begins running it and // stays blocked until we call complete() on it. auto chip_args = single_tensor_args(0xAAA, TensorArgType::OUTPUT); - auto chip = orch.submit_next_level(0xCDCD, chip_args, cfg); + auto chip = orch.submit_next_level(20, chip_args, cfg); next_level_worker.wait_running(); ASSERT_TRUE(next_level_worker.is_running.load()); @@ -522,10 +522,10 @@ TEST_F(GroupSchedulerFixture, GroupDependencyChain) { // Task B reads INPUT at the same key -- depends on group A. TaskArgs a0 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); TaskArgs a1 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); - auto a = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg); + auto a = orch.submit_next_level_group(42, {a0, a1}, cfg); auto args_b = single_tensor_args(0xCAFE, TensorArgType::INPUT); - auto b = orch.submit_next_level(0xDEAD, args_b, cfg); + auto b = orch.submit_next_level(42, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING); worker_a.wait_running(); diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 520254cc5..d6489dc09 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -68,19 +68,6 @@ def test_initial_state(self): assert worker.device_set is False assert worker.device_id == -1 - def test_run_before_set_device_raises(self): - from _task_interface import ChipCallable, ChipStorageTaskArgs # noqa: PLC0415 - - worker = _ChipWorker() - config = CallConfig() - args = ChipStorageTaskArgs() - - # Build a minimal ChipCallable for the test - callable_obj = ChipCallable.build(signature=[], func_name="test", binary=b"\x00", children=[]) - - with pytest.raises(RuntimeError, match="device not set"): - worker.run(callable_obj, args, config) - def test_set_device_before_init_raises(self): worker = _ChipWorker() with pytest.raises(RuntimeError, match="not initialized"): @@ -110,6 +97,28 @@ def test_init_with_nonexistent_lib_raises(self): with pytest.raises(RuntimeError, match="dlopen"): worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "/nonexistent/libsimpler_log.so") + def test_prepare_callable_before_set_device_raises(self): + from _task_interface import ChipCallable # noqa: PLC0415 + + worker = _ChipWorker() + callable_obj = ChipCallable.build(signature=[], func_name="test", binary=b"\x00", children=[]) + with pytest.raises(RuntimeError, match="device not set"): + worker.prepare_callable(0, callable_obj) + + def test_run_prepared_before_set_device_raises(self): + from _task_interface import 
ChipStorageTaskArgs # noqa: PLC0415 + + worker = _ChipWorker() + config = CallConfig() + args = ChipStorageTaskArgs() + with pytest.raises(RuntimeError, match="device not set"): + worker.run_prepared(0, args, config) + + def test_unregister_callable_before_set_device_raises(self): + worker = _ChipWorker() + with pytest.raises(RuntimeError, match="device not set"): + worker.unregister_callable(0) + # ============================================================================ # Python-level ChipWorker wrapper tests