Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9a06387
feat(callable): per-callable_id orch SO dispatch on AICPU (a2a3 trb)
poursoul Apr 29, 2026
d931ec2
Add: prepare_callable / run_prepared / unregister_callable C ABI
poursoul Apr 29, 2026
97e56d1
feat(callable): expose prepare/run_prepared/unregister via ChipWorker…
poursoul Apr 29, 2026
6621829
feat(callable): port Stage 1 ABI surface to remaining variants
poursoul Apr 29, 2026
8690d32
feat(callable): Stage 3 — mailbox cid protocol + L3+ lazy-register + …
poursoul May 6, 2026
da1a0eb
feat(callable): Stage 4 — unify L2 API on register + run(cid)
poursoul May 6, 2026
67e12e7
Add: aicpu_dlopen_count getter for callable registration verification
poursoul May 6, 2026
ee90d6a
fix(pr): address review and CI issues for #710
poursoul May 6, 2026
9a80d71
refactor(callable): unify MAX_REGISTERED_CALLABLE_IDS source of truth
poursoul May 6, 2026
8667074
fix(callable): chip_process_loop falls back to legacy run when varian…
poursoul May 6, 2026
642e95a
fix(callable): scope orch SO file name by callable_id (a2a3 507018)
poursoul May 6, 2026
5f264ef
fix(pr): resolve CI failures for #710
poursoul May 7, 2026
80f2be5
feat(callable): Phase 0 — add active_callable_id_ to all runtimes
poursoul May 7, 2026
22ea75d
feat(callable): Phase 1 — port a5/trb to per-cid orch SO table
poursoul May 7, 2026
e5f6656
feat(callable): Phase 1 — mirror prepared_callable ST test to a5/trb
poursoul May 7, 2026
3bedebc
feat(callable): Phase 2 — host_build_graph prepare/run_prepared with …
poursoul May 8, 2026
b89c391
feat(callable): Phase 2 — add prepared_callable ST tests for hbg vari…
poursoul May 8, 2026
c0ff9f2
refactor(callable): Phase 3 — drop RUNTIME_HAS_CALLABLE_ID and RUNTIM…
poursoul May 8, 2026
e15489d
refactor(callable): Phase 3 — drop Python fallback to legacy run()
poursoul May 8, 2026
b976a90
refactor(callable): Phase 4 — drop run_runtime / init_runtime_impl le…
poursoul May 8, 2026
8e7b291
refactor(callable): Phase 4 — drop has_new_orch_so_ and AICPU legacy …
poursoul May 8, 2026
1293581
fix(pr): migrate vector_add and child_memory examples to register/cid…
poursoul May 8, 2026
a1bd0ff
fix(pr): silence ruff PLR0915 on _chip_process_loop_with_bootstrap
poursoul May 8, 2026
55a8c7e
fix(pr): make a5 prepared_callable test mirror a2a3, add a5 kernel-bi…
poursoul May 8, 2026
f5911a7
fix(callable): preserve child_memory flag in mailbox args deserializa…
poursoul May 8, 2026
faeedd4
refactor(callable): chip child loops use raw blob path; consolidate a…
poursoul May 9, 2026
afae373
fix(pr): plug callable SO leaks and drop in-comment doc anchors
poursoul May 9, 2026
aa38eb7
fix(pr): align stragglers with cid API contract for #710
poursoul May 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,11 +920,22 @@ def st_worker(request, st_platform, device_pool, _l2_worker_pool):

# Register SubCallable entries from cls.CALLABLE
sub_ids = {}
chip_cids = {}
for entry in cls.CALLABLE.get("callables", []):
if "callable" in entry:
cid = w.register(entry["callable"])
sub_ids[entry["name"]] = cid
elif "orchestration" in entry:
from simpler_setup.scene_test import _compile_chip_callable_from_spec # noqa: PLC0415

name = entry["name"]
cache_key = (cls.__qualname__, name, st_platform, runtime)
chip = _compile_chip_callable_from_spec(entry, st_platform, runtime, cache_key)
cid = w.register(chip)
chip_cids[name] = cid
chip_cids[f"{name}_sig"] = entry["orchestration"].get("signature", [])
cls._st_sub_ids = sub_ids
cls._st_chip_cids = chip_cids

w.init()
yield w
Expand Down
11 changes: 9 additions & 2 deletions docs/getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,21 @@ worker.init(host_path=str(binaries.host_path),
aicore_path=str(binaries.aicore_path))
worker.set_device(device_id=0)

# Execute callable on device
worker.run(chip_callable, orch_args, block_dim=24)
# Register the ChipCallable to obtain a callable_id
cid = worker.register(chip_callable)

# Execute the registered callable on device
worker.run(cid, orch_args, block_dim=24)

# Cleanup
worker.reset_device()
worker.finalize()
```

`ChipWorker` follows the same `register → run(cid)` contract as
`Worker(level=2)`; reach for the high-level `Worker` first and use
`ChipWorker` only when a low-level handle is required.

## Configuration

### Compile-time Configuration (Runtime Limits)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def run(
chip_bootstrap_configs=cfgs,
build=build,
)
chip_cid = worker.register(chip_callable)
try:
worker.init()
contexts: list[ChipContext] = worker.chip_contexts
Expand All @@ -157,7 +158,7 @@ def orch_fn(orch, _args, cfg):
TensorArgType.INPUT,
)
args.add_scalar(ctx.device_ctx)
orch.submit_next_level(chip_callable, args, cfg, worker=rank)
orch.submit_next_level(chip_cid, args, cfg, worker=rank)

worker.run(orch_fn, args=None, config=CallConfig())

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ def run(
chip_bootstrap_configs=cfgs,
build=build,
)
chip_cid = worker.register(chip_callable)
try:
worker.init()
contexts: list[ChipContext] = worker.chip_contexts
Expand Down Expand Up @@ -187,7 +188,7 @@ def orch_fn(orch, _args, cfg):
TensorArgType.INPUT,
)
args.add_scalar(ctx.device_ctx)
orch.submit_next_level(chip_callable, args, cfg, worker=rank)
orch.submit_next_level(chip_cid, args, cfg, worker=rank)

worker.run(orch_fn, args=None, config=CallConfig())

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ def run(
chip_bootstrap_configs=cfgs,
build=build,
)
chip_cid = worker.register(chip_callable)
try:
worker.init()
contexts: list[ChipContext] = worker.chip_contexts
Expand All @@ -191,7 +192,7 @@ def orch_fn(orch, _args, cfg):
args.add_tensor(make_tensor_arg(out[rank]), TensorArgType.OUTPUT_EXISTING)
args.add_tensor(make_tensor_arg(result[rank]), TensorArgType.OUTPUT_EXISTING)
args.add_scalar(ctx.device_ctx)
orch.submit_next_level(chip_callable, args, cfg, worker=rank)
orch.submit_next_level(chip_cid, args, cfg, worker=rank)

worker.run(orch_fn, args=None, config=CallConfig())

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def run(platform: str = "a5", device_ids: list[int] | None = None, pto_isa_commi
num_sub_workers=0,
chip_bootstrap_configs=cfgs,
)
chip_cid = worker.register(chip_callable)
try:
worker.init()
contexts: list[ChipContext] = worker.chip_contexts
Expand All @@ -151,7 +152,7 @@ def orch_fn(orch, _args, cfg):
TensorArgType.INPUT,
)
args.add_scalar(ctx.device_ctx)
orch.submit_next_level(chip_callable, args, cfg, worker=rank)
orch.submit_next_level(chip_cid, args, cfg, worker=rank)

worker.run(orch_fn, args=None, config=CallConfig())

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ def run(
chip_bootstrap_configs=cfgs,
build=build,
)
chip_cid = worker.register(chip_callable)
try:
worker.init()
contexts: list[ChipContext] = worker.chip_contexts
Expand Down Expand Up @@ -187,7 +188,7 @@ def orch_fn(orch, _args, cfg):
TensorArgType.INPUT,
)
args.add_scalar(ctx.device_ctx)
orch.submit_next_level(chip_callable, args, cfg, worker=rank)
orch.submit_next_level(chip_cid, args, cfg, worker=rank)

worker.run(orch_fn, args=None, config=CallConfig())

Expand Down
11 changes: 9 additions & 2 deletions examples/workers/l2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,19 @@ worker = Worker(
)
worker.init() # load host.so + aicpu.so + aicore.o, set device
try:
# ... allocate device buffers, build ChipCallable, run ...
worker.run(chip_callable, task_args, call_config)
# ... allocate device buffers, build ChipCallable ...
    cid = worker.register(chip_callable)  # register once; the cid stays valid and is reused across runs
worker.run(cid, task_args, call_config)
finally:
worker.close() # release ACL resources and device
```

`register()` is the only way to obtain a `cid`; `worker.run` always takes
that int, never the raw `ChipCallable`. A cid stays valid for the
lifetime of the worker, so you register once and reuse it across runs —
this is also why ST cases cache the cid on the test class (see
`_st_l2_cid` in `simpler_setup/scene_test.py`).

The `try/finally` is important — if anything between `init()` and `close()`
raises, you still want the device released. The
[L2 conftest leak issue](https://github.com/hw-native-sys/simpler/issues/604)
Expand Down
2 changes: 1 addition & 1 deletion examples/workers/l2/vector_add/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ args.add_tensor(ContinuousTensor.make(dev_a, shape, DataType.FLOAT32))
args.add_tensor(ContinuousTensor.make(dev_b, shape, DataType.FLOAT32))
args.add_tensor(ContinuousTensor.make(dev_out, shape, DataType.FLOAT32))

worker.run(chip_callable, args, CallConfig())
worker.run(chip_cid, args, CallConfig()) # chip_cid = worker.register(chip_callable) before init()
```

The tensor order must match `signature` order on the `ChipCallable`. `run()`
Expand Down
11 changes: 7 additions & 4 deletions examples/workers/l2/vector_add/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
host arrays ──[worker.malloc + copy_to]──► device buffers
worker.run(chip_callable, task_args, cfg)
chip_cid = worker.register(chip_callable) # before init()
worker.run(chip_cid, task_args, cfg)
device result ──[worker.copy_from]──► host array ──[torch compare]

Expand Down Expand Up @@ -126,7 +127,7 @@ def build_chip_callable(platform: str) -> ChipCallable:
)


def _run(worker: Worker, chip_callable: ChipCallable) -> None:
def _run(worker: Worker, chip_cid: int) -> None:
"""Allocate device memory, copy inputs, execute, copy outputs back, verify."""
# --- 1. Prepare host arrays ---
torch.manual_seed(42)
Expand Down Expand Up @@ -154,7 +155,7 @@ def _run(worker: Worker, chip_callable: ChipCallable) -> None:
# --- 4. Run. CallConfig() defaults are fine for this kernel. ---
config = CallConfig()
print("[vector_add] running on device...")
worker.run(chip_callable, args, config)
worker.run(chip_cid, args, config)

# --- 5. D2H copy back + verify ---
worker.copy_from(host_out.data_ptr(), dev_out, NBYTES)
Expand Down Expand Up @@ -183,10 +184,12 @@ def run(platform: str, device_id: int) -> int:
chip_callable = build_chip_callable(platform)
print(f"[vector_add] compiled. binary_size={chip_callable.binary_size} bytes")

chip_cid = worker.register(chip_callable)

print(f"[vector_add] init worker (device={device_id})...")
worker.init()
try:
_run(worker, chip_callable)
_run(worker, chip_cid)
finally:
worker.close()
return 0
Expand Down
3 changes: 2 additions & 1 deletion examples/workers/l3/allreduce_distributed/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ def run(device_ids: list[int]) -> int:
num_sub_workers=0,
chip_bootstrap_configs=cfgs,
)
chip_cid = worker.register(chip_callable)

try:
print("[allreduce] init worker (forks chip children + bootstraps HCCL)...")
Expand Down Expand Up @@ -227,7 +228,7 @@ def orch_fn(orch, _args, cfg):
)
chip_args.add_scalar(ctx.nranks)
chip_args.add_scalar(ctx.device_ctx)
orch.submit_next_level(chip_callable, chip_args, cfg, worker=i)
orch.submit_next_level(chip_cid, chip_args, cfg, worker=i)

print("[allreduce] running 2-chip allreduce DAG...")
worker.run(orch_fn, args=None, config=CallConfig())
Expand Down
3 changes: 2 additions & 1 deletion examples/workers/l3/child_memory/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ def run(platform: str, device_id: int) -> int:

print(f"[child_memory] compiling kernels for {platform}...")
chip_callable = build_chip_callable(platform)
chip_cid = worker.register(chip_callable)

print("[child_memory] init worker...")
worker.init()
Expand All @@ -172,7 +173,7 @@ def orch_fn(orch, _args, cfg):
a.add_tensor(make_tensor_arg(host_a), TensorArgType.INPUT)
a.add_tensor(w_dev, TensorArgType.INPUT)
a.add_tensor(make_tensor_arg(out), TensorArgType.OUTPUT_EXISTING)
orch.submit_next_level(chip_callable, a, cfg, worker=0)
orch.submit_next_level(chip_cid, a, cfg, worker=0)

# dev_w is reclaimed by DeviceRunner::finalize on worker.close() —
# we don't orch.free it here, that's the whole point of child_memory.
Expand Down
6 changes: 4 additions & 2 deletions examples/workers/l3/ffn_tp_parallel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,8 @@ def run(device_ids: list[int]) -> int:
num_sub_workers=0,
chip_bootstrap_configs=cfgs,
)
ffn_cid = worker.register(ffn_local_cc)
allreduce_cid = worker.register(allreduce_cc)

try:
print("[ffn_tp_parallel] init worker (forks chip children + bootstraps HCCL)...")
Expand All @@ -231,7 +233,7 @@ def orch_fn(orch, _args, cfg):
a1.add_tensor(make_tensor_arg(host_x_shards[i]), TensorArgType.INPUT)
a1.add_tensor(make_tensor_arg(host_w_shards[i]), TensorArgType.INPUT)
a1.add_tensor(make_tensor_arg(host_partial[i]), TensorArgType.OUTPUT_EXISTING)
orch.submit_next_level(ffn_local_cc, a1, cfg, worker=i)
orch.submit_next_level(ffn_cid, a1, cfg, worker=i)

# Stage 2: AIV cross-rank sum. Tagging partial_local INPUT
# with the same buffer.addr makes TensorMap auto-link this
Expand All @@ -250,7 +252,7 @@ def orch_fn(orch, _args, cfg):
)
a2.add_scalar(ctx.nranks)
a2.add_scalar(ctx.device_ctx)
orch.submit_next_level(allreduce_cc, a2, cfg, worker=i)
orch.submit_next_level(allreduce_cid, a2, cfg, worker=i)

print("[ffn_tp_parallel] running 2-chip 2-stage DAG...")
worker.run(orch_fn, args=None, config=CallConfig())
Expand Down
18 changes: 11 additions & 7 deletions examples/workers/l3/multi_chip_dispatch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ chip outputs. The smallest correct L3 program.
| ------- | ------------------------------ |
| Shared-memory tensors | `torch.randn(...).share_memory_()` — chip children see the same storage |
| `TensorArgType` tags | `INPUT` / `OUTPUT_EXISTING` drive DAG dependency tracking |
| Python SubWorker | `worker.register(fn)` **before** `init()` |
| ChipCallable id | `chip_cid = worker.register(chip_callable)` **before** `init()` |
| Python SubWorker | `sub_cid = worker.register(fn)` **before** `init()` |
| `Worker(level=3)` config | `device_ids=[0, 1]`, `num_sub_workers=1` |
| Orchestration | `orch.submit_next_level(...)` per chip + `orch.submit_sub(cid, args)` |
| Orchestration | `orch.submit_next_level(chip_cid, ...)` per chip + `orch.submit_sub(sub_cid, args)` |

## Layout

Expand Down Expand Up @@ -66,17 +67,20 @@ host_b = [torch.randn(...).share_memory_() for _ in device_ids]
host_out = [torch.zeros(...).share_memory_() for _ in device_ids]

def subworker(sub_args): ...
sub_cid = worker.register(subworker) # BEFORE init() — see below
chip_cid = worker.register(chip_callable) # ChipCallable: BEFORE init()
sub_cid = worker.register(subworker) # Python SubWorker: BEFORE init()
```

`share_memory_()` moves the tensor's storage to a `mmap` region. After
`fork()`, the chip child process has that region mapped at the same virtual
address, so when the kernel writes to `host_out[i]`, the parent's tensor sees
it immediately. No explicit copy back.

**`register()` MUST come before `init()`**. `init()` forks child processes;
the registry is captured by copy-on-write. Anything registered after `init()`
is invisible to the forked children.
**`register()` MUST come before `init()`** for *every* callable — both
the `ChipCallable` dispatched to chips and the Python sub functions.
`init()` forks child processes; the registry is captured by copy-on-write.
Anything registered after `init()` is invisible to the forked children,
and `Worker.register()` at L≥3 raises if called post-init.

### 2. `init()` — fork + C++ scheduler

Expand All @@ -93,7 +97,7 @@ def orch_fn(orch, _args, cfg):
chip_args.add_tensor(make_tensor_arg(host_a[i]), TensorArgType.INPUT)
chip_args.add_tensor(make_tensor_arg(host_b[i]), TensorArgType.INPUT)
chip_args.add_tensor(make_tensor_arg(host_out[i]), TensorArgType.OUTPUT_EXISTING)
orch.submit_next_level(chip_callable, chip_args, cfg, worker=i)
orch.submit_next_level(chip_cid, chip_args, cfg, worker=i)

sub_args = TaskArgs()
for i in range(len(device_ids)):
Expand Down
5 changes: 4 additions & 1 deletion examples/workers/l3/multi_chip_dispatch/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ def subworker(sub_args: TaskArgs) -> None:
print(f"[multi_chip_dispatch] compiling kernels for {platform}...")
chip_callable = build_chip_callable(platform)

# Register the ChipCallable so submit_next_level takes a cid.
chip_cid = worker.register(chip_callable)

# --- 5. init() forks chip + sub child processes, starts C++ scheduler.
print("[multi_chip_dispatch] init worker...")
worker.init()
Expand All @@ -165,7 +168,7 @@ def orch_fn(orch, _args, cfg):
chip_args.add_tensor(make_tensor_arg(host_a[i]), TensorArgType.INPUT)
chip_args.add_tensor(make_tensor_arg(host_b[i]), TensorArgType.INPUT)
chip_args.add_tensor(make_tensor_arg(host_out[i]), TensorArgType.OUTPUT_EXISTING)
orch.submit_next_level(chip_callable, chip_args, cfg, worker=i)
orch.submit_next_level(chip_cid, chip_args, cfg, worker=i)

# Sub task that depends on both chip outputs. Tagging the two
# host_out[i] tensors INPUT tells the scheduler to wait for
Expand Down
Loading