diff --git a/conftest.py b/conftest.py
index bde0b7800..5d029caaf 100644
--- a/conftest.py
+++ b/conftest.py
@@ -920,11 +920,22 @@ def st_worker(request, st_platform, device_pool, _l2_worker_pool):
     # Register SubCallable entries from cls.CALLABLE
     sub_ids = {}
+    chip_cids = {}
     for entry in cls.CALLABLE.get("callables", []):
         if "callable" in entry:
             cid = w.register(entry["callable"])
             sub_ids[entry["name"]] = cid
+        elif "orchestration" in entry:
+            from simpler_setup.scene_test import _compile_chip_callable_from_spec  # noqa: PLC0415
+
+            name = entry["name"]
+            cache_key = (cls.__qualname__, name, st_platform, runtime)
+            chip = _compile_chip_callable_from_spec(entry, st_platform, runtime, cache_key)
+            cid = w.register(chip)
+            chip_cids[name] = cid
+            chip_cids[f"{name}_sig"] = entry["orchestration"].get("signature", [])
     cls._st_sub_ids = sub_ids
+    cls._st_chip_cids = chip_cids
     w.init()
     yield w
diff --git a/docs/getting-started.md b/docs/getting-started.md
index a7232d2bf..14a1d3a3f 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -166,14 +166,21 @@
 worker.init(host_path=str(binaries.host_path),
             aicore_path=str(binaries.aicore_path))
 worker.set_device(device_id=0)
 
-# Execute callable on device
-worker.run(chip_callable, orch_args, block_dim=24)
+# Register the ChipCallable to obtain a callable_id
+cid = worker.register(chip_callable)
+
+# Execute the registered callable on device
+worker.run(cid, orch_args, block_dim=24)
 
 # Cleanup
 worker.reset_device()
 worker.finalize()
 ```
 
+`ChipWorker` follows the same `register → run(cid)` contract as
+`Worker(level=2)`; reach for the high-level `Worker` first and use
+`ChipWorker` only when a low-level handle is required.
+
 ## Configuration
 
 ### Compile-time Configuration (Runtime Limits)
diff --git a/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py b/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
index 9977a3a4b..7461f1b7e 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
@@ -137,6 +137,7 @@ def run(
         chip_bootstrap_configs=cfgs,
         build=build,
     )
+    chip_cid = worker.register(chip_callable)
     try:
         worker.init()
         contexts: list[ChipContext] = worker.chip_contexts
@@ -157,7 +158,7 @@ def orch_fn(orch, _args, cfg):
                     TensorArgType.INPUT,
                 )
                 args.add_scalar(ctx.device_ctx)
-                orch.submit_next_level(chip_callable, args, cfg, worker=rank)
+                orch.submit_next_level(chip_cid, args, cfg, worker=rank)
 
         worker.run(orch_fn, args=None, config=CallConfig())
diff --git a/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py b/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
index 6045efe4d..31cd3c479 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
@@ -159,6 +159,7 @@ def run(
         chip_bootstrap_configs=cfgs,
         build=build,
     )
+    chip_cid = worker.register(chip_callable)
     try:
         worker.init()
         contexts: list[ChipContext] = worker.chip_contexts
@@ -187,7 +188,7 @@ def orch_fn(orch, _args, cfg):
                     TensorArgType.INPUT,
                 )
                 args.add_scalar(ctx.device_ctx)
-                orch.submit_next_level(chip_callable, args, cfg, worker=rank)
+                orch.submit_next_level(chip_cid, args, cfg, worker=rank)
 
         worker.run(orch_fn, args=None, config=CallConfig())
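Every demo touched above follows the same two-line migration: register the `ChipCallable` before `init()`, then pass the returned cid to `submit_next_level` inside the orch fn. A condensed sketch of that pattern, assuming the `Worker` / `TaskArgs` API from `python/simpler` in this patch; `chip_callable` and `host_inputs` are stand-ins, not code from any one demo:

```python
from simpler.task_interface import CallConfig, TaskArgs, TensorArgType, make_tensor_arg
from simpler.worker import Worker

worker = Worker(level=3, device_ids=[0, 1], num_sub_workers=0,
                platform="a2a3", runtime="tensormap_and_ringbuffer")
chip_cid = worker.register(chip_callable)  # must precede init() at L>=3

try:
    worker.init()

    def orch_fn(orch, _args, cfg):
        for rank, tensor in enumerate(host_inputs):  # stand-in host tensors
            args = TaskArgs()
            args.add_tensor(make_tensor_arg(tensor), TensorArgType.INPUT)
            orch.submit_next_level(chip_cid, args, cfg, worker=rank)

    worker.run(orch_fn, args=None, config=CallConfig())
finally:
    worker.close()
```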
diff --git a/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py b/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py
index fd370712d..5033e0a95 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/sdma_async_completion_demo/test_sdma_async_completion_demo.py
@@ -172,6 +172,7 @@ def run(
         chip_bootstrap_configs=cfgs,
         build=build,
     )
+    chip_cid = worker.register(chip_callable)
     try:
         worker.init()
         contexts: list[ChipContext] = worker.chip_contexts
@@ -191,7 +192,7 @@ def orch_fn(orch, _args, cfg):
             args.add_tensor(make_tensor_arg(out[rank]), TensorArgType.OUTPUT_EXISTING)
             args.add_tensor(make_tensor_arg(result[rank]), TensorArgType.OUTPUT_EXISTING)
             args.add_scalar(ctx.device_ctx)
-            orch.submit_next_level(chip_callable, args, cfg, worker=rank)
+            orch.submit_next_level(chip_cid, args, cfg, worker=rank)
 
         worker.run(orch_fn, args=None, config=CallConfig())
diff --git a/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py b/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
index 2bfab2131..c71fe5498 100644
--- a/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
+++ b/examples/a5/tensormap_and_ringbuffer/async_notify_demo/test_async_notify_demo.py
@@ -131,6 +131,7 @@ def run(platform: str = "a5", device_ids: list[int] | None = None, pto_isa_commi
         num_sub_workers=0,
         chip_bootstrap_configs=cfgs,
     )
+    chip_cid = worker.register(chip_callable)
     try:
         worker.init()
         contexts: list[ChipContext] = worker.chip_contexts
@@ -151,7 +152,7 @@ def orch_fn(orch, _args, cfg):
                     TensorArgType.INPUT,
                 )
                 args.add_scalar(ctx.device_ctx)
-                orch.submit_next_level(chip_callable, args, cfg, worker=rank)
+                orch.submit_next_level(chip_cid, args, cfg, worker=rank)
 
         worker.run(orch_fn, args=None, config=CallConfig())
diff --git a/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py b/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
index e20b2ecec..d05e19b13 100644
--- a/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
+++ b/examples/a5/tensormap_and_ringbuffer/deferred_notify_demo/test_deferred_notify_demo.py
@@ -159,6 +159,7 @@ def run(
         chip_bootstrap_configs=cfgs,
         build=build,
     )
+    chip_cid = worker.register(chip_callable)
     try:
         worker.init()
         contexts: list[ChipContext] = worker.chip_contexts
@@ -187,7 +188,7 @@ def orch_fn(orch, _args, cfg):
                     TensorArgType.INPUT,
                 )
                 args.add_scalar(ctx.device_ctx)
-                orch.submit_next_level(chip_callable, args, cfg, worker=rank)
+                orch.submit_next_level(chip_cid, args, cfg, worker=rank)
 
         worker.run(orch_fn, args=None, config=CallConfig())
diff --git a/examples/workers/l2/README.md b/examples/workers/l2/README.md
index 242ca926f..fb6f29251 100644
--- a/examples/workers/l2/README.md
+++ b/examples/workers/l2/README.md
@@ -23,12 +23,19 @@
 worker = Worker(
     ...
 )
 worker.init()  # load host.so + aicpu.so + aicore.o, set device
 try:
-    # ... allocate device buffers, build ChipCallable, run ...
-    worker.run(chip_callable, task_args, call_config)
+    # ... allocate device buffers, build ChipCallable ...
+    cid = worker.register(chip_callable)  # one-shot: cid is reused across runs
+    worker.run(cid, task_args, call_config)
 finally:
     worker.close()  # release ACL resources and device
 ```
 
+`register()` is the only way to obtain a `cid`; `worker.run` always takes
+that int, never the raw `ChipCallable`. A cid stays valid for the
+lifetime of the worker, so you register once and reuse it across runs —
+this is also why ST cases cache the cid on the test class (see
+`_st_l2_cid` in `simpler_setup/scene_test.py`).
+
 The `try/finally` is important — if anything between `init()` and
 `close()` raises, you still want the device released. The
 [L2 conftest leak issue](https://github.com/hw-native-sys/simpler/issues/604)
diff --git a/examples/workers/l2/vector_add/README.md b/examples/workers/l2/vector_add/README.md
index 7e5776d9e..e294fb2aa 100644
--- a/examples/workers/l2/vector_add/README.md
+++ b/examples/workers/l2/vector_add/README.md
@@ -96,7 +96,7 @@
 args.add_tensor(ContinuousTensor.make(dev_a, shape, DataType.FLOAT32))
 args.add_tensor(ContinuousTensor.make(dev_b, shape, DataType.FLOAT32))
 args.add_tensor(ContinuousTensor.make(dev_out, shape, DataType.FLOAT32))
 
-worker.run(chip_callable, args, CallConfig())
+worker.run(chip_cid, args, CallConfig())  # chip_cid = worker.register(chip_callable) before init()
 ```
 
 The tensor order must match `signature` order on the `ChipCallable`. `run()`
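Putting the two READMEs above together, a minimal end-to-end L2 sketch; buffer sizes, shapes, and the `build_chip_callable` helper are illustrative placeholders, and the H2D/D2H copies are elided:

```python
from simpler.task_interface import CallConfig, ContinuousTensor, DataType, TaskArgs
from simpler.worker import Worker

worker = Worker(level=2, device_id=0, platform="a2a3",
                runtime="tensormap_and_ringbuffer")
chip_cid = worker.register(build_chip_callable("a2a3"))  # placeholder builder
worker.init()
try:
    nbytes = 1024 * 4                     # one fp32 vector
    dev_in = worker.malloc(nbytes)        # copy_to / copy_from elided
    dev_out = worker.malloc(nbytes)
    args = TaskArgs()
    args.add_tensor(ContinuousTensor.make(dev_in, (1024,), DataType.FLOAT32))
    args.add_tensor(ContinuousTensor.make(dev_out, (1024,), DataType.FLOAT32))
    worker.run(chip_cid, args, CallConfig())   # the cid, never the ChipCallable
    worker.run(chip_cid, args, CallConfig())   # cid stays valid across runs
finally:
    worker.close()
```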
diff --git a/examples/workers/l2/vector_add/main.py b/examples/workers/l2/vector_add/main.py
index 94867ca32..6ad1480ad 100644
--- a/examples/workers/l2/vector_add/main.py
+++ b/examples/workers/l2/vector_add/main.py
@@ -19,7 +19,8 @@
     host arrays ──[worker.malloc + copy_to]──► device buffers
                        │
                        ▼
-    worker.run(chip_callable, task_args, cfg)
+    chip_cid = worker.register(chip_callable)   # before init()
+    worker.run(chip_cid, task_args, cfg)
                        │
     device result ──[worker.copy_from]──► host array ──[torch compare]
@@ -126,7 +127,7 @@ def build_chip_callable(platform: str) -> ChipCallable:
     )
 
 
-def _run(worker: Worker, chip_callable: ChipCallable) -> None:
+def _run(worker: Worker, chip_cid: int) -> None:
     """Allocate device memory, copy inputs, execute, copy outputs back, verify."""
     # --- 1. Prepare host arrays ---
     torch.manual_seed(42)
@@ -154,7 +155,7 @@
     # --- 4. Run. CallConfig() defaults are fine for this kernel. ---
     config = CallConfig()
     print("[vector_add] running on device...")
-    worker.run(chip_callable, args, config)
+    worker.run(chip_cid, args, config)
 
     # --- 5. D2H copy back + verify ---
     worker.copy_from(host_out.data_ptr(), dev_out, NBYTES)
@@ -183,10 +184,12 @@ def run(platform: str, device_id: int) -> int:
     chip_callable = build_chip_callable(platform)
     print(f"[vector_add] compiled. binary_size={chip_callable.binary_size} bytes")
 
+    chip_cid = worker.register(chip_callable)
+
     print(f"[vector_add] init worker (device={device_id})...")
     worker.init()
     try:
-        _run(worker, chip_callable)
+        _run(worker, chip_cid)
     finally:
         worker.close()
     return 0
diff --git a/examples/workers/l3/allreduce_distributed/main.py b/examples/workers/l3/allreduce_distributed/main.py
index bd646df82..0dfa3d4de 100644
--- a/examples/workers/l3/allreduce_distributed/main.py
+++ b/examples/workers/l3/allreduce_distributed/main.py
@@ -194,6 +194,7 @@ def run(device_ids: list[int]) -> int:
         num_sub_workers=0,
         chip_bootstrap_configs=cfgs,
     )
+    chip_cid = worker.register(chip_callable)
 
     try:
         print("[allreduce] init worker (forks chip children + bootstraps HCCL)...")
@@ -227,7 +228,7 @@ def orch_fn(orch, _args, cfg):
             )
             chip_args.add_scalar(ctx.nranks)
             chip_args.add_scalar(ctx.device_ctx)
-            orch.submit_next_level(chip_callable, chip_args, cfg, worker=i)
+            orch.submit_next_level(chip_cid, chip_args, cfg, worker=i)
 
         print("[allreduce] running 2-chip allreduce DAG...")
         worker.run(orch_fn, args=None, config=CallConfig())
diff --git a/examples/workers/l3/child_memory/main.py b/examples/workers/l3/child_memory/main.py
index 2dfe3f4e0..b107983fb 100644
--- a/examples/workers/l3/child_memory/main.py
+++ b/examples/workers/l3/child_memory/main.py
@@ -147,6 +147,7 @@ def run(platform: str, device_id: int) -> int:
 
     print(f"[child_memory] compiling kernels for {platform}...")
     chip_callable = build_chip_callable(platform)
+    chip_cid = worker.register(chip_callable)
 
     print("[child_memory] init worker...")
     worker.init()
@@ -172,7 +173,7 @@ def orch_fn(orch, _args, cfg):
             a.add_tensor(make_tensor_arg(host_a), TensorArgType.INPUT)
             a.add_tensor(w_dev, TensorArgType.INPUT)
             a.add_tensor(make_tensor_arg(out), TensorArgType.OUTPUT_EXISTING)
-            orch.submit_next_level(chip_callable, a, cfg, worker=0)
+            orch.submit_next_level(chip_cid, a, cfg, worker=0)
 
         # dev_w is reclaimed by DeviceRunner::finalize on worker.close() —
         # we don't orch.free it here, that's the whole point of child_memory.
diff --git a/examples/workers/l3/ffn_tp_parallel/main.py b/examples/workers/l3/ffn_tp_parallel/main.py
index b41dd561b..aa2bb2d2c 100644
--- a/examples/workers/l3/ffn_tp_parallel/main.py
+++ b/examples/workers/l3/ffn_tp_parallel/main.py
@@ -209,6 +209,8 @@ def run(device_ids: list[int]) -> int:
         num_sub_workers=0,
         chip_bootstrap_configs=cfgs,
     )
+    ffn_cid = worker.register(ffn_local_cc)
+    allreduce_cid = worker.register(allreduce_cc)
 
     try:
         print("[ffn_tp_parallel] init worker (forks chip children + bootstraps HCCL)...")
@@ -231,7 +233,7 @@ def orch_fn(orch, _args, cfg):
             a1.add_tensor(make_tensor_arg(host_x_shards[i]), TensorArgType.INPUT)
             a1.add_tensor(make_tensor_arg(host_w_shards[i]), TensorArgType.INPUT)
             a1.add_tensor(make_tensor_arg(host_partial[i]), TensorArgType.OUTPUT_EXISTING)
-            orch.submit_next_level(ffn_local_cc, a1, cfg, worker=i)
+            orch.submit_next_level(ffn_cid, a1, cfg, worker=i)
 
         # Stage 2: AIV cross-rank sum. Tagging partial_local INPUT
         # with the same buffer.addr makes TensorMap auto-link this
@@ -250,7 +252,7 @@
             )
             a2.add_scalar(ctx.nranks)
             a2.add_scalar(ctx.device_ctx)
-            orch.submit_next_level(allreduce_cc, a2, cfg, worker=i)
+            orch.submit_next_level(allreduce_cid, a2, cfg, worker=i)
 
         print("[ffn_tp_parallel] running 2-chip 2-stage DAG...")
         worker.run(orch_fn, args=None, config=CallConfig())
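The ffn_tp_parallel change above is the clearest picture of cid-based multi-stage submission: stage 2 consumes stage 1's output buffer, and re-tagging that same address as INPUT is what creates the DAG edge. A reduced sketch, with the HCCL scalars and shapes elided and names as stand-ins:

```python
def orch_fn(orch, _args, cfg):
    for i in range(len(device_ids)):          # stand-in rank loop
        a1 = TaskArgs()
        a1.add_tensor(make_tensor_arg(host_x[i]), TensorArgType.INPUT)
        a1.add_tensor(make_tensor_arg(host_partial[i]), TensorArgType.OUTPUT_EXISTING)
        orch.submit_next_level(ffn_cid, a1, cfg, worker=i)

        a2 = TaskArgs()
        # Same buffer address as stage 1's OUTPUT_EXISTING: TensorMap
        # auto-links the tasks, so stage 2 waits for stage 1 on this chip.
        a2.add_tensor(make_tensor_arg(host_partial[i]), TensorArgType.INPUT)
        a2.add_tensor(make_tensor_arg(host_out[i]), TensorArgType.OUTPUT_EXISTING)
        orch.submit_next_level(allreduce_cid, a2, cfg, worker=i)
```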
diff --git a/examples/workers/l3/multi_chip_dispatch/README.md b/examples/workers/l3/multi_chip_dispatch/README.md
index 83ba2f889..7994dcaf5 100644
--- a/examples/workers/l3/multi_chip_dispatch/README.md
+++ b/examples/workers/l3/multi_chip_dispatch/README.md
@@ -10,9 +10,10 @@
 chip outputs. The smallest correct L3 program.
 
 | ------- | ------------------------------ |
 | Shared-memory tensors | `torch.randn(...).share_memory_()` — chip children see the same storage |
 | `TensorArgType` tags | `INPUT` / `OUTPUT_EXISTING` drive DAG dependency tracking |
-| Python SubWorker | `worker.register(fn)` **before** `init()` |
+| ChipCallable id | `chip_cid = worker.register(chip_callable)` **before** `init()` |
+| Python SubWorker | `sub_cid = worker.register(fn)` **before** `init()` |
 | `Worker(level=3)` config | `device_ids=[0, 1]`, `num_sub_workers=1` |
-| Orchestration | `orch.submit_next_level(...)` per chip + `orch.submit_sub(cid, args)` |
+| Orchestration | `orch.submit_next_level(chip_cid, ...)` per chip + `orch.submit_sub(sub_cid, args)` |
 
 ## Layout
 
@@ -66,7 +67,8 @@
 host_b = [torch.randn(...).share_memory_() for _ in device_ids]
 host_out = [torch.zeros(...).share_memory_() for _ in device_ids]
 
 def subworker(sub_args): ...
-sub_cid = worker.register(subworker)  # BEFORE init() — see below
+chip_cid = worker.register(chip_callable)  # ChipCallable: BEFORE init()
+sub_cid = worker.register(subworker)       # Python SubWorker: BEFORE init()
 ```
 
 `share_memory_()` moves the tensor's storage to a `mmap` region. After
@@ -74,9 +76,11 @@
 address, so when the kernel writes to `host_out[i]`, the parent's
 tensor sees it immediately. No explicit copy back.
 
-**`register()` MUST come before `init()`**. `init()` forks child processes;
-the registry is captured by copy-on-write. Anything registered after `init()`
-is invisible to the forked children.
+**`register()` MUST come before `init()`** for *every* callable — both
+the `ChipCallable` dispatched to chips and the Python sub functions.
+`init()` forks child processes; the registry is captured by copy-on-write.
+Anything registered after `init()` is invisible to the forked children,
+and `Worker.register()` at L≥3 raises if called post-init.
 
 ### 2. `init()` — fork + C++ scheduler
 
@@ -93,7 +97,7 @@ def orch_fn(orch, _args, cfg):
         chip_args.add_tensor(make_tensor_arg(host_a[i]), TensorArgType.INPUT)
         chip_args.add_tensor(make_tensor_arg(host_b[i]), TensorArgType.INPUT)
         chip_args.add_tensor(make_tensor_arg(host_out[i]), TensorArgType.OUTPUT_EXISTING)
-        orch.submit_next_level(chip_callable, chip_args, cfg, worker=i)
+        orch.submit_next_level(chip_cid, chip_args, cfg, worker=i)
 
     sub_args = TaskArgs()
     for i in range(len(device_ids)):
diff --git a/examples/workers/l3/multi_chip_dispatch/main.py b/examples/workers/l3/multi_chip_dispatch/main.py
index b92a6fc10..1b5278877 100644
--- a/examples/workers/l3/multi_chip_dispatch/main.py
+++ b/examples/workers/l3/multi_chip_dispatch/main.py
@@ -146,6 +146,9 @@ def subworker(sub_args: TaskArgs) -> None:
     print(f"[multi_chip_dispatch] compiling kernels for {platform}...")
     chip_callable = build_chip_callable(platform)
 
+    # Register the ChipCallable so submit_next_level takes a cid.
+    chip_cid = worker.register(chip_callable)
+
     # --- 5. init() forks chip + sub child processes, starts C++ scheduler.
     print("[multi_chip_dispatch] init worker...")
     worker.init()
@@ -165,7 +168,7 @@ def orch_fn(orch, _args, cfg):
         chip_args.add_tensor(make_tensor_arg(host_a[i]), TensorArgType.INPUT)
         chip_args.add_tensor(make_tensor_arg(host_b[i]), TensorArgType.INPUT)
         chip_args.add_tensor(make_tensor_arg(host_out[i]), TensorArgType.OUTPUT_EXISTING)
-        orch.submit_next_level(chip_callable, chip_args, cfg, worker=i)
+        orch.submit_next_level(chip_cid, chip_args, cfg, worker=i)
 
     # Sub task that depends on both chip outputs. Tagging the two
     # host_out[i] tensors INPUT tells the scheduler to wait for
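The fork/COW contract the README spells out is now enforced in code. A sketch of both sides of it, assuming the `RuntimeError` added to `python/simpler/worker.py` in this patch; callables are stand-ins:

```python
worker = Worker(level=3, device_ids=[0, 1], num_sub_workers=1,
                platform="a2a3", runtime="tensormap_and_ringbuffer")

chip_cid = worker.register(chip_callable)  # visible to forked children (COW)
sub_cid = worker.register(subworker)       # same unified cid space

worker.init()                              # fork point: registry snapshot taken

try:
    worker.register(late_callable)         # children would never see this
except RuntimeError as e:
    print(e)  # "Worker.register() at level >= 3 must be called before init() ..."
```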
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index 344758b78..258e00cec 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -623,34 +623,76 @@ NB_MODULE(_task_interface, m) {
         .def("reset_device", &ChipWorker::reset_device)
         .def("finalize", &ChipWorker::finalize)
         .def(
-            "run",
-            [](ChipWorker &self, const PyChipCallable &callable, ChipStorageTaskArgs &args, const CallConfig &config) {
-                self.run(callable.buffer_.data(), &args, config);
+            "prepare_callable",
+            [](ChipWorker &self, int32_t callable_id, const PyChipCallable &callable) {
+                self.prepare_callable(callable_id, callable.buffer_.data());
             },
-            nb::arg("callable"), nb::arg("args"), nb::arg("config")
+            nb::arg("callable_id"), nb::arg("callable"),
+            "Stage a ChipCallable under callable_id for cheap repeated launches "
+            "via run_prepared. Variants without per-callable_id support raise."
         )
         .def(
-            "run_raw",
-            [](ChipWorker &self, uint64_t callable, uint64_t args, const CallConfig &config) {
-                self.run(reinterpret_cast(callable), reinterpret_cast(args), config);
+            "run_prepared",
+            [](ChipWorker &self, int32_t callable_id, ChipStorageTaskArgs &args, const CallConfig &config) {
+                self.run_prepared(callable_id, &args, config);
             },
-            nb::arg("callable"), nb::arg("args"), nb::arg("config"),
-            "Run with raw pointer arguments (used from forked chip process)."
+            nb::arg("callable_id"), nb::arg("args"), nb::arg("config"),
+            "Launch a callable_id previously staged via prepare_callable."
         )
         .def(
-            "run_from_blob",
-            [](ChipWorker &self, uint64_t callable, uint64_t blob_ptr, const CallConfig &config) {
-                TaskArgsView view = read_blob(reinterpret_cast(blob_ptr), MAILBOX_ARGS_CAPACITY);
-                self.run(callable, view, config);
+            "run_prepared",
+            [](ChipWorker &self, int32_t callable_id, TaskArgs &args, const CallConfig &config) {
+                TaskArgsView view = make_view(args);
+                self.run_prepared(callable_id, view, config);
             },
-            nb::arg("callable"), nb::arg("blob_ptr"), nb::arg("config"),
-            "Decode a length-prefixed TaskArgs blob ([T][S][tensors][scalars]) at "
-            "blob_ptr and dispatch to the runtime. Used from forked chip processes "
-            "reading the WorkerThread mailbox."
+            nb::arg("callable_id"), nb::arg("args"), nb::arg("config"),
+            "Launch a callable_id from a TaskArgs (used for in-process callers)."
+        )
+        .def(
+            "run_prepared_from_blob",
+            [](ChipWorker &self, int32_t callable_id, uint64_t args_blob_ptr, size_t blob_capacity,
+               const CallConfig &config) {
+                // The mailbox region is the on-wire format `write_blob` produced;
+                // `read_blob` is the matching reader that returns a zero-copy
+                // TaskArgsView into the caller-owned bytes. Forwards to the
+                // existing `run_prepared(cid, view, config)` path so chip-child
+                // loops never re-implement the tensor/scalar layout in Python
+                // (where it has historically dropped fields like child_memory).
+                TaskArgsView view = read_blob(reinterpret_cast(args_blob_ptr), blob_capacity);
+                self.run_prepared(callable_id, view, config);
+            },
+            nb::arg("callable_id"), nb::arg("args_blob_ptr"), nb::arg("blob_capacity"), nb::arg("config"),
+            "Launch a callable_id from a raw mailbox-blob pointer + capacity "
+            "(used by chip-child mailbox loops to avoid Python-side re-deserialisation "
+            "of the per-task tensor/scalar layout). The blob must be in the format "
+            "produced by `write_blob`; read_blob enforces capacity bounds against shm corruption."
+        )
+        .def(
+            "unregister_callable",
+            [](ChipWorker &self, int32_t callable_id) {
+                self.unregister_callable(callable_id);
+            },
+            nb::arg("callable_id"),
+            "Drop the prepared state for callable_id; releases the per-id share "
+            "of the device orch SO buffer (kernel binaries stay resident until "
+            "finalize)."
         )
         .def_prop_ro("device_id", &ChipWorker::device_id)
         .def_prop_ro("initialized", &ChipWorker::initialized)
         .def_prop_ro("device_set", &ChipWorker::device_set)
+        .def_prop_ro(
+            "aicpu_dlopen_count", &ChipWorker::aicpu_dlopen_count,
+            "Number of distinct callable_ids the AICPU has dlopened for on the "
+            "bound device. Equals 0 when no device is set or the runtime "
+            "variant lacks per-cid registration. Tests assert this to verify "
+            "prepare_callable + repeated run_prepared do not redundantly dlopen."
+        )
+        .def_prop_ro(
+            "host_dlopen_count", &ChipWorker::host_dlopen_count,
+            "Number of host-side dlopens triggered by prepare_callable on "
+            "host_build_graph variants. Mirrors aicpu_dlopen_count for the "
+            "host-orchestration path; 0 on device-orch variants."
+        )
         .def("malloc", &ChipWorker::malloc, nb::arg("size"))
         .def("free", &ChipWorker::free, nb::arg("ptr"))
         .def("copy_to", &ChipWorker::copy_to, nb::arg("dst"), nb::arg("src"), nb::arg("size"))
diff --git a/python/bindings/worker_bind.h b/python/bindings/worker_bind.h
index f9824980f..00355856a 100644
--- a/python/bindings/worker_bind.h
+++ b/python/bindings/worker_bind.h
@@ -98,20 +98,22 @@ inline void bind_worker(nb::module_ &m) {
     nb::class_<Orchestrator>(m, "_Orchestrator")
         .def(
             "submit_next_level",
-            [](Orchestrator &self, uint64_t callable, const TaskArgs &args, const CallConfig &config, int8_t worker) {
-                return self.submit_next_level(callable, args, config, worker);
+            [](Orchestrator &self, int32_t callable_id, const TaskArgs &args, const CallConfig &config, int8_t worker) {
+                return self.submit_next_level(callable_id, args, config, worker);
             },
-            nb::arg("callable"), nb::arg("args"), nb::arg("config"), nb::arg("worker") = int8_t(-1),
-            "Submit a NEXT_LEVEL (chip) task. worker= pins to a specific next-level worker (-1 = any)."
+            nb::arg("callable_id"), nb::arg("args"), nb::arg("config"), nb::arg("worker") = int8_t(-1),
+            "Submit a NEXT_LEVEL (chip) task by registered callable id. "
+            "worker= pins to a specific next-level worker (-1 = any)."
        )
        .def(
             "submit_next_level_group",
-            [](Orchestrator &self, uint64_t callable, const std::vector<TaskArgs> &args_list, const CallConfig &config,
-               const std::vector<int8_t> &workers) {
-                return self.submit_next_level_group(callable, args_list, config, workers);
+            [](Orchestrator &self, int32_t callable_id, const std::vector<TaskArgs> &args_list,
+               const CallConfig &config, const std::vector<int8_t> &workers) {
+                return self.submit_next_level_group(callable_id, args_list, config, workers);
             },
-            nb::arg("callable"), nb::arg("args_list"), nb::arg("config"), nb::arg("workers") = std::vector<int8_t>{},
-            "Submit a group of NEXT_LEVEL tasks. workers= per-args affinity (empty = any)."
+            nb::arg("callable_id"), nb::arg("args_list"), nb::arg("config"), nb::arg("workers") = std::vector<int8_t>{},
+            "Submit a group of NEXT_LEVEL tasks by registered callable id. "
+            "workers= per-args affinity (empty = any)."
         )
         .def(
             "submit_sub",
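At the `ChipWorker` level the bindings above decompose the old `run` into stage-once / launch-many. A sketch of that lifecycle; construction and binary paths are placeholders (the patch only shows the lifecycle calls), and `chip_callable` / `orch_args` are stand-ins:

```python
from simpler.task_interface import CallConfig, ChipWorker

cw = ChipWorker()  # assumed default construction; not shown in this patch
cw.init(host_path="host.so", aicpu_path="aicpu.so", aicore_path="aicore.o")
cw.set_device(device_id=0)

cw.prepare_callable(0, chip_callable)      # one upload + dlopen, cid in [0, 64)
for _ in range(100):
    cw.run_prepared(0, orch_args, CallConfig(block_dim=24))

assert cw.aicpu_dlopen_count <= 1          # repeated launches, no re-dlopen
cw.unregister_callable(0)                  # release the orch SO share
cw.reset_device()
cw.finalize()
```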
diff --git a/python/simpler/orchestrator.py b/python/simpler/orchestrator.py
index 4a8bec655..29bc84db6 100644
--- a/python/simpler/orchestrator.py
+++ b/python/simpler/orchestrator.py
@@ -17,11 +17,11 @@ def my_orch(orch, args, cfg):
         a = TaskArgs()
         a.add_tensor(make_tensor_arg(input_tensor), TensorArgType.INPUT)
         a.add_tensor(make_tensor_arg(output_tensor), TensorArgType.OUTPUT)
-        orch.submit_next_level(chip_callable, a, cfg)
+        orch.submit_next_level(chip_cid, a, cfg)  # cid from Worker.register(chip_callable)
 
         sub_args = TaskArgs()
         sub_args.add_tensor(make_tensor_arg(output_tensor), TensorArgType.INPUT)
-        orch.submit_sub(cid, sub_args)
+        orch.submit_sub(sub_cid, sub_args)
 
     w.run(my_orch, my_args, my_config)
 
@@ -35,6 +35,7 @@ def my_orch(orch, args, cfg):
 from .task_interface import (
     CallConfig,
+    ChipCallable,
     ContinuousTensor,
     DataType,
     TaskArgs,
@@ -44,11 +45,21 @@ def my_orch(orch, args, cfg):
 )
 
 
-def _resolve_callable_ptr(callable_: Any) -> int:
-    """Accept either a ChipCallable (has buffer_ptr()) or a raw int pointer."""
-    if hasattr(callable_, "buffer_ptr"):
-        return callable_.buffer_ptr()
-    return int(callable_)
+def _require_cid(callable_or_cid: Any, *, kind: str) -> int:
+    """Coerce a submit argument to a registered cid.
+
+    Raises a clear migration error when the caller still passes a
+    ``ChipCallable`` directly — every chip callable must be registered
+    via ``Worker.register(callable)`` *before* ``init()`` so each chip
+    child can pre-warm it on its own device.
+    """
+    if isinstance(callable_or_cid, ChipCallable) or hasattr(callable_or_cid, "buffer_ptr"):
+        raise TypeError(
+            f"{kind} now takes a registered cid, not a ChipCallable. "
+            "Register the callable before init() via "
+            "`cid = worker.register(chip_callable)` and pass `cid` here."
+        )
+    return int(callable_or_cid)
 
 
 class Orchestrator:
@@ -68,18 +79,21 @@ def __init__(self, c_orchestrator: _COrchestrator) -> None:
     # ------------------------------------------------------------------
 
     def submit_next_level(
-        self, callable_: Any, args: TaskArgs, config: Optional[CallConfig] = None, *, worker: int = -1
+        self, callable_id: Any, args: TaskArgs, config: Optional[CallConfig] = None, *, worker: int = -1
     ):
-        """Submit a NEXT_LEVEL (chip) task. Tags inside ``args`` drive deps.
+        """Submit a NEXT_LEVEL (chip) task by registered callable id.
 
+        ``callable_id`` must be the int returned by
+        ``Worker.register(chip_callable)``. Tags inside ``args`` drive deps.
         ``worker``: logical worker id for affinity (-1 = unconstrained).
         """
         cfg = config if config is not None else CallConfig()
-        return self._o.submit_next_level(_resolve_callable_ptr(callable_), args, cfg, int(worker))
+        cid = _require_cid(callable_id, kind="orch.submit_next_level")
+        return self._o.submit_next_level(cid, args, cfg, int(worker))
 
     def submit_next_level_group(
         self,
-        callable_: Any,
+        callable_id: Any,
         args_list: list,
         config: Optional[CallConfig] = None,
         *,
@@ -91,7 +105,8 @@
         """
         cfg = config if config is not None else CallConfig()
         w = [int(x) for x in workers] if workers else []
-        return self._o.submit_next_level_group(_resolve_callable_ptr(callable_), args_list, cfg, w)
+        cid = _require_cid(callable_id, kind="orch.submit_next_level_group")
+        return self._o.submit_next_level_group(cid, args_list, cfg, w)
 
     def submit_sub(self, callable_id: int, args: Optional[TaskArgs] = None):
         """Submit a SUB task by registered callable id.
diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py
index 71ac81122..3dd918b89 100644
--- a/python/simpler/task_interface.py
+++ b/python/simpler/task_interface.py
@@ -240,7 +240,8 @@ class ChipWorker:
                     aicpu_path="build/lib/.../aicpu.so",
                     aicore_path="build/lib/.../aicore.o")
         worker.set_device(device_id=0)
-        worker.run(chip_callable, orch_args, block_dim=24)
+        worker.prepare_callable(callable_id=0, callable=chip_callable)
+        worker.run_prepared(callable_id=0, args=orch_args, config=CallConfig(block_dim=24))
         worker.reset_device()
         worker.finalize()
     """
@@ -305,11 +306,20 @@ def finalize(self):
         """
         self._impl.finalize()
 
-    def run(self, callable, args, config=None, **kwargs):
-        """Execute a callable synchronously.
+    def prepare_callable(self, callable_id, callable):
+        """Stage a ChipCallable under ``callable_id`` for repeated cheap launches.
+
+        Uploads the kernel binaries + the orchestration SO once; subsequent
+        ``run_prepared(callable_id, ...)`` skips that work. ``callable_id``
+        must be in ``[0, 64)``. Requires ``set_device()``.
+        """
+        self._impl.prepare_callable(int(callable_id), callable)
+
+    def run_prepared(self, callable_id, args, config=None, **kwargs):
+        """Launch a ``callable_id`` previously staged via ``prepare_callable``.
 
         Args:
-            callable: ChipCallable built from orchestration + kernel binaries.
+            callable_id: Stable id passed to a prior ``prepare_callable``.
             args: ChipStorageTaskArgs for this invocation.
             config: Optional CallConfig. If None, a default is created.
             **kwargs: Overrides applied to config (e.g. block_dim=24).
@@ -318,16 +328,21 @@
             config = CallConfig()
         for k, v in kwargs.items():
             setattr(config, k, v)
-        self._impl.run(callable, args, config)
+        self._impl.run_prepared(int(callable_id), args, config)
 
-    def run_from_blob(self, callable, blob_ptr, config):
-        """Execute via a serialized args blob in shared memory.
+    def unregister_callable(self, callable_id):
+        """Drop prepared state for ``callable_id`` and release its orch SO share."""
+        self._impl.unregister_callable(int(callable_id))
 
-        Used by `_chip_process_loop` after reading the mailbox: instead of
-        deserializing the args into Python objects, the C++ side parses the
-        POD blob directly at `blob_ptr`.
-        """
-        self._impl.run_from_blob(int(callable), int(blob_ptr), config)
+    @property
+    def aicpu_dlopen_count(self):
+        """Number of distinct callable_ids the AICPU has dlopened for."""
+        return self._impl.aicpu_dlopen_count
+
+    @property
+    def host_dlopen_count(self):
+        """Number of host-side orch SO dlopens (host_build_graph variants)."""
+        return self._impl.host_dlopen_count
 
     def malloc(self, size):
         """Allocate memory. Returns a pointer (uint64)."""
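What the `_require_cid` guard above buys callers: the old calling convention fails fast with a migration message instead of being interpreted as a raw pointer. Illustrative only; `chip_callable`, `chip_cid`, and the empty `TaskArgs` are stand-ins:

```python
def orch_fn(orch, _args, cfg):
    args = TaskArgs()
    try:
        orch.submit_next_level(chip_callable, args, cfg)   # old convention
    except TypeError as e:
        print(e)
        # "orch.submit_next_level now takes a registered cid, not a
        #  ChipCallable. Register the callable before init() via
        #  `cid = worker.register(chip_callable)` and pass `cid` here."
    orch.submit_next_level(chip_cid, args, cfg)            # new convention
```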
diff --git a/python/simpler/worker.py b/python/simpler/worker.py
index 073084dc6..8fc2861ac 100644
--- a/python/simpler/worker.py
+++ b/python/simpler/worker.py
@@ -8,23 +8,31 @@
 # -----------------------------------------------------------------------------------------------------------
 """Worker — unified factory for all hierarchy levels.
 
+Callable identity is a ``cid`` (int), allocated exclusively by
+``Worker.register(callable)``. ``Worker.run`` and the orchestrator's
+``submit_next_level`` / ``submit_sub`` all take this cid — never the raw
+``ChipCallable`` / Python function. L≥3 ``register()`` must run **before**
+``init()`` so forked chip / sub children inherit the registry via COW.
+
 Usage::
 
     # L2: one NPU chip
     w = Worker(level=2, device_id=8, platform="a2a3", runtime="tensormap_and_ringbuffer")
     w.init()
-    w.run(chip_callable, chip_args, config)
+    chip_cid = w.register(chip_callable)  # L2 may register pre or post init()
+    w.run(chip_cid, chip_args, config)
     w.close()
 
     # L3: multiple chips + SubWorkers, auto-discovery in init()
     w = Worker(level=3, device_ids=[8, 9], num_sub_workers=2,
                platform="a2a3", runtime="tensormap_and_ringbuffer")
-    cid = w.register(lambda args: postprocess())
+    chip_cid = w.register(chip_callable)              # ChipCallable, before init()
+    sub_cid = w.register(lambda args: postprocess())  # Python sub, before init()
     w.init()
 
     def my_orch(orch, args, cfg):
-        r = orch.submit_next_level(chip_callable, chip_args_ptr, cfg)
-        orch.submit_sub(cid, sub_args)
+        r = orch.submit_next_level(chip_cid, chip_args_ptr, cfg)
+        orch.submit_sub(sub_cid, sub_args)
 
     w.run(my_orch, my_args, my_config)
     w.close()
@@ -54,7 +62,7 @@ def my_l4_orch(orch, args, config):
 import time
 import traceback
 from multiprocessing.shared_memory import SharedMemory
-from typing import Any, Callable, Optional
+from typing import Any, Optional
 
 from _task_interface import (  # pyright: ignore[reportMissingImports]
     CHIP_BOOTSTRAP_MAILBOX_SIZE,
@@ -62,6 +70,7 @@ def my_l4_orch(orch, args, config):
     ChipBootstrapMailboxState,
     _mailbox_load_i32,
     _mailbox_store_i32,
+    read_args_from_blob,
 )
 
 from . import _log as _simpler_log
@@ -72,10 +81,9 @@ def my_l4_orch(orch, args, config):
     MAILBOX_SIZE,
     CallConfig,
     ChipBootstrapConfig,
+    ChipCallable,
     ChipContext,
     ChipWorker,
-    ContinuousTensor,
-    DataType,
     TaskArgs,
     _Worker,
 )
@@ -86,6 +94,7 @@ def my_l4_orch(orch, args, config):
 _BOOTSTRAP_WAIT_TIMEOUT_S = 120.0
 _BOOTSTRAP_POLL_INTERVAL_S = 0.001
 
+
 # ---------------------------------------------------------------------------
 # Unified mailbox layout (must match worker_manager.h MAILBOX_OFF_*)
 # ---------------------------------------------------------------------------
@@ -127,6 +136,11 @@ def my_l4_orch(orch, args, config):
 _CTRL_FREE = 1
 _CTRL_COPY_TO = 2
 _CTRL_COPY_FROM = 3
+# Pre-warm a chip child for cid=arg0 by calling
+# `prepare_callable(cid, registry[cid])` so the first run_prepared() does
+# not pay the H2D upload cost. Sent from the parent right after init()
+# (or whenever a new ChipCallable cid is registered).
+_CTRL_PREPARE = 4
 
 # Control args layout (reuses task mailbox fields when state == _CONTROL_*):
 #   offset 8 (_OFF_CALLABLE): uint64 sub-command
@@ -190,38 +204,21 @@ def _format_exc(prefix: str, exc: BaseException) -> str:
 def _read_args_from_mailbox(buf) -> TaskArgs:
     """Decode the TaskArgs blob written by C++ write_blob from the mailbox.
 
-    Blob layout at _OFF_ARGS:
-        int32 tensor_count (T), int32 scalar_count (S),
-        ContinuousTensor[T] (40 B each), uint64_t[S] (8 B each).
+    Used by the Python-targeted child loops (sub_worker, nested L4+ child)
+    where the destination of `args` is a Python callable that needs a
+    typed TaskArgs object. The chip-child loops that immediately forward
+    to C++ run_prepared use the zero-copy `run_prepared_from_blob` path
+    instead — see those loops for the matching comment.
+
+    Delegates to the nanobind helper so the ContinuousTensor layout is
+    parsed by C++ `read_blob` (single source of truth) instead of being
+    reimplemented in Python. The Python re-implementation that lived
+    here previously dropped the `child_memory` byte (offset 33), which
+    silently broke any tensor carrying a chip-owned device pointer
+    (HCCL window slots etc.) — now structurally impossible.
     """
-    base = _OFF_ARGS
-    t_count = struct.unpack_from("i", buf, base)[0]
-    s_count = struct.unpack_from("i", buf, base + 4)[0]
-    if t_count < 0 or s_count < 0:
-        raise RuntimeError(f"args blob has negative counts: tensors={t_count}, scalars={s_count}")
-    blob_bytes = 8 + t_count * 40 + s_count * 8
-    if blob_bytes > _MAILBOX_ARGS_CAPACITY:
-        raise RuntimeError(
-            f"args blob ({blob_bytes} bytes) exceeds mailbox capacity ({_MAILBOX_ARGS_CAPACITY} bytes); "
-            f"tensors={t_count}, scalars={s_count} — likely a corrupt header or a writer bug"
-        )
-
-    args = TaskArgs()
-    ct_off = base + 8
-    for i in range(t_count):
-        off = ct_off + i * 40
-        data = struct.unpack_from("Q", buf, off)[0]
-        shapes = struct.unpack_from("5I", buf, off + 8)
-        ndims = struct.unpack_from("I", buf, off + 28)[0]
-        dtype_val = struct.unpack_from("B", buf, off + 32)[0]
-        ct = ContinuousTensor.make(data, tuple(shapes[:ndims]), DataType(dtype_val))
-        args.add_tensor(ct)
-
-    sc_off = ct_off + t_count * 40
-    for i in range(s_count):
-        args.add_scalar(struct.unpack_from("Q", buf, sc_off + i * 8)[0])
-
-    return args
+    mailbox_addr = ctypes.addressof(ctypes.c_char.from_buffer(buf))
+    return read_args_from_blob(mailbox_addr + _OFF_ARGS)
 
 
 def _sub_worker_loop(buf, registry: dict) -> None:
@@ -260,6 +257,7 @@
 def _chip_process_loop(
     buf: memoryview,
     bins,
     device_id: int,
+    registry: dict,
     log_level: int = 1,
     log_info_v: int = 5,
 ) -> None:
@@ -271,6 +269,13 @@
     `log_level` / `log_info_v` are the parent's snapshot of the simpler
     logger (computed via `_log.get_current_config()`); the child cannot
     read the parent's logger after fork, so the values are passed explicitly.
+
+    Per-callable_id dispatch: TASK_READY carries a cid in OFF_CALLABLE; the
+    child looks the cid up in the COW-inherited Python ``registry`` to get
+    the ChipCallable, calls ``cw.prepare_callable(cid, callable)`` once,
+    then ``cw.run_prepared(cid, args, cfg)``. ``_CTRL_PREPARE`` is the
+    explicit pre-warm path (parent pushes after init() to amortise the
+    first H2D upload).
     """
     import traceback as _tb  # noqa: PLC0415
 
@@ -289,20 +294,39 @@
     mailbox_addr = ctypes.addressof(ctypes.c_char.from_buffer(buf))
     state_addr = mailbox_addr + _OFF_STATE
-    args_ptr = mailbox_addr + _OFF_ARGS
 
     sys.stderr.write(f"[chip_process pid={os.getpid()} dev={device_id}] ready\n")
     sys.stderr.flush()
 
+    # Per-child set of cids already prepared on this device. The parent
+    # pre-warms via _CTRL_PREPARE, but TASK_READY also lazy-prepares as a
+    # safety net (e.g. registrations that bypassed the prefetch path).
+    prepared: set[int] = set()
+
+    def _ensure_prepared(cid: int) -> None:
+        if cid in prepared:
+            return
+        callable_obj = registry.get(cid)
+        if callable_obj is None:
+            raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry")
+        cw.prepare_callable(cid, callable_obj)
+        prepared.add(cid)
+
     while True:
         state = _mailbox_load_i32(state_addr)
         if state == _TASK_READY:
-            callable_ptr = struct.unpack_from("Q", buf, _OFF_CALLABLE)[0]
+            cid = int(struct.unpack_from("Q", buf, _OFF_CALLABLE)[0]) & 0xFFFFFFFF
             cfg = _read_config_from_mailbox(buf)
             code = 0
             msg = ""
             try:
-                cw.run_from_blob(callable_ptr, args_ptr, cfg)
+                _ensure_prepared(cid)
+                # Hand the mailbox bytes straight to C++ (zero-copy zero-decode):
+                # the blob layout is what `write_blob` already wrote, so re-parsing
+                # it in Python is N×40B of avoidable work and a permanent
+                # opportunity to drop a field. C++ reinterpret_cast
+                # is the source of truth.
+                cw._impl.run_prepared_from_blob(cid, mailbox_addr + _OFF_ARGS, _MAILBOX_ARGS_CAPACITY, cfg)
             except Exception as e:  # noqa: BLE001
                 code = 1
                 msg = _format_exc(f"chip_process dev={device_id}", e)
@@ -330,6 +354,9 @@
                     src = struct.unpack_from("Q", buf, _CTRL_OFF_ARG1)[0]
                     n = struct.unpack_from("Q", buf, _CTRL_OFF_ARG2)[0]
                     cw.copy_from(dst, src, n)
+                elif sub_cmd == _CTRL_PREPARE:
+                    cid = int(struct.unpack_from("Q", buf, _CTRL_OFF_ARG0)[0]) & 0xFFFFFFFF
+                    _ensure_prepared(cid)
             except Exception as e:  # noqa: BLE001
                 code = 1
                 msg = _format_exc(f"chip_process dev={device_id} ctrl={int(sub_cmd)}", e)
@@ -340,13 +367,14 @@
             break
 
 
-def _chip_process_loop_with_bootstrap(  # noqa: PLR0912
+def _chip_process_loop_with_bootstrap(  # noqa: PLR0912, PLR0915
     buf: memoryview,
     bins,
     device_id: int,
     bootstrap_cfg: ChipBootstrapConfig,
     bootstrap_mailbox_addr: int,
     max_buffer_count: int,
+    registry: dict,
     log_level: int = 1,
     log_info_v: int = 5,
 ) -> None:
@@ -395,21 +423,36 @@
     mailbox_addr = ctypes.addressof(ctypes.c_char.from_buffer(buf))
     state_addr = mailbox_addr + _OFF_STATE
-    args_ptr = mailbox_addr + _OFF_ARGS
 
     sys.stderr.write(f"[chip_process pid={os.getpid()} dev={device_id} bootstrap] ready\n")
     sys.stderr.flush()
 
+    # Per-child set of cids already prepared on this device. Mirrors
+    # `_chip_process_loop`'s `prepared`.
+    prepared: set[int] = set()
+
+    def _ensure_prepared(cid: int) -> None:
+        if cid in prepared:
+            return
+        callable_obj = registry.get(cid)
+        if callable_obj is None:
+            raise RuntimeError(f"chip_process dev={device_id}: cid {cid} not in registry")
+        cw._impl.prepare_callable(cid, callable_obj)
+        prepared.add(cid)
+
     try:
         while True:
             state = _mailbox_load_i32(state_addr)
             if state == _TASK_READY:
-                callable_ptr = struct.unpack_from("Q", buf, _OFF_CALLABLE)[0]
+                cid = int(struct.unpack_from("Q", buf, _OFF_CALLABLE)[0]) & 0xFFFFFFFF
                 cfg = _read_config_from_mailbox(buf)
                 code = 0
                 msg = ""
                 try:
-                    cw._impl.run_from_blob(callable_ptr, args_ptr, cfg)
+                    _ensure_prepared(cid)
+                    # Hand the mailbox bytes straight to C++ (zero-copy zero-decode);
+                    # see the matching comment in `_chip_process_loop`.
+                    cw._impl.run_prepared_from_blob(cid, mailbox_addr + _OFF_ARGS, _MAILBOX_ARGS_CAPACITY, cfg)
                 except Exception as e:  # noqa: BLE001
                     code = 1
                     msg = _format_exc(f"chip_process dev={device_id}", e)
@@ -467,6 +510,9 @@
                         src = struct.unpack_from("Q", buf, _CTRL_OFF_ARG1)[0]
                         n = struct.unpack_from("Q", buf, _CTRL_OFF_ARG2)[0]
                         cw._impl.copy_from(dst, src, n)
+                    elif sub_cmd == _CTRL_PREPARE:
+                        cid = int(struct.unpack_from("Q", buf, _CTRL_OFF_ARG0)[0]) & 0xFFFFFFFF
+                        _ensure_prepared(cid)
                 except Exception as e:  # noqa: BLE001
                     code = 1
                     msg = _format_exc(f"chip_process dev={device_id} ctrl={int(sub_cmd)}", e)
@@ -561,7 +607,7 @@ def __init__(
     ) -> None:
         self.level = level
         self._config = config
-        self._callable_registry: dict[int, Callable] = {}
+        self._callable_registry: dict[int, Any] = {}
         self._initialized = False
 
         # Level-2 internals
@@ -601,14 +647,38 @@
     # ------------------------------------------------------------------
     # Callable registration (before init)
     # ------------------------------------------------------------------
 
-    def register(self, fn: Callable) -> int:
-        """Register a callable (sub or orch fn). Must be called before init()."""
-        if self.level < 3:
-            raise RuntimeError("Worker.register() is only available at level 3+")
-        if self._initialized:
-            raise RuntimeError("Worker.register() must be called before init()")
+    def register(self, target) -> int:
+        """Register a callable. Returns the cid passed to ``run`` / ``submit_*``.
+
+        A unified id space serves Python functions (sub fn / orch fn) and
+        ``ChipCallable`` instances at every level. L2 returns a cid the
+        user passes to ``Worker.run(cid, args, cfg)``; L3+ returns a cid
+        the orch function passes to ``orch.submit_next_level(cid, …)`` /
+        ``orch.submit_sub(cid, …)``.
+
+        Timing constraints:
+        - L3+: must be called **before** ``init()`` so the COW-inherited
+          registry is visible to forked chip / sub children. ChipCallables
+          are pre-warmed by pushing ``_CTRL_PREPARE`` to every chip child
+          during ``init()``.
+        - L2: may be called either before or after ``init()`` (no fork,
+          no COW constraint). When called post-init, ChipCallables are
+          prepared on the device immediately; pre-init registrations are
+          batched and prepared at the end of ``init()``.
+        """
+        if self.level >= 3 and self._initialized:
+            raise RuntimeError(
+                "Worker.register() at level >= 3 must be called before init() "
+                "(forked children inherit the registry via COW)"
+            )
         cid = len(self._callable_registry)
-        self._callable_registry[cid] = fn
+        self._callable_registry[cid] = target
+
+        # L2 post-init: pre-warm immediately so the very first
+        # `Worker.run(cid, …)` is a clean cache hit.
+        if self.level == 2 and self._initialized and isinstance(target, ChipCallable):
+            assert self._chip_worker is not None
+            self._chip_worker.prepare_callable(cid, target)
         return cid
 
     def add_worker(self, worker: "Worker") -> None:
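One consequence of the unified registry worth calling out: cid allocation is positional (`cid = len(self._callable_registry)`), so chip and sub callables share one id space in registration order. A sketch with stand-in callables:

```python
w = Worker(level=3, device_ids=[0, 1], num_sub_workers=1,
           platform="a2a3", runtime="tensormap_and_ringbuffer")
assert w.register(chip_callable) == 0   # ChipCallable -> cid 0
assert w.register(post_fn) == 1         # Python sub fn -> cid 1, same space
w.init()   # every chip child now receives _CTRL_PREPARE for cid 0
```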
@@ -665,6 +735,13 @@ def _init_level2(self) -> None:
         self._chip_worker.init(binaries)
         self._chip_worker.set_device(device_id)
 
+        # Pre-warm any registered ChipCallable so the first run(cid, …)
+        # does not pay the H2D upload cost.
+        assert self._chip_worker is not None
+        for cid, target in self._callable_registry.items():
+            if isinstance(target, ChipCallable):
+                self._chip_worker.prepare_callable(cid, target)
+
     def _init_hierarchical(self) -> None:
         device_ids = self._config.get("device_ids", [])
         n_sub = self._config.get("num_sub_workers", 0)
@@ -778,6 +855,7 @@ def _start_hierarchical(self) -> None:  # noqa: PLR0912 -- three parallel fork l
                     bootstrap_cfg,
                     bootstrap_addr,
                     max_buffer_count,
+                    registry,
                     chip_log_level,
                     chip_log_info_v,
                 )
@@ -786,6 +864,7 @@ def _start_hierarchical(self) -> None:  # noqa: PLR0912 -- three parallel fork l
                     buf,
                     self._l3_bins,
                     dev_id,
+                    registry,
                     chip_log_level,
                     chip_log_info_v,
                 )
@@ -846,6 +925,17 @@ def _start_hierarchical(self) -> None:  # noqa: PLR0912 -- three parallel fork l
 
         self._orch = Orchestrator(dw.get_orchestrator())
 
+        # Pre-warm every chip child: for each registered ChipCallable cid,
+        # send `_CTRL_PREPARE` to all chip children so the first
+        # `submit_next_level` does not pay the H2D upload cost. Sub fns /
+        # orch fns do not need pre-warming — the registry is already
+        # COW-inherited.
+        if device_ids:
+            for cid, target in self._callable_registry.items():
+                if isinstance(target, ChipCallable):
+                    for worker_id in range(len(self._chip_shms)):
+                        self._chip_control(worker_id, _CTRL_PREPARE, arg0=cid)
+
     # ------------------------------------------------------------------
     # Bootstrap plumbing
     # ------------------------------------------------------------------
@@ -1042,16 +1132,21 @@ def copy_from(self, dst: int, src: int, size: int, worker_id: int = 0) -> None:
 
     def run(self, callable, args=None, config=None) -> None:
         """Execute one task (L2) or one DAG (L3+) synchronously.
 
-        callable: ChipCallable (L2) or Python orch fn (L3+)
-        args:     TaskArgs (optional)
-        config:   CallConfig (optional, default-constructed if None)
+        Dispatch:
+        - L2:  ``callable`` is a cid returned by ``Worker.register(chip_callable)``.
+               Routes to ``_chip_worker.run_prepared(cid, args, cfg)``.
+        - L3+: ``callable`` is a Python orch fn invoked with the
+               ``Orchestrator`` handle.
+
+        ``args``  : TaskArgs (optional)
+        ``config``: CallConfig (optional, default-constructed if None)
         """
         assert self._initialized, "Worker not initialized; call init() first"
         cfg = config if config is not None else CallConfig()
 
         if self.level == 2:
             assert self._chip_worker is not None
-            self._chip_worker.run(callable, args, cfg)
+            self._chip_worker.run_prepared(int(callable), args, cfg)
         else:
             self._start_hierarchical()
             assert self._orch is not None
@@ -1075,6 +1170,68 @@ def run(self, callable, args=None, config=None) -> None:
                 self._orch._scope_end()
                 self._orch._drain()
 
+    def prepare_callable(self, callable_id: int, callable) -> None:
+        """L2 only: pre-stage a callable under ``callable_id`` (see
+        ``ChipWorker.prepare_callable``). Subsequent ``run_prepared`` skips
+        per-run kernel/orch SO upload.
+        """
+        assert self._initialized, "Worker not initialized; call init() first"
+        if self.level != 2:
+            raise NotImplementedError("prepare_callable is L2-only")
+        assert self._chip_worker is not None
+        self._chip_worker.prepare_callable(callable_id, callable)
+
+    def run_prepared(self, callable_id: int, args=None, config=None) -> None:
+        """L2 only: launch a callable previously staged via ``prepare_callable``."""
+        assert self._initialized, "Worker not initialized; call init() first"
+        if self.level != 2:
+            raise NotImplementedError("run_prepared is L2-only")
+        assert self._chip_worker is not None
+        cfg = config if config is not None else CallConfig()
+        self._chip_worker.run_prepared(callable_id, args, cfg)
+
+    def unregister_callable(self, callable_id: int) -> None:
+        """L2 only: drop the prepared state for ``callable_id``."""
+        assert self._initialized, "Worker not initialized; call init() first"
+        if self.level != 2:
+            raise NotImplementedError("unregister_callable is L2-only")
+        assert self._chip_worker is not None
+        self._chip_worker.unregister_callable(callable_id)
+
+    @property
+    def aicpu_dlopen_count(self) -> int:
+        """L2 only: number of distinct callable_ids the AICPU has dlopened for.
+
+        Used by tests to assert that ``register`` + repeated ``run(cid)`` calls
+        do not retrigger the AICPU dlopen for an already-seen cid. Returns 0
+        on non-L2 workers (no per-cid registration there).
+        """
+        if self.level != 2 or self._chip_worker is None:
+            return 0
+        return self._chip_worker.aicpu_dlopen_count
+
+    @property
+    def host_dlopen_count(self) -> int:
+        """L2 only: number of host-side orch SO dlopens (hbg variants).
+
+        Mirrors ``aicpu_dlopen_count`` for the host_build_graph path. Returns
+        0 on non-L2 workers or device-orch variants (trb).
+        """
+        if self.level != 2 or self._chip_worker is None:
+            return 0
+        return self._chip_worker.host_dlopen_count
+
+    def _run_as_child(self, cid: int, args, config) -> None:
+        """Called from C++ _Worker::run when this Worker is a THREAD-mode child.
+
+        Looks up the orch function from the callable registry and delegates
+        to ``self.run(orch_fn, args, config)``.
+        """
+        orch_fn = self._callable_registry.get(cid)
+        if orch_fn is None:
+            raise KeyError(f"callable id {cid} not found in registry")
+        self.run(orch_fn, args, config)
+
     # ------------------------------------------------------------------
     # close
     # ------------------------------------------------------------------
diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py
index 54c6519ef..9241bae92 100644
--- a/simpler_setup/scene_test.py
+++ b/simpler_setup/scene_test.py
@@ -918,6 +918,15 @@ def _run_and_validate_l2(
         config_dict = case.get("config", {})
         orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", [])
 
+        # The L2 entry point is `Worker.run(cid, args, cfg)`. Reuse the
+        # cid registered by the st_worker fixture / standalone path; on
+        # first use (one worker is reused across rounds) register here
+        # and cache the cid on the test class so later rounds skip
+        # re-registration.
+        cid = getattr(type(self), "_st_l2_cid", None)
+        if cid is None:
+            cid = worker.register(callable_obj)
+            type(self)._st_l2_cid = cid
+
         # Build args
         test_args = self.generate_args(params)
         chip_args, output_names = _build_chip_task_args(test_args, orch_sig)
@@ -949,7 +958,7 @@ def _run_and_validate_l2(
         )
 
         with _temporary_env(self._resolve_env()):
-            worker.run(callable_obj, chip_args, config=config)
+            worker.run(cid, chip_args, config=config)
 
         if not skip_golden:
             _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL)
@@ -1057,6 +1066,11 @@ def test_run(self, st_platform, st_worker, request):
         cls_name = type(self).__name__
         callable_obj = self.build_callable(st_platform)
         sub_ids = getattr(type(self), "_st_sub_ids", {})
+        # For L3, use pre-registered chip cids instead of raw ChipCallable
+        # objects.
+        chip_cids = getattr(type(self), "_st_chip_cids", {})
+        if self._st_level == 3 and chip_cids:
+            callable_obj = {**chip_cids}
 
         # Primary device id: prefer the one actually allocated by st_worker
         # (each test class can hold a different slot from DevicePool); fall back
@@ -1319,12 +1333,19 @@ def run_module(module_name):  # noqa: PLR0912, PLR0915 -- CLI parsing + dispatch
     ok = True
     for (runtime, level), group in by_rt_level.items():
         print(f"\n=== Runtime: {runtime} Level: {level} ===")
-        worker, per_class_sub_ids = _create_standalone_worker(group, level, args, selected_by_cls)
+        worker, per_class_sub_ids, per_class_chip_cids = _create_standalone_worker(
+            group, level, args, selected_by_cls
+        )
         try:
             for cls in group:
                 inst = cls()
                 callable_obj = inst.build_callable(args.platform)
                 sub_ids = per_class_sub_ids.get(cls, {})
+                chip_cids = per_class_chip_cids.get(cls, {})
+                # For L3: merge chip cids into callable_obj (replacing
+                # ChipCallable objects with their registered cid).
+                if level == 3 and chip_cids:
+                    callable_obj = {**chip_cids}
                 for case in selected_by_cls[cls]:
                     label = f"{cls.__name__}::{case['name']}"
                     print(f"  {label} ... ", end="", flush=True)
@@ -1557,11 +1578,15 @@ def _create_standalone_worker(group, level, args, selected_by_cls):
     ``max_sub_workers`` must be computed from these, not from ``cls.CASES``:
     otherwise a manual case with a larger ``device_count`` inflates the
     allocation even when it isn't scheduled.
+
+    Returns ``(worker, per_class_sub_ids, per_class_chip_cids)`` for both
+    L2 and L3 so the caller can unpack uniformly. L2 has neither sub
+    callables nor pre-registered chip callables, so both dicts are empty.
     """
     first_cls = group[0]
     build = getattr(args, "build", False)
     if level == 2:
-        return first_cls._create_worker(args.platform, args.device, build=build), {}
+        return first_cls._create_worker(args.platform, args.device, build=build), {}, {}
 
     from simpler.worker import Worker  # noqa: PLC0415
 
@@ -1590,12 +1615,24 @@ def _create_standalone_worker(group, level, args, selected_by_cls):
     )
     # Register sub callables per-class to avoid name collisions
     per_class_sub_ids: dict[type, dict] = {}
+    # Also register ChipCallables here (before init) so the chip children
+    # pre-warm them via _CTRL_PREPARE.
+    per_class_chip_cids: dict[type, dict] = {}
     for cls in group:
         cls_sub_ids = {}
+        cls_chip_cids = {}
         for entry in cls.CALLABLE.get("callables", []):
             if "callable" in entry:
                 cid = worker.register(entry["callable"])
                 cls_sub_ids[entry["name"]] = cid
+            elif "orchestration" in entry:
+                name = entry["name"]
+                cache_key = (cls.__qualname__, name, args.platform, cls._st_runtime)
+                chip = _compile_chip_callable_from_spec(entry, args.platform, cls._st_runtime, cache_key)
+                cid = worker.register(chip)
+                cls_chip_cids[name] = cid
+                cls_chip_cids[f"{name}_sig"] = entry["orchestration"].get("signature", [])
         per_class_sub_ids[cls] = cls_sub_ids
+        per_class_chip_cids[cls] = cls_chip_cids
 
     worker.init()
-    return worker, per_class_sub_ids
+    return worker, per_class_sub_ids, per_class_chip_cids
diff --git a/src/a2a3/platform/include/aicpu/orch_so_file.h b/src/a2a3/platform/include/aicpu/orch_so_file.h
index a305ab8fa..29318f5ea 100644
--- a/src/a2a3/platform/include/aicpu/orch_so_file.h
+++ b/src/a2a3/platform/include/aicpu/orch_so_file.h
@@ -39,10 +39,15 @@
  * Caller is expected to try the next candidate directory.
  *
  * @param dir            Candidate directory (e.g. "/tmp")
+ * @param callable_id    Per-callable_id table slot id (>= 0). Required for
+ *                       uniqueness on the onboard path so concurrently-
+ *                       resident orch SOs (one per cid) do not collide on
+ *                       the same on-disk file. Pass -1 for the legacy
+ *                       single-slot dispatch path.
 * @param out_path       Buffer that receives the full file path on success
 * @param out_path_size  Size of `out_path` in bytes
 * @return Open writable fd on success, -1 on failure
 */
-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size);
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size);
 
 #endif  // PLATFORM_AICPU_ORCH_SO_FILE_H_
diff --git a/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp b/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp
index 322cb7dcc..4e7f55232 100644
--- a/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp
+++ b/src/a2a3/platform/onboard/aicpu/orch_so_file.cpp
@@ -15,10 +15,20 @@
 
 #include
 
-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) {
-  // Pid-based naming: AICPU device libc may lack mkstemps, and only one
-  // runtime runs per device process, so pid uniqueness is sufficient.
-  int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) {
+  // Pid + callable_id naming: AICPU device libc may lack mkstemps. With
+  // per-callable_id dispatch, multiple orch SOs can be resident in the
+  // same device process at once (one per cid in `orch_so_table_`), so
+  // the on-disk file name must be unique per cid — otherwise the
+  // second cid's `O_TRUNC` would silently shred the first cid's already
+  // dlopen'd file image and the next launch on cid=0 would SIGBUS.
+  // callable_id < 0 is the legacy single-slot path: pid alone is fine.
+  int32_t written;
+  if (callable_id >= 0) {
+    written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d_%d.so", dir, getpid(), callable_id);
+  } else {
+    written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+  }
   if (written < 0 || static_cast<size_t>(written) >= out_path_size) {
     return -1;
   }
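The same naming rule rendered in Python, purely for illustration (the authoritative implementation is the C++ above):

```python
import os

def orch_so_path(tmp_dir: str, callable_id: int) -> str:
    pid = os.getpid()
    if callable_id >= 0:   # per-cid slot: name must be unique per id
        return f"{tmp_dir}/libdevice_orch_{pid}_{callable_id}.so"
    return f"{tmp_dir}/libdevice_orch_{pid}.so"   # legacy single-slot path

# Two concurrently resident cids never share a path; a shared path would let
# the second open(O_TRUNC) shred the first cid's dlopen'd image.
assert orch_so_path("/tmp", 0) != orch_so_path("/tmp", 1)
```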
- runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); return 0; } @@ -761,7 +801,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) { LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size); - runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/false); + runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size); return 0; } @@ -797,11 +837,174 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { } cached_orch_so_hash_ = new_hash; - runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/true); + runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size); LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size); return 0; } +int DeviceRunner::register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, + std::vector<std::pair<int, uint64_t>> kernel_addrs +) { + // The AICPU executor reserves `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]` + // (declared in src/common/task_interface/callable_protocol.h) and indexes it by + // callable_id; rejecting an out-of-range id here keeps the host and + // AICPU sides in sync and avoids an OOB access at run time. + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (orch_so_data == nullptr || orch_so_size == 0) { + LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id); + return -1; + } + + const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size); + + // Hash dedup: share device buffer across callable_ids that carry the same + // SO bytes. Refcount drops in unregister_prepared_callable; we only free + // when the count hits zero. + auto buf_it = orch_so_dedup_.find(hash); + uint64_t dev_addr = 0; + if (buf_it == orch_so_dedup_.end()) { + void *buf = mem_alloc_.alloc(orch_so_size); + if (buf == nullptr) { + LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size); + return -1; + } + int rc = rtMemcpy(buf, orch_so_size, orch_so_data, orch_so_size, RT_MEMCPY_HOST_TO_DEVICE); + if (rc != 0) { + LOG_ERROR("register_prepared_callable: rtMemcpy failed: %d", rc); + mem_alloc_.free(buf); + return rc; + } + OrchSoBuffer entry; + entry.dev_addr = buf; + entry.capacity = orch_so_size; + entry.refcount = 1; + orch_so_dedup_.emplace(hash, entry); + dev_addr = reinterpret_cast<uint64_t>(buf); + LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size); + } else { + buf_it->second.refcount++; + dev_addr = reinterpret_cast<uint64_t>(buf_it->second.dev_addr); + LOG_INFO_V0( + "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount + ); + } + + PreparedCallableState state; + state.hash = hash; + state.dev_orch_so_addr = dev_addr; + state.dev_orch_so_size = orch_so_size; + state.func_name = (func_name != nullptr) ? func_name : ""; + state.config_name = (config_name != nullptr) ? config_name : ""; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + return 0; +} + +int DeviceRunner::register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector<std::pair<int, uint64_t>> kernel_addrs +) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id, + MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) { + LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id); + return -1; + } + + PreparedCallableState state; + state.host_dlopen_handle = host_dlopen_handle; + state.host_orch_func_ptr = host_orch_func_ptr; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + ++host_dlopen_total_; + LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_); + return 0; +} + +int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + return 0; + } + PreparedCallableState state = std::move(it->second); + prepared_callables_.erase(it); + aicpu_seen_callable_ids_.erase(callable_id); + + if (state.host_dlopen_handle != nullptr) { + // hbg path: no orch SO refcount, just dlclose the host handle. + dlclose(state.host_dlopen_handle); + return 0; + } + + auto buf_it = orch_so_dedup_.find(state.hash); + if (buf_it != orch_so_dedup_.end()) { + if (--buf_it->second.refcount <= 0) { + mem_alloc_.free(buf_it->second.dev_addr); + orch_so_dedup_.erase(buf_it); + } + } + return 0; +} + +bool DeviceRunner::has_prepared_callable(int32_t callable_id) const { + return prepared_callables_.count(callable_id) != 0; +} + +int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id); + return -1; + } + const auto &state = it->second; + + // Replay kernel addresses directly into runtime->func_id_to_addr_ without + // going through set_function_bin_addr. The latter records func_ids in + // registered_kernel_func_ids_, which validate_runtime_impl iterates to + // free kernel binaries — but prepared kernels must survive across runs. + for (const auto &kv : state.kernel_addrs) { + if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); + return -1; + } + runtime.replay_function_bin_addr(kv.first, kv.second); + } + // Replay both paths unconditionally — the runtime carries staging fields + // for both trb (device-side dlopen via entry-symbol names) and hbg (host- + // side dlopen handle + fn ptr). Whichever set was populated by + // register_prepared_callable / register_prepared_callable_host_orch wins; + // the other set stays at its initial value (empty string / nullptr).
+ runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; + runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; + runtime.set_device_orch_func_name(state.func_name.c_str()); + runtime.set_device_orch_config_name(state.config_name.c_str()); + // Stamp callable_id with is_new=false; prepare_orch_so refreshes the flag + // with the authoritative first_sighting answer right before launch. + runtime.set_active_callable_id(callable_id, /*is_new=*/false); + return 0; +} + int DeviceRunner::finalize() { if (device_id_ == -1) { return 0; @@ -821,17 +1024,27 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); - // Kernel binaries should have been removed by validate_runtime_impl() + // Kernel binaries are normally released by validate_runtime_impl on the + // legacy run() path. The prepared-callable path intentionally leaves + // them resident across runs (shared by func_id) and relies on + // finalize() to reclaim them; that is not a leak. Emit at DEBUG so the + // legacy regression signal is preserved for callers that never went + // through prepare_callable. if (!func_id_to_addr_.empty()) { - LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size()); - // Cleanup leaked binaries to prevent memory leaks + const bool prepared_path_used = prepared_callable_path_used_; + if (prepared_path_used) { + LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); + } else { + LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size()); + } for (const auto &pair : func_id_to_addr_) { void *gm_addr = reinterpret_cast(pair.second); mem_alloc_.free(gm_addr); - LOG_DEBUG("Freed leaked kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second); + LOG_DEBUG("Freed kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second); } } func_id_to_addr_.clear(); + func_id_to_hash_.clear(); binaries_loaded_ = false; // Release the cached orchestration SO buffer. @@ -844,6 +1057,29 @@ int DeviceRunner::finalize() { host_orch_so_copy_.clear(); host_orch_so_copy_.shrink_to_fit(); + // Release any prepared-callable orch SO buffers that callers forgot to + // unregister. Refcounts no longer matter at this point — the device is + // about to be reset. + for (auto &kv : orch_so_dedup_) { + if (kv.second.dev_addr != nullptr) { + mem_alloc_.free(kv.second.dev_addr); + } + } + orch_so_dedup_.clear(); + // hbg path: dlclose any host orch handles callers forgot to unregister. + // finalize() is the last chance; Worker.close() does not auto-unregister + // each callable_id, so without this loop the host process leaks one + // dlopen handle per (re)created Worker — observable in long-running + // pytest sessions. + for (auto &kv : prepared_callables_) { + if (kv.second.host_dlopen_handle != nullptr) { + dlclose(kv.second.host_dlopen_handle); + } + } + prepared_callables_.clear(); + aicpu_seen_callable_ids_.clear(); + aicpu_dlopen_total_ = 0; + // Cleanup performance profiling if (l2_perf_collector_.is_initialized()) { auto unregister_cb = [](void *dev_ptr, int device_id) -> int { @@ -1008,11 +1244,24 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data return 0; } - // Return cached callable address if already uploaded + // Return cached callable address if already uploaded *and* the new bytes + // match. 
With the prepared-callable path, multiple ChipCallables share a + // single ChipWorker (and DeviceRunner) and can pick distinct kernel + // binaries for the same func_id. Naively reusing the cached entry hands + // the AICore the previous callable's kernel: dispatch never completes + // the new task and the AICPU spins forever. + const uint64_t new_hash = simpler::common::utils::elf_build_id_64(bin_data, bin_size); auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { - LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id); - return it->second; + auto hash_it = func_id_to_hash_.find(func_id); + if (hash_it != func_id_to_hash_.end() && hash_it->second == new_hash) { + LOG_INFO_V0("Kernel func_id=%d already uploaded (matching hash), returning cached address", func_id); + return it->second; + } + LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id); + mem_alloc_.free(reinterpret_cast(it->second)); + func_id_to_addr_.erase(it); + func_id_to_hash_.erase(func_id); } LOG_DEBUG("Uploading kernel binary: func_id=%d, size=%zu bytes", func_id, bin_size); @@ -1042,6 +1291,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data } func_id_to_addr_[func_id] = callable_addr; + func_id_to_hash_[func_id] = new_hash; LOG_DEBUG(" func_id=%d -> callable_addr=0x%lx, binary_code_addr=0x%lx", func_id, callable_addr, binary_code_addr); @@ -1059,6 +1309,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) { mem_alloc_.free(gm_addr); func_id_to_addr_.erase(it); + func_id_to_hash_.erase(func_id); LOG_DEBUG("Removed kernel binary: func_id=%d, addr=0x%lx", func_id, function_bin_addr); } diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 8dd4dc816..c910e47c5 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include "common/kernel_args.h" @@ -420,6 +422,99 @@ class DeviceRunner { */ void release_run_context(); + /** + * Stage a per-callable_id orchestration SO into device memory and remember + * the supporting metadata (entry/config symbol names, kernel func_id ↔ + * dev_addr table). Identical SO bytes across two callable_ids share one + * device buffer (refcounted by hash) so the worst case for an N-cid pool + * is N distinct device buffers, not N copies of the same SO. + * + * @param callable_id Caller-stable id, must be in [0, MAX_REGISTERED_CALLABLE_IDS). + * @param orch_so_data Host pointer to orchestration SO bytes (owned by caller). + * @param orch_so_size Size of orchestration SO in bytes. + * @param func_name Entry symbol name (copied). + * @param config_name Config symbol name (copied). + * @param kernel_addrs func_id ↔ dev_addr pairs already uploaded by the + * caller. Stored verbatim so run_prepared can replay + * them onto a fresh Runtime without re-uploading. + * @return 0 on success, negative on failure. + */ + int register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, + const char *config_name, std::vector> kernel_addrs + ); + + /** + * Host-orchestration variant of register_prepared_callable: stores a + * dlopen handle + entry-symbol pointer that runtime_maker resolved on the + * host (host_build_graph variant). 
Mutually exclusive with the trb-shaped + * `register_prepared_callable` overload — exactly one is invoked for a + * given callable_id, picked by the C ABI based on which staging fields the + * runtime carries after prepare_callable_impl. dlopen handle is owned by + * DeviceRunner from this call onward and dlclose'd by + * unregister_prepared_callable. Increments `host_dlopen_count_`. + */ + int register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs + ); + + /** + * Drop the prepared state for `callable_id`. trb path: decrement the orch + * SO buffer's hash-keyed refcount and free when it hits zero. hbg path: + * dlclose the host dlopen handle. Kernel binaries are shared across + * callables and only released by finalize(). + * + * @param callable_id Id previously passed to one of the + * register_prepared_callable* overloads. + * @return 0 on success or if the id was not registered. + */ + int unregister_prepared_callable(int32_t callable_id); + + /** + * True iff `callable_id` has prepared state staged via + * register_prepared_callable. Lets the c_api layer reject `run_prepared` + * calls without a matching `prepare_callable`. + */ + bool has_prepared_callable(int32_t callable_id) const; + + /** + * Replay the prepared state for `callable_id` onto a freshly-constructed + * Runtime: restores kernel func_id ↔ dev_addr table, the orch entry/config + * symbol names, and stamps `runtime.set_active_callable_id` so the + * subsequent `run` dispatches via the AICPU per-cid table. The kernel + * addresses are written directly into func_id_to_addr_ (bypassing + * registered_kernel_func_ids_) so validate_runtime_impl will not free them + * — they survive until unregister_prepared_callable / finalize(). + * + * Marks the cid as seen so the upcoming prepare_orch_so resolves + * `register_new_callable_id_` correctly (true exactly on first sighting + * after registration). + * + * @return 0 on success, -1 if the cid is not registered. + */ + int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); + + /** + * Number of distinct callable_ids the AICPU has been asked to dlopen for. + * Monotonically increases on every first-sighting bind; `unregister_callable` + * does NOT decrement it. So a `prepare → run → unregister → re-prepare → run` + * sequence reports 2 (each AICPU dlopen counted once), even though only one + * cid is currently registered. Tests assert this to verify per-cid + * registration eliminates duplicate dlopens across repeated runs. + */ + size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + + /** + * Number of host-side dlopen() invocations triggered by + * `register_prepared_callable_host_orch`. Mirrors `aicpu_dlopen_count` but + * counts the host_build_graph variant's host-side dlopens; it never + * decrements (re-prepare after unregister still counts). Tests assert + * `host_dlopen_count == distinct_registered_cids` to verify the prepared + * path doesn't dlopen on every run. 
+ */ + size_t host_dlopen_count() const { return host_dlopen_total_; } + private: // Internal state int device_id_{-1}; @@ -441,6 +536,10 @@ class DeviceRunner { // Kernel binary management bool binaries_loaded_{false}; // true after AICPU SO loaded std::map<int, uint64_t> func_id_to_addr_; // func_id -> function_bin_addr (device GM) + // Parallel hash map for upload_kernel_binary() to detect when the same + // func_id is re-uploaded with different binary bytes (different + // ChipCallable sharing the same func_id under the per-callable_id path). + std::map<int, uint64_t> func_id_to_hash_; // Orchestration SO cache. `cached_orch_so_hash_ == 0` means "no cache". // The device buffer grows monotonically — cache miss with a larger SO @@ -451,6 +550,50 @@ size_t dev_orch_so_capacity_{0}; std::vector<uint8_t> host_orch_so_copy_; + // Per-callable_id prepared state. + // + // `prepared_callables_` maps the caller-stable callable_id to the orch + // SO slice + symbol names needed to launch it. `orch_so_dedup_` shares + // device buffers across callable_ids whose orch SO bytes have the same + // ELF Build-ID hash (refcounted; freed when the count hits zero). + // `aicpu_seen_callable_ids_` tracks which ids have already been delivered + // to the AICPU at least once so prepare_orch_so can set + // register_new_callable_id_ correctly on first sighting. + struct PreparedCallableState { + // trb path (AICPU dlopens orch SO from device buffer) + uint64_t hash{0}; + uint64_t dev_orch_so_addr{0}; + size_t dev_orch_so_size{0}; + std::string func_name; + std::string config_name; + // common + std::vector<std::pair<int, uint64_t>> kernel_addrs; + // hbg path (host already dlopen'd the orch SO) + void *host_dlopen_handle{nullptr}; + void *host_orch_func_ptr{nullptr}; + }; + struct OrchSoBuffer { + void *dev_addr{nullptr}; + size_t capacity{0}; + int refcount{0}; + }; + std::unordered_map<int32_t, PreparedCallableState> prepared_callables_; + std::unordered_map<uint64_t, OrchSoBuffer> orch_so_dedup_; + std::unordered_set<int32_t> aicpu_seen_callable_ids_; + // Monotonic count of AICPU dlopens triggered (incremented on each + // first-sighting bind; never decremented). Diverges from + // aicpu_seen_callable_ids_.size() once any cid is unregistered and + // re-prepared. Exposed via aicpu_dlopen_count() for tests. + size_t aicpu_dlopen_total_{0}; + // Monotonic count of host-side dlopens triggered (incremented on every + // register_prepared_callable_host_orch call; never decremented). Same + // re-prepare semantics as aicpu_dlopen_total_, but for hbg variants. + size_t host_dlopen_total_{0}; + // Sticky flag: prepare_callable was called at least once. Distinguishes + // legacy-path "kernel still cached at finalize" leaks from prepared-path + // kernels that legitimately live until finalize. + bool prepared_callable_path_used_{false}; + // ACL lifecycle (process-wide). aclInit must run exactly once; ensure_acl_ready // gates it behind this flag. finalize() drives aclFinalize only if we observed // acl_ready_, so runtimes that never ask for ACL (e.g. pure rt-layer) stay unaffected. @@ -498,8 +641,8 @@ class DeviceRunner { ); /** - * Populate runtime.{dev_orch_so_addr_, dev_orch_so_size_, has_new_orch_so_} - * from `runtime.pending_orch_so_data_` / `_size_`. + * Populate runtime.{dev_orch_so_addr_, dev_orch_so_size_} from + * `runtime.pending_orch_so_data_` / `_size_`. * * The host tracks the SO identity via a 64-bit hash derived from the ELF * GNU Build-ID.
When the hash matches the previous run, the device-side diff --git a/src/a2a3/platform/onboard/host/host_regs.cpp b/src/a2a3/platform/onboard/host/host_regs.cpp index 0a90e4b07..f519392e1 100644 --- a/src/a2a3/platform/onboard/host/host_regs.cpp +++ b/src/a2a3/platform/onboard/host/host_regs.cpp @@ -135,8 +135,11 @@ get_aicore_reg_info(std::vector<uint64_t> &aic, std::vector<uint64_t> &aiv, const /** * Get one flat AIC-then-AIV address array for the requested register kind. - * Returns a negative code on HAL failure; does NOT generate placeholder - * addresses (callers must treat failure as fatal for that kind). + * For Ctrl kind, falls back to placeholder addresses on HAL failure to + * preserve historical behavior on hardware where halMemCtl rejects + * ADDR_MAP_TYPE_REG_AIC_CTRL queries (the dispatch path does not actually + * dereference these addresses). For Pmu kind, propagates the HAL error so + * the caller can disable PMU collection cleanly. */ static int get_aicore_regs(std::vector<uint64_t> &regs, uint64_t device_id, AicoreRegKind kind) { std::vector<uint64_t> aic; @@ -144,8 +147,19 @@ static int get_aicore_regs(std::vector<uint64_t> &regs, uint64_t device_id, Aicor int rc = get_aicore_reg_info(aic, aiv, kind_to_addr_type(kind), device_id); if (rc != 0) { - LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d", kind_to_name(kind), rc); - return rc; + if (kind == AicoreRegKind::Ctrl) { + LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d, using placeholder addresses", kind_to_name(kind), rc); + aic.clear(); + aiv.clear(); + for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) { + aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000)); + aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000); + aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000); + } + } else { + LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d", kind_to_name(kind), rc); + return rc; + } } // AIC cores first, then AIV cores diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index e4b7d3b20..c647f4887 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -21,6 +21,8 @@ #include "task_args.h" #include + +#include #include #include "common/unified_log.h" @@ -39,7 +41,8 @@ extern "C" { /* =========================================================================== * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); +int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); +int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); /* =========================================================================== @@ -195,16 +198,137 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, +int finalize_device(DeviceContextHandle ctx) { + if (ctx == NULL) return -1; + try { + return static_cast<DeviceRunner *>(ctx)->finalize(); + } catch (...)
{ + return -1; + } +} + +/* =========================================================================== + * Internal helpers called from runtime_maker.cpp via Runtime.host_api + * =========================================================================== */ + +void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { + if (runtime == NULL) return; + Runtime *r = static_cast<Runtime *>(runtime); + r->record_tensor_pair(host_ptr, dev_ptr, size); +} + +void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { + if (ctx == NULL) return; + + // CANN dlog: derive from simpler logger choice unless ASCEND_GLOBAL_LOG_LEVEL + // is externally configured. + if (std::getenv("ASCEND_GLOBAL_LOG_LEVEL") == NULL) { + dlog_setlevel(-1, log_level, /*enableEvent*/ 0); + } + + HostLogger::get_instance().set_level(static_cast(log_level)); + HostLogger::get_instance().set_info_v(log_info_v); + + DeviceRunner *runner = static_cast<DeviceRunner *>(ctx); + runner->set_log_level(log_level); + runner->set_log_info_v(log_info_v); +} + +/* =========================================================================== + * Per-callable_id preparation + * =========================================================================== */ + +int prepare_callable( + DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, + size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size +) { + if (ctx == NULL || callable == NULL) return -1; + DeviceRunner *runner = static_cast<DeviceRunner *>(ctx); + + // AICPU/AICore executor binaries are only consumed by run()/run_prepared(); + // prepare_callable just uploads kernel + orch SO state. + (void)aicpu_binary; + (void)aicpu_size; + (void)aicore_binary; + (void)aicore_size; + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + auto tsd_guard = RAIIScopeGuard([]() { + pthread_setspecific(g_runner_key, nullptr); + }); + + try { + int rc = runner->prepare_run_context(device_id); + if (rc != 0) return rc; + auto run_context_guard = RAIIScopeGuard([runner]() { + runner->release_run_context(); + }); + + // Heap-allocate: hbg's Runtime carries 131072 Tasks → tens of MB, + // larger than the default thread stack. + std::unique_ptr<Runtime> r_owner = std::make_unique<Runtime>(); + Runtime *r = r_owner.get(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + rc = prepare_callable_impl(r, reinterpret_cast<const ChipCallable *>(callable)); + if (rc != 0) { + return rc; + } + + // Extract kernel func_id ↔ dev_addr pairs uploaded by prepare_callable_impl. + std::vector<std::pair<int, uint64_t>> kernel_addrs; + int kcount = r->get_registered_kernel_count(); + kernel_addrs.reserve(kcount); + for (int i = 0; i < kcount; i++) { + int fid = r->get_registered_kernel_func_id(i); + kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid)); + } + // Clear registered kernels so the Runtime destructor (or any accidental + // validate call) does NOT free the kernel binaries we just uploaded — + // they belong to the prepared state now.
+ r->clear_registered_kernels(); + + // Pick the path by inspecting which staging fields the runtime carries: + // hbg's prepare_callable_impl populates pending_host_dlopen_handle_; + // trb's leaves it null and instead populates pending_orch_so_data_ + + // device_orch_func_name_/config_name_. + if (r->pending_host_dlopen_handle_ != nullptr) { + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; + } else { + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + } + return rc; + } catch (...) { + return -1; + } +} + +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix ) { if (ctx == NULL || runtime == NULL) return -1; - if (aicpu_binary == NULL || aicpu_size == 0 || aicore_binary == NULL || aicore_size == 0) return -1; - DeviceRunner *runner = static_cast(ctx); + if (!runner->has_prepared_callable(callable_id)) { + LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id); + return -1; + } + pthread_once(&g_runner_key_once, create_runner_key); pthread_setspecific(g_runner_key, ctx); auto tsd_guard = RAIIScopeGuard([]() { @@ -226,11 +350,15 @@ int run_runtime( r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; - LOG_DEBUG("About to call init_runtime_impl, r=%p", (void *)r); - rc = init_runtime_impl( - r, reinterpret_cast(callable), reinterpret_cast(args) - ); - LOG_DEBUG("init_runtime_impl returned: %d", rc); + // Restore kernel addrs + orch symbol names + active_callable_id + rc = runner->bind_prepared_callable_to_runtime(*r, callable_id); + if (rc != 0) { + r->~Runtime(); + return rc; + } + + // Per-run binding (tensor args, GM heap, SM alloc) + rc = bind_prepared_to_runtime_impl(r, reinterpret_cast(args)); if (rc != 0) { r->set_gm_sm_ptr(nullptr); validate_runtime_impl(r); @@ -260,40 +388,31 @@ int run_runtime( } } -int finalize_device(DeviceContextHandle ctx) { +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { if (ctx == NULL) return -1; try { - return static_cast(ctx)->finalize(); + return static_cast(ctx)->unregister_prepared_callable(callable_id); } catch (...) { return -1; } } -/* =========================================================================== - * Internal helpers called from runtime_maker.cpp via Runtime.host_api - * =========================================================================== */ - -void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { - if (runtime == NULL) return; - Runtime *r = static_cast(runtime); - r->record_tensor_pair(host_ptr, dev_ptr, size); +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->aicpu_dlopen_count(); + } catch (...) 
{ + return 0; + } } -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { - if (ctx == NULL) return; - - // CANN dlog: derive from simpler logger choice unless ASCEND_GLOBAL_LOG_LEVEL - // is externally configured. - if (std::getenv("ASCEND_GLOBAL_LOG_LEVEL") == NULL) { - dlog_setlevel(-1, log_level, /*enableEvent*/ 0); +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; } - - HostLogger::get_instance().set_level(static_cast(log_level)); - HostLogger::get_instance().set_info_v(log_info_v); - - DeviceRunner *runner = static_cast(ctx); - runner->set_log_level(log_level); - runner->set_log_info_v(log_info_v); } } // extern "C" diff --git a/src/a2a3/platform/sim/aicpu/orch_so_file.cpp b/src/a2a3/platform/sim/aicpu/orch_so_file.cpp index 4da92d7de..114fe4826 100644 --- a/src/a2a3/platform/sim/aicpu/orch_so_file.cpp +++ b/src/a2a3/platform/sim/aicpu/orch_so_file.cpp @@ -24,10 +24,17 @@ #include -int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) { +int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) { // mkstemps: multiple sim workers can share a process, so names must be // unique per call. The "XXXXXX" template is replaced in-place. - int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir); + // callable_id is embedded purely for log readability (mkstemps already + // guarantees uniqueness regardless). + int32_t written; + if (callable_id >= 0) { + written = snprintf(out_path, out_path_size, "%s/libdevice_orch_cid%d_XXXXXX.so", dir, callable_id); + } else { + written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir); + } if (written < 0 || static_cast(written) >= out_path_size) { return -1; } diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 26db1e3d6..8cee9029e 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -36,6 +36,7 @@ #include "aicpu/platform_aicpu_affinity.h" #include "callable.h" +#include "callable_protocol.h" #include "utils/elf_build_id.h" #include "cpu_sim_context.h" #include "host/raii_scope_guard.h" @@ -673,13 +674,46 @@ void DeviceRunner::unload_executor_binaries() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { + // Per-callable_id path: mirror onboard. Bytes were staged at + // register_prepared_callable time; here we only stamp metadata onto + // the runtime and resolve `register_new_callable_id_` from first sighting. + const int32_t cid = runtime.get_active_callable_id(); + if (cid >= 0) { + auto it = prepared_callables_.find(cid); + if (it == prepared_callables_.end()) { + LOG_ERROR("prepare_orch_so: callable_id=%d not registered", cid); + return -1; + } + const auto &state = it->second; + // hbg: orch SO never crosses host/device — clear device-orch metadata + // and skip AICPU bookkeeping. See onboard/device_runner.cpp. 
+ if (state.host_dlopen_handle != nullptr) { + runtime.set_dev_orch_so(0, 0); + runtime.set_active_callable_id(cid, /*is_new=*/false); + return 0; + } + const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; + if (first_sighting) { + ++aicpu_dlopen_total_; + } + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size); + runtime.set_active_callable_id(cid, first_sighting); + runtime.pending_orch_so_data_ = nullptr; + runtime.pending_orch_so_size_ = 0; + LOG_INFO_V0( + "Orch SO prepared cid=%d hash=0x%lx %zu bytes (is_new=%d)", cid, state.hash, state.dev_orch_so_size, + first_sighting ? 1 : 0 + ); + return 0; + } + const void *host_so_data = runtime.pending_orch_so_data_; const size_t host_so_size = runtime.pending_orch_so_size_; runtime.pending_orch_so_data_ = nullptr; runtime.pending_orch_so_size_ = 0; if (host_so_data == nullptr || host_so_size == 0) { - runtime.set_dev_orch_so(0, 0, false); + runtime.set_dev_orch_so(0, 0); return 0; } @@ -687,7 +721,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) { LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size); - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/false); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); return 0; } @@ -715,11 +749,156 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) { std::memcpy(dev_orch_so_buffer_, host_orch_so_copy_.data(), host_so_size); cached_orch_so_hash_ = new_hash; - runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size, /*is_new=*/true); + runtime.set_dev_orch_so(reinterpret_cast(dev_orch_so_buffer_), host_so_size); LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size); return 0; } +int DeviceRunner::register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name, + std::vector> kernel_addrs +) { + // The AICPU executor reserves `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]` + // (declared in src/common/task_interface/callable_protocol.h) and indexes it by + // callable_id; rejecting an out-of-range id here keeps the host and + // AICPU sides in sync and avoids an OOB access at run time. + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (orch_so_data == nullptr || orch_so_size == 0) { + LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id); + return -1; + } + + const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size); + + auto buf_it = orch_so_dedup_.find(hash); + uint64_t dev_addr = 0; + if (buf_it == orch_so_dedup_.end()) { + void *buf = mem_alloc_.alloc(orch_so_size); + if (buf == nullptr) { + LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size); + return -1; + } + // Sim shares an address space with the simulated AICPU thread, so a + // plain memcpy is the moral equivalent of rtMemcpy on hardware. 
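Stepping back to the first-sighting bookkeeping above: both `is_new` and the dlopen counter hang off `std::unordered_set::insert(...).second`, which is true exactly once per id between registration and unregistration. A self-contained sketch of the documented semantics, with illustrative stand-ins for `aicpu_seen_callable_ids_` / `aicpu_dlopen_total_`:

```cpp
// The counter rises once per first sighting and never falls; erasing the id
// (unregister) re-arms the first-sighting condition, so re-prepare counts
// again — matching the aicpu_dlopen_count() doc comment.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <unordered_set>

int main() {
  std::unordered_set<int32_t> seen;  // stands in for aicpu_seen_callable_ids_
  size_t dlopen_total = 0;           // stands in for aicpu_dlopen_total_

  auto bind = [&](int32_t cid) {
    if (seen.insert(cid).second) ++dlopen_total;  // first sighting only
  };

  bind(7); bind(7);      // prepare once, run twice: one dlopen
  assert(dlopen_total == 1);
  seen.erase(7);         // unregister_prepared_callable
  bind(7);               // re-prepare + run
  assert(dlopen_total == 2);  // counted again, by design
  return 0;
}
```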
+ std::memcpy(buf, orch_so_data, orch_so_size); + OrchSoBuffer entry; + entry.dev_addr = buf; + entry.capacity = orch_so_size; + entry.refcount = 1; + orch_so_dedup_.emplace(hash, entry); + dev_addr = reinterpret_cast(buf); + LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size); + } else { + buf_it->second.refcount++; + dev_addr = reinterpret_cast(buf_it->second.dev_addr); + LOG_INFO_V0( + "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount + ); + } + + PreparedCallableState state; + state.hash = hash; + state.dev_orch_so_addr = dev_addr; + state.dev_orch_so_size = orch_so_size; + state.func_name = (func_name != nullptr) ? func_name : ""; + state.config_name = (config_name != nullptr) ? config_name : ""; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + return 0; +} + +int DeviceRunner::register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs +) { + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id, + MAX_REGISTERED_CALLABLE_IDS + ); + return -1; + } + if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) { + LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id); + return -1; + } + if (prepared_callables_.count(callable_id) != 0) { + LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id); + return -1; + } + + PreparedCallableState state; + state.host_dlopen_handle = host_dlopen_handle; + state.host_orch_func_ptr = host_orch_func_ptr; + state.kernel_addrs = std::move(kernel_addrs); + prepared_callables_.emplace(callable_id, std::move(state)); + prepared_callable_path_used_ = true; + ++host_dlopen_total_; + LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_); + return 0; +} + +int DeviceRunner::unregister_prepared_callable(int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + return 0; + } + PreparedCallableState state = std::move(it->second); + prepared_callables_.erase(it); + aicpu_seen_callable_ids_.erase(callable_id); + + if (state.host_dlopen_handle != nullptr) { + // hbg: dlclose the host handle; no orch SO refcount to decrement. 
+ dlclose(state.host_dlopen_handle); + return 0; + } + + auto buf_it = orch_so_dedup_.find(state.hash); + if (buf_it != orch_so_dedup_.end()) { + if (--buf_it->second.refcount <= 0) { + mem_alloc_.free(buf_it->second.dev_addr); + orch_so_dedup_.erase(buf_it); + } + } + return 0; +} + +bool DeviceRunner::has_prepared_callable(int32_t callable_id) const { + return prepared_callables_.count(callable_id) != 0; +} + +int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id); + return -1; + } + const auto &state = it->second; + for (const auto &kv : state.kernel_addrs) { + if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); + return -1; + } + runtime.replay_function_bin_addr(kv.first, kv.second); + } + runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; + runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; + runtime.set_device_orch_func_name(state.func_name.c_str()); + runtime.set_device_orch_config_name(state.config_name.c_str()); + runtime.set_active_callable_id(callable_id, /*is_new=*/false); + return 0; +} + int DeviceRunner::finalize() { // Skip if already finalized if (device_id_ == -1 && aicpu_so_handle_ == nullptr && aicore_so_handle_ == nullptr) { @@ -744,15 +923,22 @@ int DeviceRunner::finalize() { pmu_collector_.finalize(nullptr, free_cb, &mem_alloc_); } - // Kernel binaries should have been removed by validate_runtime_impl() + // Kernel binaries are normally released by validate_runtime_impl on the + // legacy run() path. The prepared-callable path intentionally leaves + // them resident across runs and relies on finalize() to reclaim them; + // that is not a leak. if (!func_id_to_addr_.empty()) { - LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); - // Cleanup leaked handles and host copies + const bool prepared_path_used = prepared_callable_path_used_; + if (prepared_path_used) { + LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); + } else { + LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); + } for (auto &pair : func_id_to_addr_) { MappedKernel &kernel = pair.second; if (kernel.dl_handle != nullptr) { dlclose(kernel.dl_handle); - LOG_DEBUG("Closed leaked kernel: func_id=%d", pair.first); + LOG_DEBUG("Closed kernel: func_id=%d", pair.first); } delete[] kernel.callable_buf; } @@ -769,6 +955,27 @@ int DeviceRunner::finalize() { host_orch_so_copy_.clear(); host_orch_so_copy_.shrink_to_fit(); + // Release any prepared-callable orch SO buffers callers forgot to drop. + for (auto &kv : orch_so_dedup_) { + if (kv.second.dev_addr != nullptr) { + mem_alloc_.free(kv.second.dev_addr); + } + } + orch_so_dedup_.clear(); + // hbg path: dlclose any host orch handles callers forgot to unregister. + // finalize() is the last chance; Worker.close() does not auto-unregister + // each callable_id, so without this loop the host process leaks one + // dlopen handle per (re)created Worker — observable in long-running + // pytest sessions. 
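The unregister path above and the finalize() backstop operate on the same hash-keyed refcount. A minimal model of that contract, with `malloc`/`free` standing in for `mem_alloc_.alloc`/`free` and all names illustrative rather than the real API:

```cpp
// Identical SO bytes share one buffer; release decrements; a finalize-style
// sweep reclaims whatever callers forgot to release.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <unordered_map>

struct Buf { void *dev_addr; int refcount; };
std::unordered_map<uint64_t, Buf> dedup;

void *acquire(uint64_t hash, size_t size) {  // ~register_prepared_callable
  auto it = dedup.find(hash);
  if (it != dedup.end()) { ++it->second.refcount; return it->second.dev_addr; }
  void *p = std::malloc(size);
  dedup.emplace(hash, Buf{p, 1});
  return p;
}

void release(uint64_t hash) {                // ~unregister_prepared_callable
  auto it = dedup.find(hash);
  if (it != dedup.end() && --it->second.refcount <= 0) {
    std::free(it->second.dev_addr);
    dedup.erase(it);
  }
}

int main() {
  void *a = acquire(0xabc, 64);              // cid 0
  void *b = acquire(0xabc, 64);              // cid 1, same SO bytes
  assert(a == b);                            // shared buffer
  release(0xabc);                            // cid 0 unregistered
  assert(dedup.at(0xabc).refcount == 1);     // buffer survives for cid 1
  // cid 1 never unregistered: the finalize-style sweep reclaims it anyway.
  for (auto &kv : dedup) std::free(kv.second.dev_addr);
  dedup.clear();
  return 0;
}
```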
+ for (auto &kv : prepared_callables_) { + if (kv.second.host_dlopen_handle != nullptr) { + dlclose(kv.second.host_dlopen_handle); + } + } + prepared_callables_.clear(); + aicpu_seen_callable_ids_.clear(); + aicpu_dlopen_total_ = 0; + // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); @@ -794,11 +1001,25 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data return 0; } - // Return cached callable address if already uploaded + // Return cached callable address if already uploaded *and* the new bytes + // match. With the prepared-callable path, multiple ChipCallables share a + // single ChipWorker (and hence DeviceRunner) and can pick distinct + // kernel binaries for the same func_id. Naively reusing the cached + // entry hands the AICore the previous callable's kernel and segfaults + // at dispatch. auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { - LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id); - return reinterpret_cast(it->second.callable_buf); + const auto &cached_callable = *reinterpret_cast(it->second.callable_buf); + const auto *new_callable = reinterpret_cast(bin_data); + if (cached_callable.binary_size() == new_callable->binary_size() && + std::memcmp(cached_callable.binary_data(), new_callable->binary_data(), new_callable->binary_size()) == 0) { + LOG_INFO_V0("Kernel func_id=%d already uploaded (matching bytes), returning cached address", func_id); + return reinterpret_cast(it->second.callable_buf); + } + LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id); + if (it->second.dl_handle != nullptr) dlclose(it->second.dl_handle); + delete[] it->second.callable_buf; + func_id_to_addr_.erase(it); } // Extract binary from CoreCallable envelope diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 210aeb9ba..994d92c3b 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include #include "common/core_type.h" @@ -210,6 +212,23 @@ class DeviceRunner { */ void remove_kernel_binary(int func_id); + int register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, + const char *config_name, std::vector> kernel_addrs + ); + // Host-orchestration sibling of register_prepared_callable; see + // src/a2a3/platform/onboard/host/device_runner.h for the contract. Sim + // shares the host-only dlopen path verbatim (no AICPU side effects). + int register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs + ); + int unregister_prepared_callable(int32_t callable_id); + bool has_prepared_callable(int32_t callable_id) const; + int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); + size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + size_t host_dlopen_count() const { return host_dlopen_total_; } + private: // Configuration int device_id_{-1}; @@ -232,6 +251,38 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; + // Per-callable_id prepared state. Mirrors onboard. 
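The `PreparedCallableState` struct declared next carries fields for both paths, and `host_dlopen_handle` doubles as the discriminator. A hedged sketch of that invariant with toy types, not the real declarations:

```cpp
// Non-null host_dlopen_handle means hbg (host-side orchestration); null means
// trb (device-side dlopen of the staged SO). The invariant holds because
// exactly one register_* overload runs for a given callable_id.
#include <cstddef>
#include <cstdint>

struct State {
  uint64_t dev_orch_so_addr{0};       // trb
  size_t dev_orch_so_size{0};         // trb
  void *host_dlopen_handle{nullptr};  // hbg (doubles as the tag)
};

enum class Path { Trb, Hbg };

Path active_path(const State &s) {
  return s.host_dlopen_handle != nullptr ? Path::Hbg : Path::Trb;
}

int main() {
  State trb;
  trb.dev_orch_so_addr = 0x1000;
  trb.dev_orch_so_size = 4096;
  State hbg;
  hbg.host_dlopen_handle = reinterpret_cast<void *>(0x1);  // stand-in handle
  return (active_path(trb) == Path::Trb && active_path(hbg) == Path::Hbg) ? 0 : 1;
}
```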
+ struct PreparedCallableState { + // trb path + uint64_t hash{0}; + uint64_t dev_orch_so_addr{0}; + size_t dev_orch_so_size{0}; + std::string func_name; + std::string config_name; + // common + std::vector<std::pair<int, uint64_t>> kernel_addrs; + // hbg path + void *host_dlopen_handle{nullptr}; + void *host_orch_func_ptr{nullptr}; + }; + struct OrchSoBuffer { + void *dev_addr{nullptr}; + size_t capacity{0}; + int refcount{0}; + }; + std::unordered_map<int32_t, PreparedCallableState> prepared_callables_; + std::unordered_map<uint64_t, OrchSoBuffer> orch_so_dedup_; + std::unordered_set<int32_t> aicpu_seen_callable_ids_; + size_t aicpu_dlopen_total_{0}; + size_t host_dlopen_total_{0}; + // Sticky flag: prepare_callable was called at least once in this + // DeviceRunner's lifetime. unregister_prepared_callable clears the maps + // above, so we cannot use them at finalize() time to decide whether a + // remaining func_id_to_addr_ entry is a legacy-path leak or a kernel + // legitimately staged by prepare_callable (which is owned until finalize + // by design). + bool prepared_callable_path_used_{false}; + // AICPU executor SO: load-once, matching onboard's binaries_loaded_ pattern. // The aicpu_executor g_aicpu_executor static lives inside the dlopen'd DSO; // reloading it destroys orch_so_handle_ and breaks the orch-SO cache-hit path. @@ -279,8 +330,8 @@ * Stage the orchestration SO bytes into a host-resident buffer that * `aicpu_executor` can dlopen. Identical contract to the onboard * version: `runtime.pending_orch_so_data_/size_` are consumed and - * `runtime.{dev_orch_so_addr_, dev_orch_so_size_, has_new_orch_so_}` - * are populated with the cache-aware result. + * `runtime.{dev_orch_so_addr_, dev_orch_so_size_}` are populated with + * the cache-aware result. */ int prepare_orch_so(Runtime &runtime); diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index b8315b31a..79b54bf51 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -22,6 +22,8 @@ #include #include + +#include #include #include "common/unified_log.h" @@ -35,7 +37,8 @@ extern "C" { /* =========================================================================== * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); +int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); +int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); /* =========================================================================== @@ -156,19 +159,149 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, +int finalize_device(DeviceContextHandle ctx) { + if (ctx == NULL) return -1; + try { + int rc = static_cast<DeviceRunner *>(ctx)->finalize(); + int dev = pto_cpu_sim_get_bound_device(); + if (dev >= 0) { + pto_cpu_sim_release_device(dev); + } + return rc; + } catch (...) { + return -1; + } +} + +/* =========================================================================== + * ACL lifecycle stubs.
Sim has no ACL / aclrtStream concept, so these + * no-op to satisfy the uniform host_runtime.so ABI (ChipWorker dlsym's the + * full extension surface unconditionally). The paired comm_init / barrier / + * destroy entry points already live in comm_sim.cpp. + * =========================================================================== */ + +int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) { + (void)ctx; + (void)device_id; + return 0; +} + +void *create_comm_stream_ctx(DeviceContextHandle ctx) { + (void)ctx; + return NULL; +} + +int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { + (void)ctx; + (void)stream; + return 0; +} + +/* =========================================================================== + * Internal helpers called from runtime_maker.cpp via Runtime.host_api + * =========================================================================== */ + +void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { + if (runtime == NULL) return; + Runtime *r = static_cast(runtime); + r->record_tensor_pair(host_ptr, dev_ptr, size); +} + +void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { + if (ctx == NULL) return; + // No CANN dlog on sim. + HostLogger::get_instance().set_level(static_cast(log_level)); + HostLogger::get_instance().set_info_v(log_info_v); + DeviceRunner *runner = static_cast(ctx); + runner->set_log_level(log_level); + runner->set_log_info_v(log_info_v); +} + +/* =========================================================================== + * Per-callable_id preparation + * =========================================================================== */ + +int prepare_callable( + DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, + size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size +) { + if (ctx == NULL || callable == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + (void)aicpu_binary; + (void)aicpu_size; + (void)aicore_binary; + (void)aicore_size; + (void)device_id; + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + + try { + // Heap-allocate the temp Runtime — sizeof(Runtime) is in the tens of MB + // for hbg variants (RUNTIME_MAX_TASKS=131072), well past the stack + // budget. unique_ptr keeps the cleanup symmetric on every exit. 
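The `prepare_callable` body below follows a harvest-then-disarm pattern: copy the kernel table out of the temporary Runtime, then clear its registration so teardown cannot free what was handed off. A toy illustration of just that pattern, with all names hypothetical:

```cpp
// A temporary owner registers resources; the caller copies the table out and
// clears the registration, so the owner's teardown has nothing left to free.
// Mirrors the clear_registered_kernels() step in prepare_callable.
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

struct TempOwner {
  std::vector<std::pair<int, uint64_t>> registered;  // func_id -> dev addr
  ~TempOwner() {
    // Teardown would free whatever is still registered (as
    // validate_runtime_impl does); after clear(), nothing remains.
  }
};

int main() {
  std::vector<std::pair<int, uint64_t>> harvested;
  {
    TempOwner owner;
    owner.registered = {{3, 0x4000}, {5, 0x8000}};
    harvested = owner.registered;  // hand off to the prepared state
    owner.registered.clear();      // disarm before the destructor runs
  }                                // owner destroyed; harvested addrs survive
  assert(harvested.size() == 2);
  return 0;
}
```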
+ std::unique_ptr r_owner = std::make_unique(); + Runtime *r = r_owner.get(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + int rc = prepare_callable_impl(r, reinterpret_cast(callable)); + if (rc != 0) { + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + std::vector> kernel_addrs; + int kcount = r->get_registered_kernel_count(); + kernel_addrs.reserve(kcount); + for (int i = 0; i < kcount; i++) { + int fid = r->get_registered_kernel_func_id(i); + kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid)); + } + r->clear_registered_kernels(); + + if (r->pending_host_dlopen_handle_ != nullptr) { + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; + } else { + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + } + pthread_setspecific(g_runner_key, nullptr); + return rc; + } catch (...) { + pthread_setspecific(g_runner_key, nullptr); + return -1; + } +} + +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix ) { if (ctx == NULL || runtime == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + if (!runner->has_prepared_callable(callable_id)) { + LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id); + return -1; + } pthread_once(&g_runner_key_once, create_runner_key); pthread_setspecific(g_runner_key, ctx); - DeviceRunner *runner = static_cast(ctx); try { - // Phase 1: placement new + build graph Runtime *r = new (runtime) Runtime(); r->host_api.device_malloc = device_malloc; r->host_api.device_free = device_free; @@ -177,9 +310,14 @@ int run_runtime( r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; - int rc = init_runtime_impl( - r, reinterpret_cast(callable), reinterpret_cast(args) - ); + int rc = runner->bind_prepared_callable_to_runtime(*r, callable_id); + if (rc != 0) { + r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + rc = bind_prepared_to_runtime_impl(r, reinterpret_cast(args)); if (rc != 0) { r->set_gm_sm_ptr(nullptr); validate_runtime_impl(r); @@ -188,15 +326,11 @@ int run_runtime( return rc; } - // Phase 2: publish diagnostics enablement to the DeviceRunner so run() - // and its helpers can read the three sub-features uniformly (via - // members, not Runtime / run() args). 
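`run_prepared` above constructs the Runtime with placement new into caller-owned `RuntimeHandle` storage and tears it down with an explicit destructor call on every exit path. A minimal sketch of that contract with a toy type, not the real Runtime:

```cpp
// The C API never owns the allocation: construct with placement new into the
// caller's bytes, destroy with an explicit destructor call, never delete.
#include <new>

struct Toy {
  int tasks{0};
  ~Toy() { /* release per-run resources */ }
};

int run_into(void *storage, bool fail_bind) {
  Toy *t = new (storage) Toy();  // construct into caller-owned storage
  if (fail_bind) {
    t->~Toy();                   // error path: explicit destroy, no delete
    return -1;
  }
  t->tasks = 1;
  t->~Toy();                     // success path: same explicit destroy
  return 0;
}

int main() {
  alignas(Toy) unsigned char storage[sizeof(Toy)];
  return run_into(storage, /*fail_bind=*/false);
}
```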
runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); runner->set_dump_tensor_enabled(enable_dump_tensor != 0); runner->set_pmu_enabled(enable_pmu); runner->set_output_prefix(output_prefix); - // Phase 3: launch std::vector aicpu_vec; std::vector aicore_vec; if (aicpu_binary != NULL && aicpu_size > 0) { @@ -213,7 +347,6 @@ int run_runtime( return rc; } - // Phase 4: finalize (copy results back) rc = validate_runtime_impl(r); r->~Runtime(); pthread_setspecific(g_runner_key, nullptr); @@ -224,62 +357,31 @@ int run_runtime( } } -int finalize_device(DeviceContextHandle ctx) { +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { if (ctx == NULL) return -1; try { - int rc = static_cast(ctx)->finalize(); - int dev = pto_cpu_sim_get_bound_device(); - if (dev >= 0) { - pto_cpu_sim_release_device(dev); - } - return rc; + return static_cast(ctx)->unregister_prepared_callable(callable_id); } catch (...) { return -1; } } -/* =========================================================================== - * ACL lifecycle stubs. Sim has no ACL / aclrtStream concept, so these - * no-op to satisfy the uniform host_runtime.so ABI (ChipWorker dlsym's the - * full extension surface unconditionally). The paired comm_init / barrier / - * destroy entry points already live in comm_sim.cpp. - * =========================================================================== */ - -int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) { - (void)ctx; - (void)device_id; - return 0; -} - -void *create_comm_stream_ctx(DeviceContextHandle ctx) { - (void)ctx; - return NULL; -} - -int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { - (void)ctx; - (void)stream; - return 0; -} - -/* =========================================================================== - * Internal helpers called from runtime_maker.cpp via Runtime.host_api - * =========================================================================== */ - -void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { - if (runtime == NULL) return; - Runtime *r = static_cast(runtime); - r->record_tensor_pair(host_ptr, dev_ptr, size); +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; + } } -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { - if (ctx == NULL) return; - // No CANN dlog on sim. - HostLogger::get_instance().set_level(static_cast(log_level)); - HostLogger::get_instance().set_info_v(log_info_v); - DeviceRunner *runner = static_cast(ctx); - runner->set_log_level(log_level); - runner->set_log_info_v(log_info_v); +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->aicpu_dlopen_count(); + } catch (...) { + return 0; + } } } // extern "C" diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index cf6618170..390ad3d19 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -276,31 +276,27 @@ extern "C" { #endif /** - * Initialize a pre-allocated runtime with dynamic orchestration. - * - * This function loads the orchestration SO from binary data via a temp file, - * resolves the orchestration function via dlsym, then calls it to build the - * task graph. 
- *  - Allocating device memory via device_malloc()
- *  - Copying data to device via copy_to_device()
- *  - Building the task graph
- *  - Recording tensor pairs via record_tensor_pair()
- *
- * @param runtime   Pointer to pre-constructed Runtime
- * @param callable  ChipCallable containing orch binary, func_name, and child kernels
- * @param orch_args Separated tensor/scalar arguments
- * @return 0 on success, -1 on failure
+ * Stage the per-callable resources for the host_build_graph variant: upload
+ * kernel binaries and dlopen the orchestration SO on the host. The dlopen
+ * handle and resolved entry-symbol pointer are parked on the runtime via
+ * `pending_host_dlopen_handle_` / `pending_host_orch_func_ptr_` so the
+ * platform layer can hoist them into PreparedCallableState. Splitting this
+ * out of init_runtime_impl is what the hbg prepare_callable / run_prepared
+ * path rests on — the dlopen runs once per cid instead of every run.
  */
-int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) {
-    // Validate inputs
+int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) {
     if (runtime == nullptr) {
         LOG_ERROR("Runtime pointer is null");
         return -1;
     }
+    if (callable == nullptr) {
+        LOG_ERROR("Callable pointer is null");
+        return -1;
+    }
 
     // Register kernel binaries from ChipCallable children
     if (callable->child_count() > 0) {
-        LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count());
+        LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count());
         for (int32_t i = 0; i < callable->child_count(); i++) {
             int func_id = callable->child_func_id(i);
             if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
@@ -329,7 +325,9 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
         return -1;
     }
 
-    // Load orchestration SO from binary data via temp file
+    // Load orchestration SO from binary data via temp file. Held open across
+    // the lifetime of the prepared callable; closed by
+    // DeviceRunner::unregister_prepared_callable.
     std::string fd_path;
     if (!create_temp_so_file(orch_so_binary, orch_so_size, &fd_path)) {
         LOG_ERROR("Failed to create temp SO file");
@@ -343,7 +341,7 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
         return -1;
     }
 
-    dlerror();  // Clear any existing error
+    dlerror();
     OrchestrationFunc orch_func = reinterpret_cast<OrchestrationFunc>(dlsym(handle, orch_func_name));
     const char *dlsym_error = dlerror();
     if (dlsym_error != nullptr) {
@@ -354,11 +352,42 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
 
     LOG_INFO_V0("Loaded orchestration function: %s", orch_func_name);
 
-    // Clear any previous tensor pairs
+    runtime->pending_host_dlopen_handle_ = handle;
+    runtime->pending_host_orch_func_ptr_ = reinterpret_cast<void *>(orch_func);
+    // hbg never uploads orch SO bytes to the device; clear the trb staging
+    // fields so DeviceRunner::register_prepared_callable cannot mistake this
+    // for a trb-shaped registration.
+    runtime->pending_orch_so_data_ = nullptr;
+    runtime->pending_orch_so_size_ = 0;
+    return 0;
+}
+
+/**
+ * Per-run binding for hbg: invoke the previously-resolved orchestration entry
+ * point against the supplied args, then upload tensor info / allocation
+ * storage. Assumes prepare_callable_impl populated
+ * `pending_host_orch_func_ptr_` (either freshly during prepare_callable, or
+ * via DeviceRunner::bind_prepared_callable_to_runtime when run_prepared
+ * replays a prepared cid onto a fresh Runtime).
+ */
+int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
+    if (orch_args == nullptr) {
+        LOG_ERROR("orch_args pointer is null");
+        return -1;
+    }
+    OrchestrationFunc orch_func = reinterpret_cast<OrchestrationFunc>(runtime->pending_host_orch_func_ptr_);
+    if (orch_func == nullptr) {
+        LOG_ERROR("bind_prepared_to_runtime_impl: host orch_func pointer is null");
+        return -1;
+    }
+
+    runtime->clear_tensor_pairs();
 
     LOG_INFO_V0("=== Calling Orchestration Function ===");
-    LOG_DEBUG(
         "Args count: %d (%d tensors + %d scalars)", orch_args->tensor_count() + orch_args->scalar_count(),
         orch_args->tensor_count(), orch_args->scalar_count()
@@ -370,13 +399,10 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
         &k_orchestration_runtime_ops, runtime, &tensor_info_builder, &tensor_allocation_builder
     };
 
-    // Call orchestration function to build task graph
-    // The orchestration function handles device memory allocation and copy-to-device
     int rc = orch_func(reinterpret_cast(&orchestration_runtime), *orch_args);
     if (rc != 0) {
         LOG_ERROR("Orchestration function failed with code %d", rc);
         runtime->clear_tensor_pairs();
-        dlclose(handle);
         return rc;
     }
 
@@ -384,7 +410,6 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
     if (rc != 0) {
         LOG_ERROR("Failed to upload tensor allocations: %d", rc);
         runtime->clear_tensor_pairs();
-        dlclose(handle);
         return rc;
     }
 
@@ -396,16 +421,10 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip
         runtime->clear_tensor_allocation_storage();
     }
     runtime->clear_tensor_pairs();
-    dlclose(handle);
     return rc;
 }
 
     LOG_INFO_V0("Runtime initialized. Ready for execution from Python.");
-
-    // Host orchestration is complete once orch_func returns. The task graph now
-    // lives in Runtime, so the orchestration SO can be closed immediately.
-    dlclose(handle);
-
     return 0;
 }
diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
index 46b673878..25d25dc76 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
@@ -434,6 +434,19 @@ class Runtime {
      */
     void set_function_bin_addr(int func_id, uint64_t addr);
 
+    /**
+     * Replay a previously-uploaded kernel address onto a fresh Runtime
+     * without recording it in registered_kernel_func_ids_. Used by
+     * DeviceRunner::bind_prepared_callable_to_runtime when restoring kernels
+     * across run_prepared invocations: the prepared callable owns the
+     * kernel binaries' device memory until unregister, so
+     * validate_runtime_impl must NOT free them.
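+     *
+     * Editor's illustration (hypothetical values, not part of the patch):
+     * after
+     *   replay_function_bin_addr(5, 0x1000);
+     * get_function_bin_addr(5) returns 0x1000, yet
+     * get_registered_kernel_count() is unchanged, so validate-time kernel
+     * cleanup never sees func_id 5.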
+     */
+    void replay_function_bin_addr(int func_id, uint64_t addr) {
+        if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return;
+        func_id_to_addr_[func_id] = addr;
+    }
+
     int get_registered_kernel_count() const { return registered_kernel_count_; }
 
     int get_registered_kernel_func_id(int index) const {
@@ -451,24 +464,67 @@ class Runtime {
     // NOTE: Placed at end of class to avoid affecting device memory layout
     HostApi host_api;
 
-    // Device orchestration SO metadata: device buffer + dirty flag (host
+    // Device orchestration SO metadata: device buffer pointer + size (host
     // populates these via DeviceRunner::prepare_orch_so before launch).
     // host_build_graph runtime variant currently does not load device
     // orchestration SOs, but DeviceRunner is shared with the other variants
     // and unconditionally writes these fields, so they must exist.
     uint64_t dev_orch_so_addr_{0};
     uint64_t dev_orch_so_size_{0};
-    bool has_new_orch_so_{false};
+
+    // Per-callable_id dispatch. hbg orch runs on host, so AICPU never reads
+    // `active_callable_id_`; the field exists for parity with the
+    // shared platform layer (DeviceRunner stamps it on every run).
+    int32_t active_callable_id_{-1};
+    bool register_new_callable_id_{false};
 
     // Host-only staging fields (mirror tensormap_and_ringbuffer variant).
     const void *pending_orch_so_data_{nullptr};
     size_t pending_orch_so_size_{0};
 
-    void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) {
+    // Host-orchestration staging (hbg path). prepare_callable_impl
+    // dlopens the orch SO on the host and parks the handle + entry-symbol
+    // pointer here so DeviceRunner::register_prepared_callable_host_orch can
+    // claim them; bind_prepared_callable_to_runtime restores them onto a fresh
+    // Runtime so bind_prepared_to_runtime_impl can call orch_func without a
+    // second dlopen. Distinct from `pending_orch_so_data_` (which is unused on
+    // hbg — host orchestration never uploads the SO bytes to the device).
+    void *pending_host_dlopen_handle_{nullptr};
+    void *pending_host_orch_func_ptr_{nullptr};
+
+    // Device-orchestration entry/config symbol names (trb path). Always
+    // empty on this hbg variant — included for API parity so the shared
+    // platform layer can call set_device_orch_func_name unconditionally.
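+    // (Editor's note on the 64-byte buffers below: the strncpy setters
+    // always NUL-terminate and silently truncate, so, for a hypothetical
+    // 100-char symbol name,
+    //   set_device_orch_func_name(std::string(100, 'x').c_str());
+    //   strlen(get_device_orch_func_name()) == 63
+    // holds by construction.)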
+    char device_orch_func_name_[64]{};
+    char device_orch_config_name_[64]{};
+
+    void set_device_orch_func_name(const char *name) {
+        device_orch_func_name_[0] = '\0';
+        if (name) {
+            strncpy(device_orch_func_name_, name, sizeof(device_orch_func_name_) - 1);
+            device_orch_func_name_[sizeof(device_orch_func_name_) - 1] = '\0';
+        }
+    }
+    const char *get_device_orch_func_name() const { return device_orch_func_name_; }
+    void set_device_orch_config_name(const char *name) {
+        device_orch_config_name_[0] = '\0';
+        if (name) {
+            strncpy(device_orch_config_name_, name, sizeof(device_orch_config_name_) - 1);
+            device_orch_config_name_[sizeof(device_orch_config_name_) - 1] = '\0';
+        }
+    }
+    const char *get_device_orch_config_name() const { return device_orch_config_name_; }
+
+    void set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
         dev_orch_so_addr_ = dev_addr;
         dev_orch_so_size_ = size;
-        has_new_orch_so_ = is_new;
     }
+    void set_active_callable_id(int32_t callable_id, bool is_new) {
+        active_callable_id_ = callable_id;
+        register_new_callable_id_ = is_new;
+    }
+    int32_t get_active_callable_id() const { return active_callable_id_; }
+    bool register_new_callable_id() const { return register_new_callable_id_; }
 };
 
 #endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index ab795b6f8..a15584829 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -24,6 +24,7 @@
 #include "aicpu/device_time.h"
 #include "aicpu/orch_so_file.h"
+#include "callable_protocol.h"
 #include "pto2_dispatch_payload.h"
 #include "runtime.h"
 #include "spin_hint.h"
@@ -89,6 +90,23 @@ static int32_t read_pto2_runtime_status(Runtime *runtime) {
 
 static PTO2Runtime *rt{nullptr};
 
+// Per-callable_id orchestration SO table. The executor dispatches
+// `orch_so_table_[active_callable_id_]` (created on first sighting of
+// that callable_id, kept warm across runs).
+// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values
+// (mailbox uint32 callable_id, register() returns small ints) and is shared
+// with the host bounds check in DeviceRunner::register_prepared_callable —
+// see src/common/task_interface/callable_protocol.h.
+
+struct OrchSoEntry {
+    bool in_use{false};
+    void *handle{nullptr};
+    char path[256]{};
+    DeviceOrchestrationFunc func{nullptr};
+    DeviceOrchestrationBindRuntimeFunc bind{nullptr};
+    DeviceOrchestrationConfigFunc config_func{nullptr};
+};
+
 struct AicpuExecutor {
     int32_t sched_thread_num_;
     bool orch_to_sched_{false};
@@ -107,16 +125,15 @@ struct AicpuExecutor {
     std::atomic<int32_t> finished_count_{0};
     std::atomic<bool> runtime_init_ready_{false};
 
-    // Orchestration SO handle - defer dlclose until all tasks complete
-    void *orch_so_handle_{nullptr};
-    char orch_so_path_[256]{};  // Path to orchestration SO file for cleanup
-
-    // Shared orchestration function pointer (loaded by first orch thread, used by all)
-    DeviceOrchestrationFunc orch_func_{nullptr};
-    DeviceOrchestrationBindRuntimeFunc orch_bind_runtime_{nullptr};
-    DeviceOrchestrationConfigFunc orch_config_func_{nullptr};
+    // Cached orch args pointer set by the orchestration thread before scheduler
+    // init; consumed by the (*p_func)(*orch_args_cached_) invocation below.
     const ChipStorageTaskArgs *orch_args_cached_{nullptr};
 
+    // Per-callable_id table. Single orch thread today, so first-write/read
+    // race is not possible; if multiple orch threads are ever introduced,
+    // guard the in_use=false→true transition with a mutex.
+    OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS];
+
     // ===== Scheduler context (owns all dispatch/completion/drain state) =====
     SchedulerContext sched_ctx_;
@@ -126,15 +143,14 @@ struct AicpuExecutor {
     void deinit(Runtime *runtime);
 
     ~AicpuExecutor() {
-        // Process-wide teardown (the single static instance dies here). The
-        // handle is otherwise kept alive across runs for cache-hit reuse.
-        if (orch_so_handle_ != nullptr) {
-            dlclose(orch_so_handle_);
-            orch_so_handle_ = nullptr;
-        }
-        if (orch_so_path_[0] != '\0') {
-            unlink(orch_so_path_);
-            orch_so_path_[0] = '\0';
+        // Process-wide teardown (the single static instance dies here). Every
+        // in-use callable_id slot is dlclose()'d here; each is otherwise kept
+        // alive across runs for cache-hit reuse.
+        for (auto &e : orch_so_table_) {
+            if (!e.in_use) continue;
+            if (e.handle != nullptr) dlclose(e.handle);
+            if (e.path[0] != '\0') unlink(e.path);
+            e = OrchSoEntry{};
         }
     }
 };
@@ -197,29 +213,37 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         if (runtime->get_orch_built_on_host()) {
             LOG_INFO_V0("Thread %d: Host orchestration mode, no-op", thread_idx);
         } else {
-            // Two paths:
-            //   1) has_new_orch_so == true  → host believes the SO identity
-            //      changed, so we drop the cached handle (if any), write the
-            //      new bytes to disk, and dlopen + dlsym a fresh handle.
-            //   2) has_new_orch_so == false → host detected a cache hit, so
-            //      we reuse `orch_so_handle_` / `orch_func_` / `orch_bind_runtime_`
-            //      from the previous run untouched. sm_handle / rt below are
-            //      always recreated because they bind this run's memory.
-            const bool reload_so = runtime->has_new_orch_so();
+            // Per-callable_id dispatch: the orch SO state lives in
+            // `orch_so_table_[callable_id]` keyed by registration order;
+            // reload is governed by `register_new_callable_id_`.
+            const int32_t callable_id = runtime->get_active_callable_id();
+            if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+                LOG_ERROR(
+                    "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS
+                );
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+            void **p_handle = &orch_so_table_[callable_id].handle;
+            char *p_path = orch_so_table_[callable_id].path;
+            DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func;
+            DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind;
+            DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
+            const bool reload_so = runtime->register_new_callable_id();
             if (reload_so) {
-                LOG_INFO_V0("Thread %d: New orch SO detected, (re)loading", thread_idx);
-                if (orch_so_handle_ != nullptr) {
-                    dlclose(orch_so_handle_);
-                    orch_so_handle_ = nullptr;
-                    orch_func_ = nullptr;
-                    orch_bind_runtime_ = nullptr;
-                    if (orch_so_path_[0] != '\0') {
+                LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id);
+                if (*p_handle != nullptr) {
+                    dlclose(*p_handle);
+                    *p_handle = nullptr;
+                    *p_func = nullptr;
+                    *p_bind = nullptr;
+                    if (p_path[0] != '\0') {
                         // Unlink the old file so the new open() lands on a
                         // fresh inode — protects against SIGBUS / ETXTBSY when
                         // the kernel still has the old mapping pinned.
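                        // (Editor's illustration, not part of the patch: if
                        // the reload skipped this unlink and reopened the
                        // same name with O_TRUNC, it would zero pages still
                        // mapped by the previous dlopen image, and faulting
                        // them back in is exactly the SIGBUS case above.)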
-                        unlink(orch_so_path_);
-                        orch_so_path_[0] = '\0';
+                        unlink(p_path);
+                        p_path[0] = '\0';
                     }
                 }
@@ -242,7 +266,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);
 
                 for (int32_t i = 0; i < num_candidates && !file_created; i++) {
-                    int32_t fd = create_orch_so_file(candidate_dirs[i], so_path, sizeof(so_path));
+                    int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path));
                     if (fd < 0) {
                         LOG_INFO_V0(
                             "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
@@ -281,6 +305,14 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 }
                 LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
 
+                // Unlink the on-disk SO immediately: dlopen has already mmap'd
+                // the image, so the kernel keeps the inode alive until the
+                // matching dlclose / process exit. This prevents stale
+                // libdevice_orch_<pid>_<cid>.so files from accumulating in
+                // /tmp when child processes exit via os._exit(0), which skips
+                // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
+                unlink(so_path);
+
                 const char *entry_symbol = runtime->get_device_orch_func_name();
                 if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
                     entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
@@ -333,15 +365,21 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                     bind_runtime_func = nullptr;
                 }
 
-                orch_so_handle_ = handle;
-                orch_func_ = orch_func;
-                orch_bind_runtime_ = bind_runtime_func;
-                orch_config_func_ = config_func;
-                snprintf(orch_so_path_, sizeof(orch_so_path_), "%s", so_path);
+                *p_handle = handle;
+                *p_func = orch_func;
+                *p_bind = bind_runtime_func;
+                *p_config_func = config_func;
+                snprintf(p_path, 256, "%s", so_path);
+                orch_so_table_[callable_id].in_use = true;
             } else {
-                LOG_INFO_V0("Thread %d: Reusing cached orch SO handle=%p", thread_idx, orch_so_handle_);
-                if (orch_so_handle_ == nullptr || orch_func_ == nullptr) {
-                    LOG_ERROR("Thread %d: has_new_orch_so=false but no cached SO handle/func", thread_idx);
+                LOG_INFO_V0(
+                    "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id
+                );
+                if (*p_handle == nullptr || *p_func == nullptr) {
+                    LOG_ERROR(
+                        "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx,
+                        callable_id
+                    );
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -349,8 +387,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             }
 
             // Validate arg count on every run (reload or cache hit).
-            if (orch_config_func_ != nullptr) {
-                PTO2OrchestrationConfig cfg = orch_config_func_(runtime->get_orch_args());
+            if (*p_config_func != nullptr) {
+                PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args());
                 LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count);
                 if (cfg.expected_arg_count > 0) {
                     const ChipStorageTaskArgs &args_validate = runtime->get_orch_args();
@@ -361,17 +399,18 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                         cfg.expected_arg_count
                     );
                     // Clean up cached state so a subsequent run does a full reload.
-                    if (orch_so_handle_ != nullptr) {
-                        dlclose(orch_so_handle_);
-                        orch_so_handle_ = nullptr;
+                    if (*p_handle != nullptr) {
+                        dlclose(*p_handle);
+                        *p_handle = nullptr;
                     }
-                    if (orch_so_path_[0] != '\0') {
-                        unlink(orch_so_path_);
-                        orch_so_path_[0] = '\0';
+                    if (p_path[0] != '\0') {
+                        unlink(p_path);
+                        p_path[0] = '\0';
                     }
-                    orch_func_ = nullptr;
-                    orch_bind_runtime_ = nullptr;
-                    orch_config_func_ = nullptr;
+                    *p_func = nullptr;
+                    *p_bind = nullptr;
+                    *p_config_func = nullptr;
+                    orch_so_table_[callable_id].in_use = false;
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -473,11 +512,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         orch_cycle_start = get_sys_cnt_aicpu();
 #endif
         framework_bind_runtime(rt);
-        if (orch_bind_runtime_ != nullptr) {
-            orch_bind_runtime_(rt);
+        if (*p_bind != nullptr) {
+            (*p_bind)(rt);
         }
         rt_scope_begin(rt);
-        orch_func_(*orch_args_cached_);
+        (*p_func)(*orch_args_cached_);
         rt_scope_end(rt);
 #if PTO2_PROFILING
         uint64_t orch_cycle_end = get_sys_cnt_aicpu();
@@ -633,13 +672,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     if (prev_finished + 1 == thread_num_) {
         finished_.store(true, std::memory_order_release);
         // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
-        // always tear them down here, but we keep orch_so_handle_ alive for
-        // the next run's cache-hit reuse (see run() reload_so branch).
+        // always tear them down here, but we keep the per-cid orch SO entries
+        // alive for the next run's cache-hit reuse (see run() reload_so branch).
         if (!runtime->get_orch_built_on_host() && rt != nullptr) {
             // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt.
+            const int32_t callable_id = runtime->get_active_callable_id();
             framework_bind_runtime(nullptr);
-            if (orch_bind_runtime_ != nullptr) {
-                orch_bind_runtime_(nullptr);
+            if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) {
+                DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind;
+                if (bind != nullptr) {
+                    bind(nullptr);
+                }
             }
             runtime_destroy(rt);
         }
@@ -665,10 +708,9 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     orch_to_sched_ = false;
     orch_args_cached_ = nullptr;
 
-    // orch_so_handle_ / orch_func_ / orch_bind_runtime_ / orch_config_func_ / orch_so_path_ are
-    // intentionally preserved across deinit: the next run reuses them when
-    // has_new_orch_so() == false. The destructor releases them at process
-    // teardown.
+    // orch_so_table_ entries are intentionally preserved across deinit: the
+    // next run reuses cached handles when register_new_callable_id() returns
+    // false. The destructor releases them at process teardown.
 
     // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit)
     rt = nullptr;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 5b1ca640b..b93ac103b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -92,31 +92,29 @@ static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader
 }
 
 /**
- * Initialize a pre-allocated runtime for device orchestration.
+ * Stage the per-callable resources (kernel binaries + orchestration SO) into
+ * the supplied runtime so a subsequent bind_prepared_to_runtime_impl can use
+ * them. This is the cacheable half of init_runtime_impl: nothing here depends
+ * on per-run argument values, so the prepare_callable / run_prepared split
+ * lets us run this once per callable_id and amortize across runs.
  *
- * For rt2 runtime, orchestration runs on AICPU thread 3 (device-side).
- * This function:
- *  - Copies tensor metadata and replaces host pointers with device pointers
- *  - Copies all tensor data to device
- *  - Records all tensors for copy-back
- *  - Copies orchestration SO to device memory
- *  - Sets up runtime state for device orchestration
- *
- * @param runtime   Pointer to pre-constructed Runtime
- * @param callable  ChipCallable containing orch binary, func_name, and child kernels
- * @param orch_args Separated tensor/scalar arguments
+ * @param runtime  Pointer to pre-constructed Runtime (host_api populated)
+ * @param callable ChipCallable carrying the orch SO + child kernel binaries
  * @return 0 on success, -1 on failure
  */
-extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) {
-    // Validate inputs
+extern "C" int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) {
     if (runtime == nullptr) {
         LOG_ERROR("Runtime pointer is null");
         return -1;
     }
+    if (callable == nullptr) {
+        LOG_ERROR("Callable pointer is null");
+        return -1;
+    }
 
     // Register kernel binaries from ChipCallable children
     if (callable->child_count() > 0) {
-        LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count());
+        LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count());
        for (int32_t i = 0; i < callable->child_count(); i++) {
             int func_id = callable->child_func_id(i);
             if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
@@ -146,6 +144,32 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,
         return -1;
     }
 
+    // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume.
+    runtime->pending_orch_so_data_ = orch_so_binary;
+    runtime->pending_orch_so_size_ = orch_so_size;
+    LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size);
+    return 0;
+}
+
+/**
+ * Per-run binding: build device-side argument storage (tensor copy-out, GM
+ * heap, PTO2 shared memory) and publish it to the runtime. Assumes the
+ * callable-side state (kernel binaries, orch SO bytes, func/config names)
+ * is already populated by prepare_callable_impl.
+ *
+ * Splitting this from prepare_callable_impl matches the per-callable_id
+ * design: register/run_prepared invokes this every call, while the prep
+ * half runs only once per callable_id.
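+ *
+ * Editor's sketch of the amortized sequence (hypothetical args; in the real
+ * run_prepared each run first re-binds onto a fresh placement-new Runtime):
+ *
+ *   prepare_callable_impl(r, callable);        // once: kernels + SO staged
+ *   bind_prepared_to_runtime_impl(r, args_a);  // run 1: per-run storage only
+ *   bind_prepared_to_runtime_impl(r, args_b);  // run 2: no SO re-staging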
+ *
+ * @param runtime   Pointer to pre-constructed Runtime (host_api populated)
+ * @param orch_args Separated tensor/scalar arguments for this run
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
     if (orch_args == nullptr) {
         LOG_ERROR("orch_args pointer is null");
         return -1;
@@ -153,7 +177,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,
 
     int tensor_count = orch_args->tensor_count();
     int scalar_count = orch_args->scalar_count();
-    LOG_INFO_V0("RT2 init: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
+    LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
 
     int64_t t_total_start = _now_ms();
 
@@ -196,16 +220,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,
     }
     int64_t t_args_end = _now_ms();
 
-    // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume.
-    // DeviceRunner hashes the bytes, skips the rtMemcpy when the identity is
-    // unchanged, and overwrites dev_orch_so_addr_ / size / has_new_orch_so_
-    // on Runtime before the struct is sent to device.
-    int64_t t_so_start = _now_ms();
-    runtime->pending_orch_so_data_ = orch_so_binary;
-    runtime->pending_orch_so_size_ = orch_so_size;
-    LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size);
-    int64_t t_so_end = _now_ms();
-
     // Read ready queue shard count from environment for AICPU scheduler
     {
         const char *env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS");
@@ -285,7 +299,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,
     int64_t t_total_end = _now_ms();
 
     LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
-    LOG_INFO_V0("TIMING: orch_so_copy = %" PRId64 "ms", t_so_end - t_so_start);
     LOG_INFO_V0("TIMING: gm_heap_alloc(1GB) = %" PRId64 "ms", t_heap_end - t_heap_start);
     LOG_INFO_V0("TIMING: shared_mem_alloc = %" PRId64 "ms", t_sm_end - t_sm_start);
     LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 105f1601f..ad70a259a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -189,12 +189,14 @@ class Runtime {
     // Device orchestration SO (for dlopen on AICPU thread 3).
     // The SO bytes themselves live in a separately-allocated device buffer
     // owned by DeviceRunner; only the metadata below travels inside Runtime.
-    // `has_new_orch_so_` tells AICPU whether the host believes the SO identity
-    // changed since the previous run — when false AICPU reuses its cached
-    // dlopen handle and skips writing the file again.
     uint64_t dev_orch_so_addr_;
     uint64_t dev_orch_so_size_;
-    bool has_new_orch_so_;
+    // Per-callable_id dispatch. AICPU dispatches via
+    // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_`
+    // signals whether the host is delivering a freshly-registered
+    // callable_id (write+dlopen) or reusing an already-loaded one.
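+    // (Editor's sketch of the AICPU-side consumer in aicpu_executor.cpp,
+    // simplified from the real reload branch:
+    //   OrchSoEntry &e = orch_so_table_[runtime->get_active_callable_id()];
+    //   if (runtime->register_new_callable_id()) { /* write file + dlopen into e */ }
+    //   else { /* reuse e.handle / e.func from the previous run */ } )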
+    int32_t active_callable_id_;
+    bool register_new_callable_id_;
 
     char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
     char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
@@ -247,10 +249,16 @@ class Runtime {
     void set_orch_args(const ChipStorageTaskArgs &args);
 
     // Device orchestration SO binary (for dlopen on AICPU thread 3)
-    void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new);
+    void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
     uint64_t get_dev_orch_so_addr() const;
     uint64_t get_dev_orch_so_size() const;
-    bool has_new_orch_so() const;
+    // Per-callable_id dispatch. callable_id must be in
+    // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU
+    // whether to (re)load the orch SO into orch_so_table_[callable_id] or
+    // reuse the cached entry.
+    void set_active_callable_id(int32_t callable_id, bool is_new);
+    int32_t get_active_callable_id() const;
+    bool register_new_callable_id() const;
     void set_device_orch_func_name(const char *name);
     const char *get_device_orch_func_name() const;
     void set_device_orch_config_name(const char *name);
@@ -258,6 +266,13 @@ class Runtime {
     uint64_t get_function_bin_addr(int func_id) const;
     void set_function_bin_addr(int func_id, uint64_t addr);
+    /**
+     * Replay a previously-uploaded kernel address onto a fresh Runtime
+     * without recording it in registered_kernel_func_ids_. Used by
+     * DeviceRunner::bind_prepared_callable_to_runtime so prepared kernel
+     * binaries are not freed by validate_runtime_impl across runs.
+     */
+    void replay_function_bin_addr(int func_id, uint64_t addr);
     int get_registered_kernel_count() const;
     int get_registered_kernel_func_id(int index) const;
@@ -285,11 +300,18 @@ class Runtime {
     // Host-only staging for orchestration SO. runtime_maker publishes the
     // callable-owned pointer here; DeviceRunner consumes it before launching
     // the device-side execution and replaces it with the device-resident
-    // buffer metadata (dev_orch_so_addr_, ..., has_new_orch_so_). The fields
+    // buffer metadata (dev_orch_so_addr_, dev_orch_so_size_). The fields
     // below are zeroed on the device because DeviceRunner clears them before
     // the memcpy, but their values while running on device are irrelevant.
     const void *pending_orch_so_data_{nullptr};
     size_t pending_orch_so_size_{0};
+
+    // Host-orchestration staging (hbg path). Always nullptr on this trb
+    // variant — included for API parity with host_build_graph so the
+    // shared platform layer can branch on `pending_host_dlopen_handle_ !=
+    // nullptr` at runtime instead of via a build-time macro.
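+    // (Editor's illustration, mirroring prepare_callable in the platform
+    // c_api files; the "..." argument lists are elided here, not invented:
+    //   if (r->pending_host_dlopen_handle_ != nullptr)   // hbg: host orch
+    //       runner->register_prepared_callable_host_orch(...);
+    //   else                                             // trb: device orch
+    //       runner->register_prepared_callable(...); )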
+    void *pending_host_dlopen_handle_{nullptr};
+    void *pending_host_orch_func_ptr_{nullptr};
 };
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 68d374e32..98d464549 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -52,7 +52,8 @@ Runtime::Runtime() {
     // Initialize device orchestration SO binary
     dev_orch_so_addr_ = 0;
     dev_orch_so_size_ = 0;
-    has_new_orch_so_ = false;
+    active_callable_id_ = -1;
+    register_new_callable_id_ = false;
 
     device_orch_func_name_[0] = '\0';
     device_orch_config_name_[0] = '\0';
@@ -102,18 +103,24 @@ void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
 
 void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
 
 // Device orchestration SO metadata (bytes live in a separate device buffer
-// owned by DeviceRunner; only the address/size/dirty-flag travels in Runtime).
-void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) {
+// owned by DeviceRunner; only the address/size travels in Runtime).
+void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
     dev_orch_so_addr_ = dev_addr;
     dev_orch_so_size_ = size;
-    has_new_orch_so_ = is_new;
 }
 
 uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; }
 
 uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; }
 
-bool Runtime::has_new_orch_so() const { return has_new_orch_so_; }
+void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) {
+    active_callable_id_ = callable_id;
+    register_new_callable_id_ = is_new;
+}
+
+int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
+
+bool Runtime::register_new_callable_id() const { return register_new_callable_id_; }
 
 void Runtime::set_device_orch_func_name(const char *name) {
     if (name == nullptr) {
@@ -160,6 +167,14 @@ void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
     func_id_to_addr_[func_id] = addr;
 }
 
+void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    func_id_to_addr_[func_id] = addr;
+}
+
 int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
 
 int Runtime::get_registered_kernel_func_id(int index) const {
diff --git a/src/a5/platform/include/aicpu/orch_so_file.h b/src/a5/platform/include/aicpu/orch_so_file.h
index 40bec7411..33862527e 100644
--- a/src/a5/platform/include/aicpu/orch_so_file.h
+++ b/src/a5/platform/include/aicpu/orch_so_file.h
@@ -39,10 +39,15 @@
 * Caller is expected to try the next candidate directory.
 *
 * @param dir           Candidate directory (e.g. "/tmp")
+ * @param callable_id   Per-callable_id table slot id (>= 0). Required for
+ *                      uniqueness on the onboard path so concurrently-
+ *                      resident orch SOs (one per cid) do not collide on
+ *                      the same on-disk file. Pass -1 for the legacy
+ *                      single-slot dispatch path.
 * @param out_path      Buffer that receives the full file path on success
 * @param out_path_size Size of `out_path` in bytes
 * @return Open writable fd on success, -1 on failure
 */
-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size);
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size);
 
 #endif  // PLATFORM_AICPU_ORCH_SO_FILE_H_
diff --git a/src/a5/platform/onboard/aicpu/orch_so_file.cpp b/src/a5/platform/onboard/aicpu/orch_so_file.cpp
index 322cb7dcc..4e7f55232 100644
--- a/src/a5/platform/onboard/aicpu/orch_so_file.cpp
+++ b/src/a5/platform/onboard/aicpu/orch_so_file.cpp
@@ -15,10 +15,20 @@
 
 #include
 
-int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) {
-    // Pid-based naming: AICPU device libc may lack mkstemps, and only one
-    // runtime runs per device process, so pid uniqueness is sufficient.
-    int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) {
+    // Pid + callable_id naming: AICPU device libc may lack mkstemps. With
+    // per-callable_id dispatch, multiple orch SOs can be resident in the
+    // same device process at once (one per cid in `orch_so_table_`), so
+    // the on-disk file name must be unique per cid — otherwise the
+    // second cid's `O_TRUNC` would silently shred the first cid's already
+    // dlopen'd file image and the next launch on cid=0 would SIGBUS.
+    // callable_id < 0 is the legacy single-slot path: pid alone is fine.
+    int32_t written;
+    if (callable_id >= 0) {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d_%d.so", dir, getpid(), callable_id);
+    } else {
+        written = snprintf(out_path, out_path_size, "%s/libdevice_orch_%d.so", dir, getpid());
+    }
     if (written < 0 || static_cast<size_t>(written) >= out_path_size) {
         return -1;
     }
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index 149feb7da..068e3d6bc 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -17,6 +17,8 @@
 
 #include "device_runner.h"
 
+#include <dlfcn.h>
+
 #include
 #include
 #include
@@ -24,6 +26,7 @@
 #include
 
 #include "callable.h"
+#include "callable_protocol.h"
 #include "utils/elf_build_id.h"
 #include "host/host_regs.h"  // Register address retrieval
 #include "host/raii_scope_guard.h"
@@ -599,13 +602,51 @@ void DeviceRunner::print_handshake_results() {
 }
 
 int DeviceRunner::prepare_orch_so(Runtime &runtime) {
+    // Per-callable_id path: when run_prepared bound a known callable_id,
+    // the SO bytes were already H2D'd at prepare_callable time. We just
+    // stamp dev_orch_so on the runtime and mark `is_new` based on whether
+    // the AICPU has seen this id since registration.
+    const int32_t cid = runtime.get_active_callable_id();
+    if (cid >= 0) {
+        auto it = prepared_callables_.find(cid);
+        if (it == prepared_callables_.end()) {
+            LOG_ERROR("prepare_orch_so: callable_id=%d not registered", cid);
+            return -1;
+        }
+        const auto &state = it->second;
+        // hbg variant: orch SO never crosses host/device, so AICPU does no
+        // per-cid dlopen. Skip orch_so_table_ bookkeeping and clear metadata.
+        if (state.host_dlopen_handle != nullptr) {
+            runtime.set_dev_orch_so(0, 0);
+            runtime.set_active_callable_id(cid, /*is_new=*/false);
+            return 0;
+        }
+        const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second;
+        if (first_sighting) {
+            ++aicpu_dlopen_total_;
+        }
+        runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size);
+        // The c_api caller passed is_new=false; refresh with the authoritative
+        // first_sighting flag before AICPU consumes register_new_callable_id_.
+        runtime.set_active_callable_id(cid, first_sighting);
+        // Pending fields must be empty in the prepared path — runtime_maker's
+        // bind_prepared_to_runtime_impl never stages them. Defensive clear:
+        runtime.pending_orch_so_data_ = nullptr;
+        runtime.pending_orch_so_size_ = 0;
+        LOG_INFO_V0(
+            "Orch SO prepared cid=%d hash=0x%lx %zu bytes (is_new=%d)", cid, state.hash, state.dev_orch_so_size,
+            first_sighting ? 1 : 0
+        );
+        return 0;
+    }
+
     const void *host_so_data = runtime.pending_orch_so_data_;
     const size_t host_so_size = runtime.pending_orch_so_size_;
     runtime.pending_orch_so_data_ = nullptr;
     runtime.pending_orch_so_size_ = 0;
 
     if (host_so_data == nullptr || host_so_size == 0) {
-        runtime.set_dev_orch_so(0, 0, false);
+        runtime.set_dev_orch_so(0, 0);
         return 0;
     }
 
@@ -613,7 +654,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
 
     if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) {
         LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size);
-        runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/false);
+        runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size);
         return 0;
     }
 
@@ -645,11 +686,170 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
     }
 
     cached_orch_so_hash_ = new_hash;
-    runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/true);
+    runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size);
     LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size);
     return 0;
 }
 
+int DeviceRunner::register_prepared_callable(
+    int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name,
+    std::vector<std::pair<int, uint64_t>> kernel_addrs
+) {
+    // The AICPU executor reserves `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]`
+    // (declared in src/common/task_interface/callable_protocol.h) and indexes
+    // it by callable_id; rejecting an out-of-range id here keeps host and AICPU
+    // in sync and avoids an OOB access at run time.
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS
+        );
+        return -1;
+    }
+    if (orch_so_data == nullptr || orch_so_size == 0) {
+        LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id);
+        return -1;
+    }
+    if (prepared_callables_.count(callable_id) != 0) {
+        LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id);
+        return -1;
+    }
+
+    const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size);
+
+    // Hash dedup: share device buffer across callable_ids that carry the same
+    // SO bytes. Refcount drops in unregister_prepared_callable; we only free
+    // when the count hits zero.
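+    // (Editor's example with hypothetical ids: cids 3 and 7 registering
+    // byte-identical SO bytes land on one device buffer with refcount 2;
+    // unregistering cid 3 only decrements, and the bytes stay resident
+    // until cid 7 unregisters too.)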
+    auto buf_it = orch_so_dedup_.find(hash);
+    uint64_t dev_addr = 0;
+    if (buf_it == orch_so_dedup_.end()) {
+        void *buf = mem_alloc_.alloc(orch_so_size);
+        if (buf == nullptr) {
+            LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size);
+            return -1;
+        }
+        int rc = rtMemcpy(buf, orch_so_size, orch_so_data, orch_so_size, RT_MEMCPY_HOST_TO_DEVICE);
+        if (rc != 0) {
+            LOG_ERROR("register_prepared_callable: rtMemcpy failed: %d", rc);
+            mem_alloc_.free(buf);
+            return rc;
+        }
+        OrchSoBuffer entry;
+        entry.dev_addr = buf;
+        entry.capacity = orch_so_size;
+        entry.refcount = 1;
+        orch_so_dedup_.emplace(hash, entry);
+        dev_addr = reinterpret_cast<uint64_t>(buf);
+        LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size);
+    } else {
+        buf_it->second.refcount++;
+        dev_addr = reinterpret_cast<uint64_t>(buf_it->second.dev_addr);
+        LOG_INFO_V0(
+            "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount
+        );
+    }
+
+    PreparedCallableState state;
+    state.hash = hash;
+    state.dev_orch_so_addr = dev_addr;
+    state.dev_orch_so_size = orch_so_size;
+    state.func_name = (func_name != nullptr) ? func_name : "";
+    state.config_name = (config_name != nullptr) ? config_name : "";
+    state.kernel_addrs = std::move(kernel_addrs);
+    prepared_callables_.emplace(callable_id, std::move(state));
+    prepared_callable_path_used_ = true;
+    return 0;
+}
+
+int DeviceRunner::register_prepared_callable_host_orch(
+    int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr,
+    std::vector<std::pair<int, uint64_t>> kernel_addrs
+) {
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id,
+            MAX_REGISTERED_CALLABLE_IDS
+        );
+        return -1;
+    }
+    if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) {
+        LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id);
+        return -1;
+    }
+    if (prepared_callables_.count(callable_id) != 0) {
+        LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id);
+        return -1;
+    }
+
+    PreparedCallableState state;
+    state.host_dlopen_handle = host_dlopen_handle;
+    state.host_orch_func_ptr = host_orch_func_ptr;
+    state.kernel_addrs = std::move(kernel_addrs);
+    prepared_callables_.emplace(callable_id, std::move(state));
+    prepared_callable_path_used_ = true;
+    ++host_dlopen_total_;
+    LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_);
+    return 0;
+}
+
+int DeviceRunner::unregister_prepared_callable(int32_t callable_id) {
+    auto it = prepared_callables_.find(callable_id);
+    if (it == prepared_callables_.end()) {
+        return 0;
+    }
+    PreparedCallableState state = std::move(it->second);
+    prepared_callables_.erase(it);
+    aicpu_seen_callable_ids_.erase(callable_id);
+
+    if (state.host_dlopen_handle != nullptr) {
+        // hbg path: dlclose the host handle; no orch SO refcount to decrement.
+        dlclose(state.host_dlopen_handle);
+        return 0;
+    }
+
+    auto buf_it = orch_so_dedup_.find(state.hash);
+    if (buf_it != orch_so_dedup_.end()) {
+        if (--buf_it->second.refcount <= 0) {
+            mem_alloc_.free(buf_it->second.dev_addr);
+            orch_so_dedup_.erase(buf_it);
+        }
+    }
+    return 0;
+}
+
+bool DeviceRunner::has_prepared_callable(int32_t callable_id) const {
+    return prepared_callables_.count(callable_id) != 0;
+}
+
+int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) {
+    auto it = prepared_callables_.find(callable_id);
+    if (it == prepared_callables_.end()) {
+        LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id);
+        return -1;
+    }
+    const auto &state = it->second;
+
+    // Replay kernel addresses directly into runtime.func_id_to_addr_ without
+    // going through set_function_bin_addr — the latter would record func_ids
+    // in registered_kernel_func_ids_, which validate_runtime_impl iterates to
+    // free kernel binaries. Prepared kernels must survive across runs and only
+    // be freed by finalize().
+    for (const auto &kv : state.kernel_addrs) {
+        if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) {
+            LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first);
+            return -1;
+        }
+        runtime.replay_function_bin_addr(kv.first, kv.second);
+    }
+    runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle;
+    runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr;
+    runtime.set_device_orch_func_name(state.func_name.c_str());
+    runtime.set_device_orch_config_name(state.config_name.c_str());
+    // Stamp callable_id with is_new=false; prepare_orch_so refreshes the flag
+    // with the authoritative first_sighting answer right before launch.
+    runtime.set_active_callable_id(callable_id, /*is_new=*/false);
+    return 0;
+}
+
 int DeviceRunner::finalize() {
     if (device_id_ == -1) {
         return 0;
     }
@@ -669,17 +869,27 @@ int DeviceRunner::finalize() {
     // Cleanup AICPU SO
     so_info_.finalize();
 
-    // Kernel binaries should have been removed by validate_runtime_impl()
+    // Kernel binaries are normally released by validate_runtime_impl on the
+    // legacy run() path. The prepared-callable path intentionally leaves
+    // them resident across runs (shared by func_id) and relies on
+    // finalize() to reclaim them; that is not a leak. Emit at DEBUG so the
+    // legacy regression signal is preserved for callers that never went
+    // through prepare_callable.
     if (!func_id_to_addr_.empty()) {
-        LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size());
-        // Cleanup leaked binaries to prevent memory leaks
+        const bool prepared_path_used = prepared_callable_path_used_;
+        if (prepared_path_used) {
+            LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size());
+        } else {
+            LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", func_id_to_addr_.size());
+        }
         for (const auto &pair : func_id_to_addr_) {
             void *gm_addr = reinterpret_cast<void *>(pair.second);
             mem_alloc_.free(gm_addr);
-            LOG_DEBUG("Freed leaked kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second);
+            LOG_DEBUG("Freed kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second);
         }
     }
     func_id_to_addr_.clear();
+    func_id_to_hash_.clear();
     binaries_loaded_ = false;
 
     if (dev_orch_so_buffer_ != nullptr) {
@@ -691,6 +901,29 @@ int DeviceRunner::finalize() {
     host_orch_so_copy_.clear();
     host_orch_so_copy_.shrink_to_fit();
 
+    // Release any prepared-callable orch SO buffers that callers forgot to
+    // unregister. Refcounts no longer matter at this point — the device is
+    // about to be reset.
+    for (auto &kv : orch_so_dedup_) {
+        if (kv.second.dev_addr != nullptr) {
+            mem_alloc_.free(kv.second.dev_addr);
+        }
+    }
+    orch_so_dedup_.clear();
+    // hbg path: dlclose any host orch handles callers forgot to unregister.
+    // finalize() is the last chance; Worker.close() does not auto-unregister
+    // each callable_id, so without this loop the host process leaks one
+    // dlopen handle per (re)created Worker — observable in long-running
+    // pytest sessions.
+    for (auto &kv : prepared_callables_) {
+        if (kv.second.host_dlopen_handle != nullptr) {
+            dlclose(kv.second.host_dlopen_handle);
+        }
+    }
+    prepared_callables_.clear();
+    aicpu_seen_callable_ids_.clear();
+    aicpu_dlopen_total_ = 0;
+
     // Cleanup performance profiling (frees L2PerfSetupHeader + all per-core/per-thread buffers)
     if (l2_perf_collector_.is_initialized()) {
         auto free_cb = [](void *dev_ptr) -> int {
@@ -817,11 +1050,24 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data
         return 0;
     }
 
-    // Return cached callable address if already uploaded
+    // Return cached callable address if already uploaded *and* the new bytes
+    // match. With the prepared-callable path, multiple ChipCallables share a
+    // single ChipWorker (and DeviceRunner) and can pick distinct kernel
+    // binaries for the same func_id. Naively reusing the cached entry hands
+    // the AICore the previous callable's kernel: dispatch never completes
+    // the new task and the AICPU spins forever.
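+    // (Editor's example with hypothetical ids: callable A uploads func_id=5
+    // with kernel K1; callable B later uploads func_id=5 with different
+    // bytes K2. The hash check below evicts K1's device buffer and uploads
+    // K2 instead of handing B a stale address.)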
+    const uint64_t new_hash = simpler::common::utils::elf_build_id_64(bin_data, bin_size);
     auto it = func_id_to_addr_.find(func_id);
     if (it != func_id_to_addr_.end()) {
-        LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id);
-        return it->second;
+        auto hash_it = func_id_to_hash_.find(func_id);
+        if (hash_it != func_id_to_hash_.end() && hash_it->second == new_hash) {
+            LOG_INFO_V0("Kernel func_id=%d already uploaded (matching hash), returning cached address", func_id);
+            return it->second;
+        }
+        LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id);
+        mem_alloc_.free(reinterpret_cast<void *>(it->second));
+        func_id_to_addr_.erase(it);
+        func_id_to_hash_.erase(func_id);
     }
 
     LOG_DEBUG("Uploading kernel binary: func_id=%d, size=%zu bytes", func_id, bin_size);
@@ -851,6 +1097,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data
     }
 
     func_id_to_addr_[func_id] = callable_addr;
+    func_id_to_hash_[func_id] = new_hash;
 
     LOG_DEBUG("  func_id=%d -> callable_addr=0x%lx, binary_code_addr=0x%lx", func_id, callable_addr, binary_code_addr);
 
@@ -868,6 +1115,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) {
     mem_alloc_.free(gm_addr);
 
     func_id_to_addr_.erase(it);
+    func_id_to_hash_.erase(func_id);
 
     LOG_DEBUG("Removed kernel binary: func_id=%d, addr=0x%lx", func_id, function_bin_addr);
 }
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index 4c5fab748..12c1dab84 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -33,6 +33,8 @@
 #include
 #include
 #include
+#include <unordered_map>
+#include <unordered_set>
 #include
 
 #include "common/kernel_args.h"
@@ -370,6 +372,67 @@ class DeviceRunner {
      */
     void release_run_context();
 
+    /**
+     * Stage a per-callable_id orchestration SO into device memory and remember
+     * the supporting metadata (entry/config symbol names, kernel func_id ↔
+     * dev_addr table). Identical SO bytes across two callable_ids share one
+     * device buffer (refcounted by hash), so an N-cid pool holds at most one
+     * buffer per distinct SO, never N copies of the same bytes.
+     *
+     * @param callable_id  Caller-stable id, must be in [0, MAX_REGISTERED_CALLABLE_IDS).
+     * @param orch_so_data Host pointer to orchestration SO bytes (owned by caller).
+     * @param orch_so_size Size of orchestration SO in bytes.
+     * @param func_name    Entry symbol name (copied).
+     * @param config_name  Config symbol name (copied).
+     * @param kernel_addrs func_id ↔ dev_addr pairs already uploaded by the
+     *                     caller. Stored verbatim so run_prepared can replay
+     *                     them onto a fresh Runtime without re-uploading.
+     * @return 0 on success, negative on failure.
+     */
+    int register_prepared_callable(
+        int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name,
+        const char *config_name, std::vector<std::pair<int, uint64_t>> kernel_addrs
+    );
+
+    /**
+     * Host-orchestration sibling for hbg variants. See a2a3 onboard
+     * device_runner.h for full contract. Mutually exclusive with the
+     * trb-shaped overload.
+     */
+    int register_prepared_callable_host_orch(
+        int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr,
+        std::vector<std::pair<int, uint64_t>> kernel_addrs
+    );
+
+    /**
+     * Drop the prepared state for `callable_id`. trb path: decrement orch SO
+     * refcount, free when zero. hbg path: dlclose the host handle. Kernel
+     * binaries are shared and only released by finalize().
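+     *
+     * (Editor's note: unknown ids return 0, so teardown loops may call this
+     * unconditionally, e.g. `for (int32_t cid : cids) unregister_prepared_callable(cid);`
+     * with a hypothetical `cids` list; finalize() also sweeps anything left
+     * behind.)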
+     */
+    int unregister_prepared_callable(int32_t callable_id);
+
+    /** True iff `callable_id` has prepared state staged. */
+    bool has_prepared_callable(int32_t callable_id) const;
+
+    /**
+     * Replay the prepared state for `callable_id` onto a freshly-constructed
+     * Runtime. See a2a3 onboard documentation for full contract.
+     */
+    int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id);
+
+    /**
+     * Number of distinct callable_ids the AICPU has been asked to dlopen for.
+     * Monotonically increases on first-sighting bind; never decremented.
+     */
+    size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; }
+
+    /**
+     * Number of host-side dlopens triggered by
+     * `register_prepared_callable_host_orch` (hbg variant). Mirrors
+     * `aicpu_dlopen_count` for the host-orchestration path.
+     */
+    size_t host_dlopen_count() const { return host_dlopen_total_; }
+
   private:
     // Internal state
     int device_id_{-1};
@@ -391,6 +454,7 @@ class DeviceRunner {
     // Kernel binary management
     bool binaries_loaded_{false};  // true after AICPU SO loaded
     std::map<int, uint64_t> func_id_to_addr_;  // func_id -> function_bin_addr (device GM)
+    std::map<int, uint64_t> func_id_to_hash_;  // func_id -> elf_build_id_64(bin_data)
 
     // Orchestration SO cache (host-tracked, device-resident).
     uint64_t cached_orch_so_hash_{0};
@@ -398,6 +462,39 @@ class DeviceRunner {
     size_t dev_orch_so_capacity_{0};
     std::vector<uint8_t> host_orch_so_copy_;
 
+    // Per-callable_id prepared state. See a2a3 onboard device_runner.h for
+    // the full design narrative; mirrored here so a5 shares the same
+    // dispatch surface.
+    struct PreparedCallableState {
+        // trb path
+        uint64_t hash{0};
+        uint64_t dev_orch_so_addr{0};
+        size_t dev_orch_so_size{0};
+        std::string func_name;
+        std::string config_name;
+        // common
+        std::vector<std::pair<int, uint64_t>> kernel_addrs;
+        // hbg path
+        void *host_dlopen_handle{nullptr};
+        void *host_orch_func_ptr{nullptr};
+    };
+    struct OrchSoBuffer {
+        void *dev_addr{nullptr};
+        size_t capacity{0};
+        int refcount{0};
+    };
+    std::unordered_map<int32_t, PreparedCallableState> prepared_callables_;
+    std::unordered_map<uint64_t, OrchSoBuffer> orch_so_dedup_;
+    std::unordered_set<int32_t> aicpu_seen_callable_ids_;
+    // Monotonic AICPU dlopen counter (first-sighting bind only; never decremented).
+    size_t aicpu_dlopen_total_{0};
+    // Monotonic host-side dlopen counter for hbg variants.
+    size_t host_dlopen_total_{0};
+    // Sticky flag: prepare_callable was called at least once. Lets finalize()
+    // distinguish legacy-path leaks from prepared-path kernels that legitimately
+    // live until finalize.
+    bool prepared_callable_path_used_{false};
+
     // Performance profiling
     L2PerfCollector l2_perf_collector_;
diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index fa151b1ab..e3d8660be 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -21,6 +21,8 @@
 #include "task_args.h"
 
 #include
+
+#include <memory>
+
 #include
 
 #include "common/unified_log.h"
@@ -39,7 +41,8 @@ extern "C" {
 /* ===========================================================================
 * Runtime Implementation Functions (defined in runtime_maker.cpp)
 * =========================================================================== */
-int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args);
+int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable);
+int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args);
 int validate_runtime_impl(Runtime *runtime);
 
 /* ===========================================================================
@@ -162,71 +165,6 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de
     }
 }
 
-int run_runtime(
-    DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim,
-    int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary,
-    size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix
-) {
-    if (ctx == NULL || runtime == NULL) return -1;
-    if (aicpu_binary == NULL || aicpu_size == 0 || aicore_binary == NULL || aicore_size == 0) return -1;
-
-    DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
-
-    pthread_once(&g_runner_key_once, create_runner_key);
-    pthread_setspecific(g_runner_key, ctx);
-    auto tsd_guard = RAIIScopeGuard([]() {
-        pthread_setspecific(g_runner_key, nullptr);
-    });
-
-    try {
-        int rc = runner->prepare_run_context(device_id);
-        if (rc != 0) return rc;
-        auto run_context_guard = RAIIScopeGuard([runner]() {
-            runner->release_run_context();
-        });
-
-        Runtime *r = new (runtime) Runtime();
-        r->host_api.device_malloc = device_malloc;
-        r->host_api.device_free = device_free;
-        r->host_api.copy_to_device = copy_to_device;
-        r->host_api.copy_from_device = copy_from_device;
-        r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
-        r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;
-
-        LOG_DEBUG("About to call init_runtime_impl, r=%p", (void *)r);
-        rc = init_runtime_impl(
-            r, reinterpret_cast<const ChipCallable *>(callable), reinterpret_cast<const ChipStorageTaskArgs *>(args)
-        );
-        LOG_DEBUG("init_runtime_impl returned: %d", rc);
-        if (rc != 0) {
-            r->set_gm_sm_ptr(nullptr);
-            validate_runtime_impl(r);
-            r->~Runtime();
-            return rc;
-        }
-
-        runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0);
-        runner->set_dump_tensor_enabled(enable_dump_tensor != 0);
-        runner->set_pmu_enabled(enable_pmu);
-        runner->set_output_prefix(output_prefix);
-
-        std::vector<uint8_t> aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size);
-        std::vector<uint8_t> aicore_vec(aicore_binary, aicore_binary + aicore_size);
-        rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num);
-        if (rc != 0) {
-            validate_runtime_impl(r);
-            r->~Runtime();
-            return rc;
-        }
-
-        rc = validate_runtime_impl(r);
-        r->~Runtime();
-        return rc;
-    } catch (...) {
{
-        return -1;
-    }
-}
-
 int finalize_device(DeviceContextHandle ctx) {
     if (ctx == NULL) return -1;
     try {
@@ -330,5 +268,181 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) {
     runner->set_log_level(log_level);
     runner->set_log_info_v(log_info_v);
 }
+/* ===========================================================================
+ * Per-callable_id preparation
+ * =========================================================================== */
+
+int prepare_callable(
+    DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary,
+    size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size
+) {
+    if (ctx == NULL || callable == NULL) return -1;
+    DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
+
+    // AICPU/AICore executor binaries are only consumed by run()/run_prepared();
+    // prepare_callable just uploads kernel + orch SO state.
+    (void)aicpu_binary;
+    (void)aicpu_size;
+    (void)aicore_binary;
+    (void)aicore_size;
+
+    pthread_once(&g_runner_key_once, create_runner_key);
+    pthread_setspecific(g_runner_key, ctx);
+    auto tsd_guard = RAIIScopeGuard([]() {
+        pthread_setspecific(g_runner_key, nullptr);
+    });
+
+    try {
+        int rc = runner->prepare_run_context(device_id);
+        if (rc != 0) return rc;
+        auto run_context_guard = RAIIScopeGuard([runner]() {
+            runner->release_run_context();
+        });
+
+        // Heap-allocate: hbg's Runtime carries 131072 Tasks → tens of MB,
+        // larger than the default thread stack.
+        std::unique_ptr<Runtime> r_owner = std::make_unique<Runtime>();
+        Runtime *r = r_owner.get();
+        r->host_api.device_malloc = device_malloc;
+        r->host_api.device_free = device_free;
+        r->host_api.copy_to_device = copy_to_device;
+        r->host_api.copy_from_device = copy_from_device;
+        r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
+        r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;
+
+        rc = prepare_callable_impl(r, reinterpret_cast<const ChipCallable *>(callable));
+        if (rc != 0) {
+            return rc;
+        }
+
+        // Extract kernel func_id ↔ dev_addr pairs uploaded by prepare_callable_impl.
+        std::vector<std::pair<int, uint64_t>> kernel_addrs;
+        int kcount = r->get_registered_kernel_count();
+        kernel_addrs.reserve(kcount);
+        for (int i = 0; i < kcount; i++) {
+            int fid = r->get_registered_kernel_func_id(i);
+            kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid));
+        }
+        // Clear registered kernels so the Runtime destructor (or any accidental
+        // validate call) does NOT free the kernel binaries we just uploaded —
+        // they belong to the prepared state now.
+        r->clear_registered_kernels();
+
+        if (r->pending_host_dlopen_handle_ != nullptr) {
+            rc = runner->register_prepared_callable_host_orch(
+                callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs)
+            );
+            r->pending_host_dlopen_handle_ = nullptr;
+            r->pending_host_orch_func_ptr_ = nullptr;
+        } else {
+            rc = runner->register_prepared_callable(
+                callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(),
+                r->get_device_orch_config_name(), std::move(kernel_addrs)
+            );
+        }
+        return rc;
+    } catch (...)
{ + return -1; + } +} + +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix +) { + if (ctx == NULL || runtime == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + if (!runner->has_prepared_callable(callable_id)) { + LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id); + return -1; + } + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + auto tsd_guard = RAIIScopeGuard([]() { + pthread_setspecific(g_runner_key, nullptr); + }); + + try { + int rc = runner->prepare_run_context(device_id); + if (rc != 0) return rc; + auto run_context_guard = RAIIScopeGuard([runner]() { + runner->release_run_context(); + }); + + Runtime *r = new (runtime) Runtime(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + // Restore kernel addrs + orch symbol names + active_callable_id + rc = runner->bind_prepared_callable_to_runtime(*r, callable_id); + if (rc != 0) { + r->~Runtime(); + return rc; + } + + // Per-run binding (tensor args, GM heap, SM alloc) + rc = bind_prepared_to_runtime_impl(r, reinterpret_cast(args)); + if (rc != 0) { + r->set_gm_sm_ptr(nullptr); + validate_runtime_impl(r); + r->~Runtime(); + return rc; + } + + runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); + runner->set_dump_tensor_enabled(enable_dump_tensor != 0); + runner->set_pmu_enabled(enable_pmu); + runner->set_output_prefix(output_prefix); + + std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); + std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + if (rc != 0) { + validate_runtime_impl(r); + r->~Runtime(); + return rc; + } + + rc = validate_runtime_impl(r); + r->~Runtime(); + return rc; + } catch (...) { + return -1; + } +} + +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { + if (ctx == NULL) return -1; + try { + return static_cast(ctx)->unregister_prepared_callable(callable_id); + } catch (...) { + return -1; + } +} + +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; + } +} + +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->aicpu_dlopen_count(); + } catch (...) { + return 0; + } +} } // extern "C" diff --git a/src/a5/platform/sim/aicpu/orch_so_file.cpp b/src/a5/platform/sim/aicpu/orch_so_file.cpp index 4da92d7de..114fe4826 100644 --- a/src/a5/platform/sim/aicpu/orch_so_file.cpp +++ b/src/a5/platform/sim/aicpu/orch_so_file.cpp @@ -24,10 +24,17 @@ #include -int32_t create_orch_so_file(const char *dir, char *out_path, size_t out_path_size) { +int32_t create_orch_so_file(const char *dir, int32_t callable_id, char *out_path, size_t out_path_size) { // mkstemps: multiple sim workers can share a process, so names must be // unique per call. 
The "XXXXXX" template is replaced in-place. - int32_t written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir); + // callable_id is embedded purely for log readability (mkstemps already + // guarantees uniqueness regardless). + int32_t written; + if (callable_id >= 0) { + written = snprintf(out_path, out_path_size, "%s/libdevice_orch_cid%d_XXXXXX.so", dir, callable_id); + } else { + written = snprintf(out_path, out_path_size, "%s/libdevice_orch_XXXXXX.so", dir); + } if (written < 0 || static_cast(written) >= out_path_size) { return -1; } diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 015419665..ea325c7f9 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -36,6 +36,7 @@ #include "aicpu/platform_aicpu_affinity.h" #include "callable.h" +#include "callable_protocol.h" #include "utils/elf_build_id.h" #include "cpu_sim_context.h" #include "host/raii_scope_guard.h" @@ -653,13 +654,45 @@ void DeviceRunner::unload_executor_binaries() { } int DeviceRunner::prepare_orch_so(Runtime &runtime) { + // Per-callable_id path: mirror onboard. Bytes were staged at + // register_prepared_callable time; here we only stamp metadata onto + // the runtime and resolve `register_new_callable_id_` from first sighting. + const int32_t cid = runtime.get_active_callable_id(); + if (cid >= 0) { + auto it = prepared_callables_.find(cid); + if (it == prepared_callables_.end()) { + LOG_ERROR("prepare_orch_so: callable_id=%d not registered", cid); + return -1; + } + const auto &state = it->second; + // hbg variant: orch SO never crosses host/device boundary. + if (state.host_dlopen_handle != nullptr) { + runtime.set_dev_orch_so(0, 0); + runtime.set_active_callable_id(cid, /*is_new=*/false); + return 0; + } + const bool first_sighting = aicpu_seen_callable_ids_.insert(cid).second; + if (first_sighting) { + ++aicpu_dlopen_total_; + } + runtime.set_dev_orch_so(state.dev_orch_so_addr, state.dev_orch_so_size); + runtime.set_active_callable_id(cid, first_sighting); + runtime.pending_orch_so_data_ = nullptr; + runtime.pending_orch_so_size_ = 0; + LOG_INFO_V0( + "Orch SO prepared cid=%d hash=0x%lx %zu bytes (is_new=%d)", cid, state.hash, state.dev_orch_so_size, + first_sighting ? 
1 : 0
+        );
+        return 0;
+    }
+
     const void *host_so_data = runtime.pending_orch_so_data_;
     const size_t host_so_size = runtime.pending_orch_so_size_;
     runtime.pending_orch_so_data_ = nullptr;
     runtime.pending_orch_so_size_ = 0;

     if (host_so_data == nullptr || host_so_size == 0) {
-        runtime.set_dev_orch_so(0, 0, false);
+        runtime.set_dev_orch_so(0, 0);
         return 0;
     }
@@ -667,7 +700,7 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
     if (new_hash == cached_orch_so_hash_ && dev_orch_so_buffer_ != nullptr) {
         LOG_INFO_V0("Orch SO cache hit (hash=0x%lx, %zu bytes)", new_hash, host_so_size);
-        runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/false);
+        runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size);
         return 0;
     }
@@ -695,11 +728,152 @@ int DeviceRunner::prepare_orch_so(Runtime &runtime) {
     std::memcpy(dev_orch_so_buffer_, host_orch_so_copy_.data(), host_so_size);
     cached_orch_so_hash_ = new_hash;
-    runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size, /*is_new=*/true);
+    runtime.set_dev_orch_so(reinterpret_cast<uint64_t>(dev_orch_so_buffer_), host_so_size);
     LOG_INFO_V0("Orch SO cache miss (hash=0x%lx, %zu bytes uploaded)", new_hash, host_so_size);
     return 0;
 }

+int DeviceRunner::register_prepared_callable(
+    int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, const char *config_name,
+    std::vector<std::pair<int, uint64_t>> kernel_addrs
+) {
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable: callable_id=%d out of range [0, %d)", callable_id, MAX_REGISTERED_CALLABLE_IDS
+        );
+        return -1;
+    }
+    if (orch_so_data == nullptr || orch_so_size == 0) {
+        LOG_ERROR("register_prepared_callable: empty orch SO for callable_id=%d", callable_id);
+        return -1;
+    }
+    if (prepared_callables_.count(callable_id) != 0) {
+        LOG_ERROR("register_prepared_callable: callable_id=%d already registered", callable_id);
+        return -1;
+    }
+
+    const uint64_t hash = simpler::common::utils::elf_build_id_64(orch_so_data, orch_so_size);
+
+    auto buf_it = orch_so_dedup_.find(hash);
+    uint64_t dev_addr = 0;
+    if (buf_it == orch_so_dedup_.end()) {
+        void *buf = mem_alloc_.alloc(orch_so_size);
+        if (buf == nullptr) {
+            LOG_ERROR("register_prepared_callable: alloc %zu bytes failed", orch_so_size);
+            return -1;
+        }
+        // Sim shares an address space with the simulated AICPU thread, so a
+        // plain memcpy is the moral equivalent of rtMemcpy on hardware.
+        std::memcpy(buf, orch_so_data, orch_so_size);
+        OrchSoBuffer entry;
+        entry.dev_addr = buf;
+        entry.capacity = orch_so_size;
+        entry.refcount = 1;
+        orch_so_dedup_.emplace(hash, entry);
+        dev_addr = reinterpret_cast<uint64_t>(buf);
+        LOG_INFO_V0("register_prepared_callable: hash=0x%lx new buffer %zu bytes", hash, orch_so_size);
+    } else {
+        buf_it->second.refcount++;
+        dev_addr = reinterpret_cast<uint64_t>(buf_it->second.dev_addr);
+        LOG_INFO_V0(
+            "register_prepared_callable: hash=0x%lx shared buffer (refcount=%d)", hash, buf_it->second.refcount
+        );
+    }
+
+    PreparedCallableState state;
+    state.hash = hash;
+    state.dev_orch_so_addr = dev_addr;
+    state.dev_orch_so_size = orch_so_size;
+    state.func_name = (func_name != nullptr) ? func_name : "";
+    state.config_name = (config_name != nullptr) ? config_name : "";
+    state.kernel_addrs = std::move(kernel_addrs);
+    prepared_callables_.emplace(callable_id, std::move(state));
+    prepared_callable_path_used_ = true;
+    return 0;
+}
+
+int DeviceRunner::register_prepared_callable_host_orch(
+    int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr,
+    std::vector<std::pair<int, uint64_t>> kernel_addrs
+) {
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "register_prepared_callable_host_orch: callable_id=%d out of range [0, %d)", callable_id,
+            MAX_REGISTERED_CALLABLE_IDS
+        );
+        return -1;
+    }
+    if (host_dlopen_handle == nullptr || host_orch_func_ptr == nullptr) {
+        LOG_ERROR("register_prepared_callable_host_orch: null handle/fn for callable_id=%d", callable_id);
+        return -1;
+    }
+    if (prepared_callables_.count(callable_id) != 0) {
+        LOG_ERROR("register_prepared_callable_host_orch: callable_id=%d already registered", callable_id);
+        return -1;
+    }
+
+    PreparedCallableState state;
+    state.host_dlopen_handle = host_dlopen_handle;
+    state.host_orch_func_ptr = host_orch_func_ptr;
+    state.kernel_addrs = std::move(kernel_addrs);
+    prepared_callables_.emplace(callable_id, std::move(state));
+    prepared_callable_path_used_ = true;
+    ++host_dlopen_total_;
+    LOG_INFO_V0("register_prepared_callable_host_orch: cid=%d (host dlopen #%zu)", callable_id, host_dlopen_total_);
+    return 0;
+}
+
+int DeviceRunner::unregister_prepared_callable(int32_t callable_id) {
+    auto it = prepared_callables_.find(callable_id);
+    if (it == prepared_callables_.end()) {
+        return 0;
+    }
+    PreparedCallableState state = std::move(it->second);
+    prepared_callables_.erase(it);
+    aicpu_seen_callable_ids_.erase(callable_id);
+
+    if (state.host_dlopen_handle != nullptr) {
+        // hbg path: dlclose host handle; no orch SO refcount.
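+        // (hbg: just dlclose, handled here. trb: refcounted below; e.g. with
+        //  hypothetical cids 7 and 9 registered from identical SO bytes,
+        //  unregister(7) drops the refcount 2 -> 1 and keeps the buffer,
+        //  unregister(9) drops it 1 -> 0 and frees it.)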
+ dlclose(state.host_dlopen_handle); + return 0; + } + + auto buf_it = orch_so_dedup_.find(state.hash); + if (buf_it != orch_so_dedup_.end()) { + if (--buf_it->second.refcount <= 0) { + mem_alloc_.free(buf_it->second.dev_addr); + orch_so_dedup_.erase(buf_it); + } + } + return 0; +} + +bool DeviceRunner::has_prepared_callable(int32_t callable_id) const { + return prepared_callables_.count(callable_id) != 0; +} + +int DeviceRunner::bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id) { + auto it = prepared_callables_.find(callable_id); + if (it == prepared_callables_.end()) { + LOG_ERROR("bind_prepared_callable_to_runtime: callable_id=%d not registered", callable_id); + return -1; + } + const auto &state = it->second; + for (const auto &kv : state.kernel_addrs) { + if (kv.first < 0 || kv.first >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("bind_prepared_callable_to_runtime: func_id=%d out of range", kv.first); + return -1; + } + runtime.replay_function_bin_addr(kv.first, kv.second); + } + runtime.pending_host_dlopen_handle_ = state.host_dlopen_handle; + runtime.pending_host_orch_func_ptr_ = state.host_orch_func_ptr; + runtime.set_device_orch_func_name(state.func_name.c_str()); + runtime.set_device_orch_config_name(state.config_name.c_str()); + runtime.set_active_callable_id(callable_id, /*is_new=*/false); + return 0; +} + int DeviceRunner::finalize() { // Skip if already finalized if (device_id_ == -1 && aicpu_so_handle_ == nullptr && aicore_so_handle_ == nullptr) { @@ -736,15 +910,22 @@ int DeviceRunner::finalize() { pmu_collector_.finalize(nullptr, free_cb, nullptr); } - // Kernel binaries should have been removed by validate_runtime_impl() + // Kernel binaries are normally released by validate_runtime_impl on the + // legacy run() path. The prepared-callable path intentionally leaves + // them resident across runs and relies on finalize() to reclaim them; + // that is not a leak. if (!func_id_to_addr_.empty()) { - LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); - // Cleanup leaked handles and host copies + const bool prepared_path_used = prepared_callable_path_used_; + if (prepared_path_used) { + LOG_DEBUG("finalize() releasing %zu kernel binaries staged by prepare_callable", func_id_to_addr_.size()); + } else { + LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); + } for (auto &pair : func_id_to_addr_) { MappedKernel &kernel = pair.second; if (kernel.dl_handle != nullptr) { dlclose(kernel.dl_handle); - LOG_DEBUG("Closed leaked kernel: func_id=%d", pair.first); + LOG_DEBUG("Closed kernel: func_id=%d", pair.first); } delete[] kernel.callable_buf; } @@ -761,6 +942,27 @@ int DeviceRunner::finalize() { host_orch_so_copy_.clear(); host_orch_so_copy_.shrink_to_fit(); + // Release any prepared-callable orch SO buffers callers forgot to drop. + for (auto &kv : orch_so_dedup_) { + if (kv.second.dev_addr != nullptr) { + mem_alloc_.free(kv.second.dev_addr); + } + } + orch_so_dedup_.clear(); + // hbg path: dlclose any host orch handles callers forgot to unregister. + // finalize() is the last chance; Worker.close() does not auto-unregister + // each callable_id, so without this loop the host process leaks one + // dlopen handle per (re)created Worker — observable in long-running + // pytest sessions. 
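+    // Shape of the leak this loop prevents (sketch): every ChipWorker
+    // lifecycle that registers an hbg cid and reaches finalize() without an
+    // unregister_callable() parks one live dlopen handle here; N recreated
+    // Workers in one pytest process would strand N handles.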
+ for (auto &kv : prepared_callables_) { + if (kv.second.host_dlopen_handle != nullptr) { + dlclose(kv.second.host_dlopen_handle); + } + } + prepared_callables_.clear(); + aicpu_seen_callable_ids_.clear(); + aicpu_dlopen_total_ = 0; + // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); @@ -786,11 +988,25 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data return 0; } - // Return cached callable address if already uploaded + // Return cached callable address if already uploaded *and* the new bytes + // match. With the prepared-callable path, multiple ChipCallables share a + // single ChipWorker (and hence DeviceRunner) and can pick distinct + // kernel binaries for the same func_id. Naively reusing the cached + // entry hands the AICore the previous callable's kernel and segfaults + // at dispatch. auto it = func_id_to_addr_.find(func_id); if (it != func_id_to_addr_.end()) { - LOG_INFO_V0("Kernel func_id=%d already uploaded, returning cached address", func_id); - return reinterpret_cast(it->second.callable_buf); + const auto &cached_callable = *reinterpret_cast(it->second.callable_buf); + const auto *new_callable = reinterpret_cast(bin_data); + if (cached_callable.binary_size() == new_callable->binary_size() && + std::memcmp(cached_callable.binary_data(), new_callable->binary_data(), new_callable->binary_size()) == 0) { + LOG_INFO_V0("Kernel func_id=%d already uploaded (matching bytes), returning cached address", func_id); + return reinterpret_cast(it->second.callable_buf); + } + LOG_INFO_V0("Kernel func_id=%d binary changed, evicting cached entry", func_id); + if (it->second.dl_handle != nullptr) dlclose(it->second.dl_handle); + delete[] it->second.callable_buf; + func_id_to_addr_.erase(it); } // Extract binary from CoreCallable envelope diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 636149f18..a153a18a1 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include "common/core_type.h" @@ -208,6 +210,36 @@ class DeviceRunner { */ void remove_kernel_binary(int func_id); + /** + * Stage a per-callable_id orchestration SO and its supporting metadata. + * See a5 onboard or a2a3 device_runner.h for full contract. + */ + int register_prepared_callable( + int32_t callable_id, const void *orch_so_data, size_t orch_so_size, const char *func_name, + const char *config_name, std::vector> kernel_addrs + ); + + /** Host-orchestration sibling for hbg variants. See a2a3 onboard. */ + int register_prepared_callable_host_orch( + int32_t callable_id, void *host_dlopen_handle, void *host_orch_func_ptr, + std::vector> kernel_addrs + ); + + /** Drop prepared state for `callable_id`; trb refcounts SO, hbg dlcloses handle. */ + int unregister_prepared_callable(int32_t callable_id); + + /** True iff `callable_id` has prepared state staged. */ + bool has_prepared_callable(int32_t callable_id) const; + + /** Replay prepared state onto a freshly-constructed Runtime. */ + int bind_prepared_callable_to_runtime(Runtime &runtime, int32_t callable_id); + + /** Monotonic AICPU dlopen counter (first-sighting only; never decremented). */ + size_t aicpu_dlopen_count() const { return aicpu_dlopen_total_; } + + /** Monotonic host-side dlopen counter for hbg variants. 
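+   * Usage sketch (illustrative): after prepare_callable() of one hbg cid
+   * followed by N run_prepared() calls, this still reads 1; registration
+   * dlopens, runs replay the cached handle.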
*/ + size_t host_dlopen_count() const { return host_dlopen_total_; } + private: // Configuration int device_id_{-1}; @@ -230,6 +262,32 @@ class DeviceRunner { size_t dev_orch_so_capacity_{0}; std::vector host_orch_so_copy_; + // Per-callable_id prepared state. Mirrors onboard. + struct PreparedCallableState { + // trb path + uint64_t hash{0}; + uint64_t dev_orch_so_addr{0}; + size_t dev_orch_so_size{0}; + std::string func_name; + std::string config_name; + // common + std::vector> kernel_addrs; + // hbg path + void *host_dlopen_handle{nullptr}; + void *host_orch_func_ptr{nullptr}; + }; + struct OrchSoBuffer { + void *dev_addr{nullptr}; + size_t capacity{0}; + int refcount{0}; + }; + std::unordered_map prepared_callables_; + std::unordered_map orch_so_dedup_; + std::unordered_set aicpu_seen_callable_ids_; + size_t aicpu_dlopen_total_{0}; + size_t host_dlopen_total_{0}; + bool prepared_callable_path_used_{false}; + // Runtime pointer for print_handshake_results Runtime *last_runtime_{nullptr}; diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index cd16e3734..db05b3ac1 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -22,6 +22,8 @@ #include #include + +#include #include #include "common/unified_log.h" @@ -35,7 +37,8 @@ extern "C" { /* =========================================================================== * Runtime Implementation Functions (defined in runtime_maker.cpp) * =========================================================================== */ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); +int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable); +int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); /* =========================================================================== @@ -156,19 +159,146 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, +int finalize_device(DeviceContextHandle ctx) { + if (ctx == NULL) return -1; + try { + int rc = static_cast(ctx)->finalize(); + int dev = pto_cpu_sim_get_bound_device(); + if (dev >= 0) { + pto_cpu_sim_release_device(dev); + } + return rc; + } catch (...) { + return -1; + } +} + +/* =========================================================================== + * ACL lifecycle stubs. Sim has no ACL / aclrtStream concept, so these no-op + * to satisfy the uniform host_runtime.so ABI that ChipWorker dlsym's. The + * real comm_* entry points come from src/common/platform_comm/comm_sim.cpp, + * which is compiled into this runtime via CMakeLists. 
+ * =========================================================================== */ + +int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) { + (void)ctx; + (void)device_id; + return 0; +} + +void *create_comm_stream_ctx(DeviceContextHandle ctx) { + (void)ctx; + return NULL; +} + +int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { + (void)ctx; + (void)stream; + return 0; +} + +/* =========================================================================== + * Internal helpers called from runtime_maker.cpp via Runtime.host_api + * =========================================================================== */ + +void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { + if (runtime == NULL) return; + Runtime *r = static_cast(runtime); + r->record_tensor_pair(host_ptr, dev_ptr, size); +} + +void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { + if (ctx == NULL) return; + // No CANN dlog on sim; only HostLogger + runner state. + HostLogger::get_instance().set_level(static_cast(log_level)); + HostLogger::get_instance().set_info_v(log_info_v); + DeviceRunner *runner = static_cast(ctx); + runner->set_log_level(log_level); + runner->set_log_info_v(log_info_v); +} +/* =========================================================================== + * Per-callable_id preparation + * =========================================================================== */ + +int prepare_callable( + DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, + size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size +) { + if (ctx == NULL || callable == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + (void)aicpu_binary; + (void)aicpu_size; + (void)aicore_binary; + (void)aicore_size; + (void)device_id; + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + + try { + // Heap-allocate: hbg's Runtime carries 131072 Tasks → tens of MB. + std::unique_ptr r_owner = std::make_unique(); + Runtime *r = r_owner.get(); + r->host_api.device_malloc = device_malloc; + r->host_api.device_free = device_free; + r->host_api.copy_to_device = copy_to_device; + r->host_api.copy_from_device = copy_from_device; + r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; + + int rc = prepare_callable_impl(r, reinterpret_cast(callable)); + if (rc != 0) { + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + std::vector> kernel_addrs; + int kcount = r->get_registered_kernel_count(); + kernel_addrs.reserve(kcount); + for (int i = 0; i < kcount; i++) { + int fid = r->get_registered_kernel_func_id(i); + kernel_addrs.emplace_back(fid, r->get_function_bin_addr(fid)); + } + r->clear_registered_kernels(); + + if (r->pending_host_dlopen_handle_ != nullptr) { + rc = runner->register_prepared_callable_host_orch( + callable_id, r->pending_host_dlopen_handle_, r->pending_host_orch_func_ptr_, std::move(kernel_addrs) + ); + r->pending_host_dlopen_handle_ = nullptr; + r->pending_host_orch_func_ptr_ = nullptr; + } else { + rc = runner->register_prepared_callable( + callable_id, r->pending_orch_so_data_, r->pending_orch_so_size_, r->get_device_orch_func_name(), + r->get_device_orch_config_name(), std::move(kernel_addrs) + ); + } + pthread_setspecific(g_runner_key, nullptr); + return rc; + } catch (...) 
{ + pthread_setspecific(g_runner_key, nullptr); + return -1; + } +} + +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix ) { if (ctx == NULL || runtime == NULL) return -1; + DeviceRunner *runner = static_cast(ctx); + + if (!runner->has_prepared_callable(callable_id)) { + LOG_ERROR("run_prepared: callable_id=%d not prepared", callable_id); + return -1; + } pthread_once(&g_runner_key_once, create_runner_key); pthread_setspecific(g_runner_key, ctx); - DeviceRunner *runner = static_cast(ctx); try { - // Phase 1: placement new + build graph Runtime *r = new (runtime) Runtime(); r->host_api.device_malloc = device_malloc; r->host_api.device_free = device_free; @@ -177,9 +307,14 @@ int run_runtime( r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; - int rc = init_runtime_impl( - r, reinterpret_cast(callable), reinterpret_cast(args) - ); + int rc = runner->bind_prepared_callable_to_runtime(*r, callable_id); + if (rc != 0) { + r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); + return rc; + } + + rc = bind_prepared_to_runtime_impl(r, reinterpret_cast(args)); if (rc != 0) { r->set_gm_sm_ptr(nullptr); validate_runtime_impl(r); @@ -188,15 +323,11 @@ int run_runtime( return rc; } - // Phase 2: publish diagnostics enablement to the DeviceRunner so run() - // and its helpers can read the three sub-features uniformly (via - // members, not Runtime / run() args). runner->set_l2_swimlane_enabled(enable_l2_swimlane != 0); runner->set_dump_tensor_enabled(enable_dump_tensor != 0); runner->set_pmu_enabled(enable_pmu); runner->set_output_prefix(output_prefix); - // Phase 3: launch std::vector aicpu_vec; std::vector aicore_vec; if (aicpu_binary != NULL && aicpu_size > 0) { @@ -213,7 +344,6 @@ int run_runtime( return rc; } - // Phase 4: finalize (copy results back) rc = validate_runtime_impl(r); r->~Runtime(); pthread_setspecific(g_runner_key, nullptr); @@ -224,62 +354,31 @@ int run_runtime( } } -int finalize_device(DeviceContextHandle ctx) { +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id) { if (ctx == NULL) return -1; try { - int rc = static_cast(ctx)->finalize(); - int dev = pto_cpu_sim_get_bound_device(); - if (dev >= 0) { - pto_cpu_sim_release_device(dev); - } - return rc; + return static_cast(ctx)->unregister_prepared_callable(callable_id); } catch (...) { return -1; } } -/* =========================================================================== - * ACL lifecycle stubs. Sim has no ACL / aclrtStream concept, so these no-op - * to satisfy the uniform host_runtime.so ABI that ChipWorker dlsym's. The - * real comm_* entry points come from src/common/platform_comm/comm_sim.cpp, - * which is compiled into this runtime via CMakeLists. 
- * =========================================================================== */ - -int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) { - (void)ctx; - (void)device_id; - return 0; -} - -void *create_comm_stream_ctx(DeviceContextHandle ctx) { - (void)ctx; - return NULL; -} - -int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { - (void)ctx; - (void)stream; - return 0; -} - -/* =========================================================================== - * Internal helpers called from runtime_maker.cpp via Runtime.host_api - * =========================================================================== */ - -void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, size_t size) { - if (runtime == NULL) return; - Runtime *r = static_cast(runtime); - r->record_tensor_pair(host_ptr, dev_ptr, size); +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->aicpu_dlopen_count(); + } catch (...) { + return 0; + } } -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { - if (ctx == NULL) return; - // No CANN dlog on sim; only HostLogger + runner state. - HostLogger::get_instance().set_level(static_cast(log_level)); - HostLogger::get_instance().set_info_v(log_info_v); - DeviceRunner *runner = static_cast(ctx); - runner->set_log_level(log_level); - runner->set_log_info_v(log_info_v); +size_t get_host_dlopen_count(DeviceContextHandle ctx) { + if (ctx == NULL) return 0; + try { + return static_cast(ctx)->host_dlopen_count(); + } catch (...) { + return 0; + } } } // extern "C" diff --git a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp index cf6618170..390ad3d19 100644 --- a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp @@ -276,31 +276,27 @@ extern "C" { #endif /** - * Initialize a pre-allocated runtime with dynamic orchestration. - * - * This function loads the orchestration SO from binary data via a temp file, - * resolves the orchestration function via dlsym, then calls it to build the - * task graph. The orchestration function is responsible for: - * - Allocating device memory via device_malloc() - * - Copying data to device via copy_to_device() - * - Building the task graph - * - Recording tensor pairs via record_tensor_pair() - * - * @param runtime Pointer to pre-constructed Runtime - * @param callable ChipCallable containing orch binary, func_name, and child kernels - * @param orch_args Separated tensor/scalar arguments - * @return 0 on success, -1 on failure + * Stage the per-callable resources for the host_build_graph variant: upload + * kernel binaries and dlopen the orchestration SO on the host. The dlopen + * handle and resolved entry-symbol pointer are parked on the runtime via + * `pending_host_dlopen_handle_` / `pending_host_orch_func_ptr_` so the + * platform layer can hoist them into PreparedCallableState. Splitting this + * out of init_runtime_impl is what the hbg prepare_callable / run_prepared + * path rests on — the dlopen runs once per cid instead of every run. 
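+ * Illustrative call order under the split (both entry points are defined
+ * below in this file; a sketch, not a verbatim caller):
+ *   prepare_callable_impl(rt, callable);      // once per callable_id: dlopen + dlsym
+ *   bind_prepared_to_runtime_impl(rt, args);  // every run: invoke orch_func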
*/ -int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - // Validate inputs +int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) { if (runtime == nullptr) { LOG_ERROR("Runtime pointer is null"); return -1; } + if (callable == nullptr) { + LOG_ERROR("Callable pointer is null"); + return -1; + } // Register kernel binaries from ChipCallable children if (callable->child_count() > 0) { - LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count()); + LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count()); for (int32_t i = 0; i < callable->child_count(); i++) { int func_id = callable->child_func_id(i); if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { @@ -329,7 +325,9 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip return -1; } - // Load orchestration SO from binary data via temp file + // Load orchestration SO from binary data via temp file. Held open across + // the lifetime of the prepared callable; closed by + // DeviceRunner::unregister_prepared_callable. std::string fd_path; if (!create_temp_so_file(orch_so_binary, orch_so_size, &fd_path)) { LOG_ERROR("Failed to create temp SO file"); @@ -343,7 +341,7 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip return -1; } - dlerror(); // Clear any existing error + dlerror(); OrchestrationFunc orch_func = reinterpret_cast(dlsym(handle, orch_func_name)); const char *dlsym_error = dlerror(); if (dlsym_error != nullptr) { @@ -354,11 +352,42 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip LOG_INFO_V0("Loaded orchestration function: %s", orch_func_name); - // Clear any previous tensor pairs + runtime->pending_host_dlopen_handle_ = handle; + runtime->pending_host_orch_func_ptr_ = reinterpret_cast(orch_func); + // hbg never uploads orch SO bytes to the device; clear the trb staging + // fields so DeviceRunner::register_prepared_callable cannot mistake this + // for a trb-shaped registration. + runtime->pending_orch_so_data_ = nullptr; + runtime->pending_orch_so_size_ = 0; + return 0; +} + +/** + * Per-run binding for hbg: invoke the previously-resolved orchestration entry + * point against the supplied args, then upload tensor info / allocation + * storage. Assumes prepare_callable_impl populated + * `pending_host_orch_func_ptr_` (either freshly during prepare_callable, or + * via DeviceRunner::bind_prepared_callable_to_runtime when run_prepared + * replays a prepared cid onto a fresh Runtime). 
+ */ +int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + if (orch_args == nullptr) { + LOG_ERROR("orch_args pointer is null"); + return -1; + } + OrchestrationFunc orch_func = reinterpret_cast(runtime->pending_host_orch_func_ptr_); + if (orch_func == nullptr) { + LOG_ERROR("bind_prepared_to_runtime_impl: host orch_func pointer is null"); + return -1; + } + runtime->clear_tensor_pairs(); LOG_INFO_V0("=== Calling Orchestration Function ==="); - LOG_DEBUG( "Args count: %d (%d tensors + %d scalars)", orch_args->tensor_count() + orch_args->scalar_count(), orch_args->tensor_count(), orch_args->scalar_count() @@ -370,13 +399,10 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip &k_orchestration_runtime_ops, runtime, &tensor_info_builder, &tensor_allocation_builder }; - // Call orchestration function to build task graph - // The orchestration function handles device memory allocation and copy-to-device int rc = orch_func(reinterpret_cast(&orchestration_runtime), *orch_args); if (rc != 0) { LOG_ERROR("Orchestration function failed with code %d", rc); runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } @@ -384,7 +410,6 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip if (rc != 0) { LOG_ERROR("Failed to upload tensor allocations: %d", rc); runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } @@ -396,16 +421,10 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip runtime->clear_tensor_allocation_storage(); } runtime->clear_tensor_pairs(); - dlclose(handle); return rc; } LOG_INFO_V0("Runtime initialized. Ready for execution from Python."); - - // Host orchestration is complete once orch_func returns. The task graph now - // lives in Runtime, so the orchestration SO can be closed immediately. - dlclose(handle); - return 0; } diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index 607783733..73e201494 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -448,6 +448,16 @@ class Runtime { */ void set_function_bin_addr(int func_id, uint64_t addr); + /** + * Replay a previously-uploaded kernel address onto a fresh Runtime + * without recording it in registered_kernel_func_ids_. See a2a3 hbg + * runtime.h for the full contract. + */ + void replay_function_bin_addr(int func_id, uint64_t addr) { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return; + func_id_to_addr_[func_id] = addr; + } + int get_registered_kernel_count() const { return registered_kernel_count_; } int get_registered_kernel_func_id(int index) const { @@ -468,15 +478,56 @@ class Runtime { // Device orchestration SO metadata (see a2a3 host_build_graph runtime.h). uint64_t dev_orch_so_addr_{0}; uint64_t dev_orch_so_size_{0}; - bool has_new_orch_so_{false}; + // Per-callable_id dispatch. hbg orch runs on host, so AICPU never reads + // `active_callable_id_`; the field exists for parity with the + // shared platform layer (DeviceRunner stamps it on every run). + int32_t active_callable_id_{-1}; + bool register_new_callable_id_{false}; const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; - void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { + // Host-orchestration staging (hbg path). 
prepare_callable_impl + // dlopens the orch SO on the host and parks the handle + entry-symbol + // pointer here so DeviceRunner::register_prepared_callable_host_orch can + // claim them; bind_prepared_callable_to_runtime restores them onto a fresh + // Runtime so bind_prepared_to_runtime_impl can call orch_func without a + // second dlopen. + void *pending_host_dlopen_handle_{nullptr}; + void *pending_host_orch_func_ptr_{nullptr}; + + // Device-orchestration entry/config symbol names (trb path). Always + // empty on this hbg variant — included for API parity so the shared + // platform layer can call set_device_orch_func_name unconditionally. + char device_orch_func_name_[64]{}; + char device_orch_config_name_[64]{}; + + void set_device_orch_func_name(const char *name) { + device_orch_func_name_[0] = '\0'; + if (name) { + strncpy(device_orch_func_name_, name, sizeof(device_orch_func_name_) - 1); + device_orch_func_name_[sizeof(device_orch_func_name_) - 1] = '\0'; + } + } + const char *get_device_orch_func_name() const { return device_orch_func_name_; } + void set_device_orch_config_name(const char *name) { + device_orch_config_name_[0] = '\0'; + if (name) { + strncpy(device_orch_config_name_, name, sizeof(device_orch_config_name_) - 1); + device_orch_config_name_[sizeof(device_orch_config_name_) - 1] = '\0'; + } + } + const char *get_device_orch_config_name() const { return device_orch_config_name_; } + + void set_dev_orch_so(uint64_t dev_addr, uint64_t size) { dev_orch_so_addr_ = dev_addr; dev_orch_so_size_ = size; - has_new_orch_so_ = is_new; } + void set_active_callable_id(int32_t callable_id, bool is_new) { + active_callable_id_ = callable_id; + register_new_callable_id_ = is_new; + } + int32_t get_active_callable_id() const { return active_callable_id_; } + bool register_new_callable_id() const { return register_new_callable_id_; } }; #endif // SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_RUNTIME_H_ diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index f1936d467..e9b97d5ff 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -24,6 +24,7 @@ #include "aicpu/device_time.h" #include "aicpu/orch_so_file.h" +#include "callable_protocol.h" #include "pto2_dispatch_payload.h" #include "runtime.h" #include "spin_hint.h" @@ -89,6 +90,23 @@ static int32_t read_runtime_status(Runtime *runtime) { static PTO2Runtime *rt{nullptr}; +// Per-callable_id orchestration SO table. The executor dispatches +// `orch_so_table_[active_callable_id_]` (created on first sighting of +// that callable_id, kept warm across runs). +// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values +// (mailbox uint32 callable_id, register() returns small ints) and is shared +// with the host bounds check in DeviceRunner::register_prepared_callable — +// see src/common/task_interface/callable_protocol.h. 
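+// Dispatch sketch (illustrative; condensed from run() below):
+//   OrchSoEntry &e = orch_so_table_[runtime->get_active_callable_id()];
+//   if (runtime->register_new_callable_id()) { /* write SO file, dlopen, dlsym into e */ }
+//   e.func(*orch_args_cached_);  // cache-hit runs skip straight here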
+
+struct OrchSoEntry {
+    bool in_use{false};
+    void *handle{nullptr};
+    char path[256]{};
+    DeviceOrchestrationFunc func{nullptr};
+    DeviceOrchestrationBindRuntimeFunc bind{nullptr};
+    DeviceOrchestrationConfigFunc config_func{nullptr};
+};
+
 struct AicpuExecutor {
     int32_t sched_thread_num_;
     bool orch_to_sched_{false};
@@ -107,16 +125,15 @@ struct AicpuExecutor {
     std::atomic<int32_t> finished_count_{0};
     std::atomic<bool> runtime_init_ready_{false};
-    // Orchestration SO handle - defer dlclose until all tasks complete
-    void *orch_so_handle_{nullptr};
-    char orch_so_path_[256]{};  // Path to orchestration SO file for cleanup
-
-    // Shared orchestration function pointer (loaded by first orch thread, used by all)
-    DeviceOrchestrationFunc orch_func_{nullptr};
-    DeviceOrchestrationBindRuntimeFunc orch_bind_runtime_{nullptr};
-    DeviceOrchestrationConfigFunc orch_config_func_{nullptr};
+    // Cached orch args pointer set by the orchestration thread before scheduler
+    // init; consumed by the (*p_func)(*orch_args_cached_) invocation below.
     const ChipStorageTaskArgs *orch_args_cached_{nullptr};

+    // Per-callable_id table. Single orch thread today, so a first-write/read
+    // race is not possible; if multiple orch threads are ever introduced,
+    // guard the in_use=false→true transition with a mutex.
+    OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS];
+
     // ===== Scheduler context (owns all dispatch/completion/drain state) =====
     SchedulerContext sched_ctx_;
@@ -126,15 +143,14 @@ struct AicpuExecutor {
     void deinit(Runtime *runtime);

     ~AicpuExecutor() {
-        // Process-wide teardown (the single static instance dies here). The
-        // handle is otherwise kept alive across runs for cache-hit reuse.
-        if (orch_so_handle_ != nullptr) {
-            dlclose(orch_so_handle_);
-            orch_so_handle_ = nullptr;
-        }
-        if (orch_so_path_[0] != '\0') {
-            unlink(orch_so_path_);
-            orch_so_path_[0] = '\0';
+        // Process-wide teardown (the single static instance dies here). Every
+        // in-use callable_id slot is dlclose()'d here; each is otherwise kept
+        // alive across runs for cache-hit reuse.
+        for (auto &e : orch_so_table_) {
+            if (!e.in_use) continue;
+            if (e.handle != nullptr) dlclose(e.handle);
+            if (e.path[0] != '\0') unlink(e.path);
+            e = OrchSoEntry{};
         }
     }
 };
@@ -197,29 +213,37 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     if (runtime->get_orch_built_on_host()) {
         LOG_INFO_V0("Thread %d: Host orchestration mode, no-op", thread_idx);
     } else {
-        // Two paths:
-        //   1) has_new_orch_so == true  → host believes the SO identity
-        //      changed, so we drop the cached handle (if any), write the
-        //      new bytes to disk, and dlopen + dlsym a fresh handle.
-        //   2) has_new_orch_so == false → host detected a cache hit, so
-        //      we reuse `orch_so_handle_` / `orch_func_` / `orch_bind_runtime_`
-        //      from the previous run untouched. sm_handle / rt below are
-        //      always recreated because they bind this run's memory.
-        const bool reload_so = runtime->has_new_orch_so();
+        // Per-callable_id dispatch: the orch SO state lives in
+        // `orch_so_table_[callable_id]` keyed by registration order;
+        // reload is governed by `register_new_callable_id_`.
+        const int32_t callable_id = runtime->get_active_callable_id();
+        if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+            LOG_ERROR(
+                "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS
+            );
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
+        }
+        void **p_handle = &orch_so_table_[callable_id].handle;
+        char *p_path = orch_so_table_[callable_id].path;
+        DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func;
+        DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind;
+        DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
+        const bool reload_so = runtime->register_new_callable_id();
         if (reload_so) {
-            LOG_INFO_V0("Thread %d: New orch SO detected, (re)loading", thread_idx);
-            if (orch_so_handle_ != nullptr) {
-                dlclose(orch_so_handle_);
-                orch_so_handle_ = nullptr;
-                orch_func_ = nullptr;
-                orch_bind_runtime_ = nullptr;
-                if (orch_so_path_[0] != '\0') {
+            LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id);
+            if (*p_handle != nullptr) {
+                dlclose(*p_handle);
+                *p_handle = nullptr;
+                *p_func = nullptr;
+                *p_bind = nullptr;
+                if (p_path[0] != '\0') {
                     // Unlink the old file so the new open() lands on a
                     // fresh inode — protects against SIGBUS / ETXTBSY when
                     // the kernel still has the old mapping pinned.
-                    unlink(orch_so_path_);
-                    orch_so_path_[0] = '\0';
+                    unlink(p_path);
+                    p_path[0] = '\0';
                 }
             }
@@ -242,7 +266,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);
             for (int32_t i = 0; i < num_candidates && !file_created; i++) {
-                int32_t fd = create_orch_so_file(candidate_dirs[i], so_path, sizeof(so_path));
+                int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path));
                 if (fd < 0) {
                     LOG_INFO_V0(
                         "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
@@ -281,6 +305,14 @@
            }
            LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
+            // Unlink the on-disk SO immediately: dlopen has already mmap'd
+            // the image, so the kernel keeps the inode alive until the
+            // matching dlclose / process exit. This prevents stale
+            // libdevice_orch_cid<N>_XXXXXX.so files from accumulating in
+            // /tmp when child processes exit via os._exit(0), which skips
+            // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
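+            // Standard POSIX behavior, shown schematically (not project code):
+            //   fd = mkstemps(path, 3);      // ".so" suffix, 3 chars
+            //   write(fd, so_bytes, n); close(fd);
+            //   h = dlopen(path, RTLD_NOW);  // image mmap'd, inode pinned
+            //   unlink(path);                // name gone; mapping valid until dlclose(h)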
+ unlink(so_path); + const char *entry_symbol = runtime->get_device_orch_func_name(); if (entry_symbol == nullptr || entry_symbol[0] == '\0') { entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL; @@ -333,15 +365,21 @@ int32_t AicpuExecutor::run(Runtime *runtime) { bind_runtime_func = nullptr; } - orch_so_handle_ = handle; - orch_func_ = orch_func; - orch_bind_runtime_ = bind_runtime_func; - orch_config_func_ = config_func; - snprintf(orch_so_path_, sizeof(orch_so_path_), "%s", so_path); + *p_handle = handle; + *p_func = orch_func; + *p_bind = bind_runtime_func; + *p_config_func = config_func; + snprintf(p_path, 256, "%s", so_path); + orch_so_table_[callable_id].in_use = true; } else { - LOG_INFO_V0("Thread %d: Reusing cached orch SO handle=%p", thread_idx, orch_so_handle_); - if (orch_so_handle_ == nullptr || orch_func_ == nullptr) { - LOG_ERROR("Thread %d: has_new_orch_so=false but no cached SO handle/func", thread_idx); + LOG_INFO_V0( + "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id + ); + if (*p_handle == nullptr || *p_func == nullptr) { + LOG_ERROR( + "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx, + callable_id + ); // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -349,8 +387,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } // Validate arg count on every run (reload or cache hit). - if (orch_config_func_ != nullptr) { - PTO2OrchestrationConfig cfg = orch_config_func_(runtime->get_orch_args()); + if (*p_config_func != nullptr) { + PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args()); LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count); if (cfg.expected_arg_count > 0) { const ChipStorageTaskArgs &args_validate = runtime->get_orch_args(); @@ -361,17 +399,18 @@ int32_t AicpuExecutor::run(Runtime *runtime) { cfg.expected_arg_count ); // Clean up cached state so a subsequent run does a full reload. - if (orch_so_handle_ != nullptr) { - dlclose(orch_so_handle_); - orch_so_handle_ = nullptr; + if (*p_handle != nullptr) { + dlclose(*p_handle); + *p_handle = nullptr; } - if (orch_so_path_[0] != '\0') { - unlink(orch_so_path_); - orch_so_path_[0] = '\0'; + if (p_path[0] != '\0') { + unlink(p_path); + p_path[0] = '\0'; } - orch_func_ = nullptr; - orch_bind_runtime_ = nullptr; - orch_config_func_ = nullptr; + *p_func = nullptr; + *p_bind = nullptr; + *p_config_func = nullptr; + orch_so_table_[callable_id].in_use = false; // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -473,11 +512,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) { orch_cycle_start = get_sys_cnt_aicpu(); #endif framework_bind_runtime(rt); - if (orch_bind_runtime_ != nullptr) { - orch_bind_runtime_(rt); + if (*p_bind != nullptr) { + (*p_bind)(rt); } rt_scope_begin(rt); - orch_func_(*orch_args_cached_); + (*p_func)(*orch_args_cached_); rt_scope_end(rt); #if PTO2_PROFILING uint64_t orch_cycle_end = get_sys_cnt_aicpu(); @@ -637,13 +676,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) { if (prev_finished + 1 == thread_num_) { finished_.store(true, std::memory_order_release); // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we - // always tear them down here, but we keep orch_so_handle_ alive for - // the next run's cache-hit reuse (see run() reload_so branch). 
+ // always tear them down here, but we keep the per-cid orch SO entries + // alive for the next run's cache-hit reuse (see run() reload_so branch). if (!runtime->get_orch_built_on_host() && rt != nullptr) { // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt. + const int32_t callable_id = runtime->get_active_callable_id(); framework_bind_runtime(nullptr); - if (orch_bind_runtime_ != nullptr) { - orch_bind_runtime_(nullptr); + if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) { + DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind; + if (bind != nullptr) { + bind(nullptr); + } } runtime_destroy(rt); } @@ -669,10 +712,9 @@ void AicpuExecutor::deinit(Runtime *runtime) { orch_to_sched_ = false; orch_args_cached_ = nullptr; - // orch_so_handle_ / orch_func_ / orch_bind_runtime_ / orch_config_func_ / orch_so_path_ are - // intentionally preserved across deinit: the next run reuses them when - // has_new_orch_so() == false. The destructor releases them at process - // teardown. + // orch_so_table_ entries are intentionally preserved across deinit: the + // next run reuses cached handles when register_new_callable_id() returns + // false. The destructor releases them at process teardown. // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit) rt = nullptr; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 4c4e8dd9c..e70f9a309 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -92,31 +92,29 @@ static int32_t read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *hos } /** - * Initialize a pre-allocated runtime for device orchestration. + * Stage the per-callable resources (kernel binaries + orchestration SO) into + * the supplied runtime so a subsequent bind_prepared_to_runtime_impl can use + * them. This is the cacheable half of init_runtime_impl: nothing here depends + * on per-run argument values, so the prepare_callable / run_prepared split + * lets us run this once per callable_id and amortize across runs. * - * For rt2 runtime, orchestration runs on AICPU thread 3 (device-side). 
- * This function: - * - Copies tensor metadata and replaces host pointers with device pointers - * - Copies all tensor data to device - * - Records all tensors for copy-back - * - Copies orchestration SO to device memory - * - Sets up runtime state for device orchestration - * - * @param runtime Pointer to pre-constructed Runtime - * @param callable ChipCallable containing orch binary, func_name, and child kernels - * @param orch_args Separated tensor/scalar arguments + * @param runtime Pointer to pre-constructed Runtime (host_api populated) + * @param callable ChipCallable carrying the orch SO + child kernel binaries * @return 0 on success, -1 on failure */ -extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) { - // Validate inputs +extern "C" int prepare_callable_impl(Runtime *runtime, const ChipCallable *callable) { if (runtime == nullptr) { LOG_ERROR("Runtime pointer is null"); return -1; } + if (callable == nullptr) { + LOG_ERROR("Callable pointer is null"); + return -1; + } // Register kernel binaries from ChipCallable children if (callable->child_count() > 0) { - LOG_INFO_V0("Registering %d kernel(s) in init_runtime_impl", callable->child_count()); + LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count()); for (int32_t i = 0; i < callable->child_count(); i++) { int func_id = callable->child_func_id(i); if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { @@ -146,6 +144,32 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, return -1; } + // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume. + runtime->pending_orch_so_data_ = orch_so_binary; + runtime->pending_orch_so_size_ = orch_so_size; + LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size); + return 0; +} + +/** + * Per-run binding: build device-side argument storage (tensor copy-out, GM + * heap, PTO2 shared memory) and publish it to the runtime. Assumes the + * callable-side state (kernel binaries, orch SO bytes, func/config names) + * is already populated by prepare_callable_impl. + * + * Splitting this from prepare_callable_impl matches the per-callable_id + * design: register/run_prepared invokes this every call, while the prep + * half runs only once per callable_id. + * + * @param runtime Pointer to pre-constructed Runtime (host_api populated) + * @param orch_args Separated tensor/scalar arguments for this run + * @return 0 on success, -1 on failure + */ +extern "C" int bind_prepared_to_runtime_impl(Runtime *runtime, const ChipStorageTaskArgs *orch_args) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } if (orch_args == nullptr) { LOG_ERROR("orch_args pointer is null"); return -1; @@ -153,7 +177,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, int tensor_count = orch_args->tensor_count(); int scalar_count = orch_args->scalar_count(); - LOG_INFO_V0("RT2 init: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); + LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); int64_t t_total_start = _now_ms(); @@ -196,13 +220,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, } int64_t t_args_end = _now_ms(); - // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume. 
- int64_t t_so_start = _now_ms(); - runtime->pending_orch_so_data_ = orch_so_binary; - runtime->pending_orch_so_size_ = orch_so_size; - LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size); - int64_t t_so_end = _now_ms(); - // Read ready queue shard count from environment for AICPU scheduler { const char *env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS"); @@ -282,7 +299,6 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, int64_t t_total_end = _now_ms(); LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start); - LOG_INFO_V0("TIMING: orch_so_copy = %" PRId64 "ms", t_so_end - t_so_start); LOG_INFO_V0("TIMING: gm_heap_alloc(1GB) = %" PRId64 "ms", t_heap_end - t_heap_start); LOG_INFO_V0("TIMING: shared_mem_alloc = %" PRId64 "ms", t_sm_end - t_sm_start); LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index e8bd2ff85..48e3c82b6 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -203,12 +203,14 @@ class Runtime { // Device orchestration SO (for dlopen on AICPU thread 3). // The SO bytes themselves live in a separately-allocated device buffer // owned by DeviceRunner; only the metadata below travels inside Runtime. - // `has_new_orch_so_` tells AICPU whether the host believes the SO identity - // changed since the previous run — when false AICPU reuses its cached - // dlopen handle and skips writing the file again. uint64_t dev_orch_so_addr_; uint64_t dev_orch_so_size_; - bool has_new_orch_so_; + // Per-callable_id dispatch. AICPU dispatches via + // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_` + // signals whether the host is delivering a freshly-registered + // callable_id (write+dlopen) or reusing an already-loaded one. + int32_t active_callable_id_; + bool register_new_callable_id_; char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; @@ -261,10 +263,16 @@ class Runtime { void set_orch_args(const ChipStorageTaskArgs &args); // Device orchestration SO binary (for dlopen on AICPU thread 3) - void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new); + void set_dev_orch_so(uint64_t dev_addr, uint64_t size); uint64_t get_dev_orch_so_addr() const; uint64_t get_dev_orch_so_size() const; - bool has_new_orch_so() const; + // Per-callable_id dispatch. callable_id must be in + // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU + // whether to (re)load the orch SO into orch_so_table_[callable_id] or + // reuse the cached entry. + void set_active_callable_id(int32_t callable_id, bool is_new); + int32_t get_active_callable_id() const; + bool register_new_callable_id() const; void set_device_orch_func_name(const char *name); const char *get_device_orch_func_name() const; void set_device_orch_config_name(const char *name); @@ -272,6 +280,13 @@ class Runtime { uint64_t get_function_bin_addr(int func_id) const; void set_function_bin_addr(int func_id, uint64_t addr); + /** + * Replay a previously-uploaded kernel address onto a fresh Runtime + * without recording it in registered_kernel_func_ids_. Used by + * DeviceRunner::bind_prepared_callable_to_runtime so prepared kernel + * binaries are not freed by validate_runtime_impl across runs. 
+ */ + void replay_function_bin_addr(int func_id, uint64_t addr); int get_registered_kernel_count() const; int get_registered_kernel_func_id(int index) const; @@ -299,9 +314,16 @@ class Runtime { // Host-only staging for orchestration SO. runtime_maker publishes the // callable-owned pointer here; DeviceRunner consumes it before launching // the device-side execution and replaces it with the device-resident - // buffer metadata (dev_orch_so_addr_, ..., has_new_orch_so_). + // buffer metadata (dev_orch_so_addr_, dev_orch_so_size_). const void *pending_orch_so_data_{nullptr}; size_t pending_orch_so_size_{0}; + + // Host-orchestration staging (hbg path). Always nullptr on this trb + // variant — included for API parity with host_build_graph so the + // shared platform layer can branch on `pending_host_dlopen_handle_ != + // nullptr` at runtime instead of via a build-time macro. + void *pending_host_dlopen_handle_{nullptr}; + void *pending_host_orch_func_ptr_{nullptr}; }; #endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 8f595e1a3..714ba3955 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -54,7 +54,8 @@ Runtime::Runtime() { // Initialize device orchestration SO binary dev_orch_so_addr_ = 0; dev_orch_so_size_ = 0; - has_new_orch_so_ = false; + active_callable_id_ = -1; + register_new_callable_id_ = false; device_orch_func_name_[0] = '\0'; device_orch_config_name_[0] = '\0'; @@ -104,18 +105,24 @@ void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; } void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } // Device orchestration SO metadata (bytes live in a separate device buffer -// owned by DeviceRunner; only the address/size/dirty-flag travels in Runtime). -void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { +// owned by DeviceRunner; only the address/size travels in Runtime). 
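The distinction `replay_function_bin_addr` draws is about ownership, not the address write itself. A hypothetical Python model of the bookkeeping (the real state lives inside Runtime; names mirror the members documented above, and the registering helper is a stand-in):

```python
# Hypothetical model: both paths write the same func_id -> addr mapping,
# but only the registering path records the func_id in the set that
# validate_runtime_impl frees at the end of a run.
func_id_to_addr = {}
registered_kernel_func_ids = []   # entries here are freed at run teardown

def register_function_bin(func_id, addr):
    func_id_to_addr[func_id] = addr
    registered_kernel_func_ids.append(func_id)   # Runtime owns it, freed later

def replay_function_bin_addr(func_id, addr):
    func_id_to_addr[func_id] = addr              # mapping only, no ownership
```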
+void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
     dev_orch_so_addr_ = dev_addr;
     dev_orch_so_size_ = size;
-    has_new_orch_so_ = is_new;
 }
 
 uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; }
 
 uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; }
 
-bool Runtime::has_new_orch_so() const { return has_new_orch_so_; }
+void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) {
+    active_callable_id_ = callable_id;
+    register_new_callable_id_ = is_new;
+}
+
+int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
+
+bool Runtime::register_new_callable_id() const { return register_new_callable_id_; }
 
 void Runtime::set_device_orch_func_name(const char *name) {
     if (name == nullptr) {
@@ -162,6 +169,14 @@ void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
     func_id_to_addr_[func_id] = addr;
 }
 
+void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
+    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
+        return;
+    }
+    func_id_to_addr_[func_id] = addr;
+}
+
 int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
 
 int Runtime::get_registered_kernel_func_id(int index) const {
diff --git a/src/common/hierarchical/orchestrator.cpp b/src/common/hierarchical/orchestrator.cpp
index c5912a5b9..5a6e710f9 100644
--- a/src/common/hierarchical/orchestrator.cpp
+++ b/src/common/hierarchical/orchestrator.cpp
@@ -137,25 +137,25 @@ ContinuousTensor Orchestrator::alloc(const std::vector<int64_t> &shape, DataTyp
 // =============================================================================
 
 SubmitResult
-Orchestrator::submit_next_level(uint64_t callable, const TaskArgs &args, const CallConfig &config, int8_t worker) {
+Orchestrator::submit_next_level(int32_t callable_id, const TaskArgs &args, const CallConfig &config, int8_t worker) {
     std::vector<int8_t> affinities;
     if (worker >= 0) affinities = {worker};
-    return submit_impl(WorkerType::NEXT_LEVEL, callable, /*callable_id=*/-1, config, {args}, std::move(affinities));
+    return submit_impl(WorkerType::NEXT_LEVEL, callable_id, config, {args}, std::move(affinities));
 }
 
 SubmitResult Orchestrator::submit_next_level_group(
-    uint64_t callable, const std::vector<TaskArgs> &args_list, const CallConfig &config,
+    int32_t callable_id, const std::vector<TaskArgs> &args_list, const CallConfig &config,
     const std::vector<int8_t> &workers
 ) {
-    return submit_impl(WorkerType::NEXT_LEVEL, callable, /*callable_id=*/-1, config, args_list, workers);
+    return submit_impl(WorkerType::NEXT_LEVEL, callable_id, config, args_list, workers);
 }
 
 SubmitResult Orchestrator::submit_sub(int32_t callable_id, const TaskArgs &args) {
-    return submit_impl(WorkerType::SUB, /*callable_ptr=*/0, callable_id, CallConfig{}, {args});
+    return submit_impl(WorkerType::SUB, callable_id, CallConfig{}, {args});
 }
 
 SubmitResult Orchestrator::submit_sub_group(int32_t callable_id, const std::vector<TaskArgs> &args_list) {
-    return submit_impl(WorkerType::SUB, /*callable_ptr=*/0, callable_id, CallConfig{}, args_list);
+    return submit_impl(WorkerType::SUB, callable_id, CallConfig{}, args_list);
 }
 
 // =============================================================================
@@ -163,8 +163,8 @@ SubmitResult Orchestrator::submit_sub_group(int32_t callable_id, const std::vect
 // =============================================================================
 
 SubmitResult Orchestrator::submit_impl(
-    WorkerType worker_type, uint64_t callable_ptr, int32_t callable_id, const CallConfig &config,
-    std::vector<TaskArgs> args_list, std::vector<int8_t> affinities
+    WorkerType worker_type, int32_t callable_id, const CallConfig &config, std::vector<TaskArgs> args_list,
+    std::vector<int8_t> affinities
 ) {
     if (args_list.empty()) throw std::invalid_argument("Orchestrator: args_list must not be empty");
     config.validate();
@@ -198,7 +198,6 @@ SubmitResult Orchestrator::submit_impl(
 
     s.reset();
     s.worker_type = worker_type;
-    s.callable = callable_ptr;
     s.callable_id = callable_id;
     s.config = config;
 
diff --git a/src/common/hierarchical/orchestrator.h b/src/common/hierarchical/orchestrator.h
index b6880d3c1..f8abdb424 100644
--- a/src/common/hierarchical/orchestrator.h
+++ b/src/common/hierarchical/orchestrator.h
@@ -92,18 +92,19 @@ class Orchestrator {
     void copy_to(int worker_id, uint64_t dst, uint64_t src, size_t size);
     void copy_from(int worker_id, uint64_t dst, uint64_t src, size_t size);
 
-    // Submit a NEXT_LEVEL task. `callable` is the chip callable buffer pointer
-    // (uint64_t handle from Python — typically ChipCallable.buffer_ptr()).
-    // Tags inside `args` drive dependency inference; OUTPUT tensors with null
-    // data are auto-allocated from the HeapRing.
+    // Submit a NEXT_LEVEL task. `callable_id` is a cid registered via
+    // Worker.register(): the chip child looks it up in its COW-inherited
+    // Python registry to get the actual ChipCallable.
+    // Tags inside `args` drive dependency inference; OUTPUT tensors with
+    // null data are auto-allocated from the HeapRing.
     // `worker`: logical worker id for affinity (-1 = unconstrained).
     SubmitResult
-    submit_next_level(uint64_t callable, const TaskArgs &args, const CallConfig &config, int8_t worker = -1);
+    submit_next_level(int32_t callable_id, const TaskArgs &args, const CallConfig &config, int8_t worker = -1);
 
     // Submit a group of NEXT_LEVEL tasks: N args -> N workers, 1 DAG node.
     // `workers`: per-args affinity (empty = all unconstrained).
     SubmitResult submit_next_level_group(
-        uint64_t callable, const std::vector<TaskArgs> &args_list, const CallConfig &config,
+        int32_t callable_id, const std::vector<TaskArgs> &args_list, const CallConfig &config,
         const std::vector<int8_t> &workers = {}
     );
 
@@ -178,8 +179,8 @@ class Orchestrator {
     // Shared submit machinery. Takes `args_list` by value so the Orchestrator
     // can patch `tensor.data` on OUTPUT tensors flagged for auto-allocation.
     SubmitResult submit_impl(
-        WorkerType worker_type, uint64_t callable_ptr, int32_t callable_id, const CallConfig &config,
-        std::vector<TaskArgs> args_list, std::vector<int8_t> affinities = {}
+        WorkerType worker_type, int32_t callable_id, const CallConfig &config, std::vector<TaskArgs> args_list,
+        std::vector<int8_t> affinities = {}
     );
 
     // Size, in aligned bytes, an OUTPUT tensor should occupy in the HeapRing.
diff --git a/src/common/hierarchical/types.cpp b/src/common/hierarchical/types.cpp
index e04f883f9..882a630c6 100644
--- a/src/common/hierarchical/types.cpp
+++ b/src/common/hierarchical/types.cpp
@@ -28,7 +28,6 @@ void TaskSlotState::reset() {
     output_keys.clear();
     fanin_producers.clear();
     worker_type = WorkerType::NEXT_LEVEL;
-    callable = 0;
     callable_id = -1;
     config = CallConfig{};
     task_args.clear();
diff --git a/src/common/hierarchical/types.h b/src/common/hierarchical/types.h
index dbd91659e..f67fa6028 100644
--- a/src/common/hierarchical/types.h
+++ b/src/common/hierarchical/types.h
@@ -145,9 +145,12 @@ struct TaskSlotState {
     // --- Task data (stored on parent heap, lives until slot CONSUMED) ---
     WorkerType worker_type{WorkerType::NEXT_LEVEL};
-    uint64_t callable{0};     // NEXT_LEVEL: ChipCallable buffer ptr; SUB: unused
-    int32_t callable_id{-1};  // SUB: registered callable id
-    CallConfig config{};      // NEXT_LEVEL config (block_dim, aicpu_thread_num, diagnostics sub-features)
+    // Unified callable id: NEXT_LEVEL chip callables and SUB fns share the
+    // same Worker.register() id space. The mailbox wire format writes this
+    // as a uint64 with the cid in the low 32 bits; dispatch_process reads
+    // it identically for both worker types.
+    int32_t callable_id{-1};
+    CallConfig config{};  // NEXT_LEVEL config (block_dim, aicpu_thread_num, diagnostics sub-features)
 
     // Unified task-args storage: `task_args` is the single-task builder;
     // when `is_group_` is true, `task_args_list` carries one TaskArgs per
diff --git a/src/common/hierarchical/worker_manager.cpp b/src/common/hierarchical/worker_manager.cpp
index cb2f31b6e..2d0c40017 100644
--- a/src/common/hierarchical/worker_manager.cpp
+++ b/src/common/hierarchical/worker_manager.cpp
@@ -139,7 +139,7 @@ void WorkerThread::loop() {
 }
 
 void WorkerThread::dispatch_process(TaskSlotState &s, int32_t group_index) {
-    uint64_t callable = (s.worker_type == WorkerType::SUB) ? static_cast<uint64_t>(s.callable_id) : s.callable;
+    uint64_t callable = static_cast<uint64_t>(static_cast<uint32_t>(s.callable_id));
     TaskArgsView view = s.args_view(group_index);
 
     // Hold mailbox_mu_ for the entire round trip (write payload + state +
diff --git a/src/common/task_interface/callable_protocol.h b/src/common/task_interface/callable_protocol.h
new file mode 100644
index 000000000..4e3898804
--- /dev/null
+++ b/src/common/task_interface/callable_protocol.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Per-callable_id protocol constants
+ *
+ * Single source of truth for the host↔AICPU per-callable_id dispatch protocol.
+ * Kept separate from callable.h so the AICPU side can include it without
+ * pulling in the heavier callable.h dependencies.
+ *
+ * Both sides must agree on these bounds:
+ * - Host: DeviceRunner::register_prepared_callable rejects out-of-range ids.
+ * - AICPU: AicpuExecutor::run guards `orch_so_table_[callable_id]` access.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+// Hard cap on the number of distinct callable_ids that can be registered
+// via Worker.register / DeviceRunner::register_prepared_callable. The AICPU
+// executor reserves a fixed-size `orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]`
+// keyed by callable_id, so this bound is part of the host↔AICPU protocol.
+constexpr int32_t MAX_REGISTERED_CALLABLE_IDS = 64;
diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp
index 38680e77a..7e8fc72b6 100644
--- a/src/common/worker/chip_worker.cpp
+++ b/src/common/worker/chip_worker.cpp
@@ -148,8 +148,12 @@ void ChipWorker::init(
     copy_to_device_ctx_fn_ = load_symbol<CopyToDeviceCtxFn>(handle, "copy_to_device_ctx");
     copy_from_device_ctx_fn_ = load_symbol<CopyFromDeviceCtxFn>(handle, "copy_from_device_ctx");
     get_runtime_size_fn_ = load_symbol<GetRuntimeSizeFn>(handle, "get_runtime_size");
-    run_runtime_fn_ = load_symbol<RunRuntimeFn>(handle, "run_runtime");
     simpler_init_fn_ = load_symbol<SimplerInitFn>(handle, "simpler_init");
+    prepare_callable_fn_ = load_symbol<PrepareCallableFn>(handle, "prepare_callable");
+    run_prepared_fn_ = load_symbol<RunPreparedFn>(handle, "run_prepared");
+    unregister_callable_fn_ = load_symbol<UnregisterCallableFn>(handle, "unregister_callable");
+    get_aicpu_dlopen_count_fn_ = load_symbol<GetAicpuDlopenCountFn>(handle, "get_aicpu_dlopen_count");
+    get_host_dlopen_count_fn_ = load_symbol<GetAicpuDlopenCountFn>(handle, "get_host_dlopen_count");
     finalize_device_fn_ = load_symbol<FinalizeDeviceFn>(handle, "finalize_device");
     // ACL lifecycle + comm_* are part of the uniform host_runtime.so ABI.
     // Every platform runtime exports all of them — runtimes that do not
@@ -242,7 +246,11 @@ void ChipWorker::finalize() {
     copy_to_device_ctx_fn_ = nullptr;
     copy_from_device_ctx_fn_ = nullptr;
     get_runtime_size_fn_ = nullptr;
-    run_runtime_fn_ = nullptr;
+    prepare_callable_fn_ = nullptr;
+    run_prepared_fn_ = nullptr;
+    unregister_callable_fn_ = nullptr;
+    get_aicpu_dlopen_count_fn_ = nullptr;
+    get_host_dlopen_count_fn_ = nullptr;
     finalize_device_fn_ = nullptr;
     ensure_acl_ready_fn_ = nullptr;
     create_comm_stream_fn_ = nullptr;
@@ -261,15 +269,35 @@
 }
 
 void ChipWorker::run(uint64_t callable, TaskArgsView args, const CallConfig &config) {
-    // L2 ABI edge: assemble the fixed-size ChipStorageTaskArgs POD from the
-    // view and hand it to the runtime. This conversion used to happen at
-    // submit time (stored on the slot); it now runs lazily in the worker so
-    // the slot can carry a single TaskArgs irrespective of the destination.
+    // The hierarchical layer (worker_manager.cpp) packs the cid produced by
+    // Worker.register() into this uint64. ChipWorker treats it as such — it
+    // must already have been prepared via prepare_callable. The legacy
+    // "callable buffer ptr → run_runtime" path is gone.
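The packing contract this comment describes can be checked in isolation. An illustrative round trip in Python, pure arithmetic only, no project code assumed:

```python
# The hierarchical layer widens the int32 cid into a uint64 (low 32 bits);
# ChipWorker::run recovers the signed cid from those same low 32 bits.
def pack_cid(cid: int) -> int:
    return cid & 0xFFFFFFFF              # static_cast<uint64_t>(static_cast<uint32_t>(cid))

def unpack_cid(packed: int) -> int:
    u32 = packed & 0xFFFFFFFF
    return u32 - (1 << 32) if u32 >= (1 << 31) else u32  # reinterpret as int32

assert unpack_cid(pack_cid(7)) == 7
assert unpack_cid(pack_cid(-1)) == -1    # a sentinel cid survives the round trip
```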
+    run_prepared(static_cast<int32_t>(static_cast<uint32_t>(callable)), args, config);
+}
+
+void ChipWorker::prepare_callable(int32_t callable_id, const void *callable) {
+    if (!device_set_) {
+        throw std::runtime_error("ChipWorker device not set; call set_device() first");
+    }
+    if (callable == nullptr) {
+        throw std::runtime_error("prepare_callable: callable must not be null");
+    }
+    int rc = prepare_callable_fn_(
+        device_ctx_, callable_id, callable, device_id_, aicpu_binary_.data(), aicpu_binary_.size(),
+        aicore_binary_.data(), aicore_binary_.size()
+    );
+    if (rc != 0) {
+        throw std::runtime_error("prepare_callable failed with code " + std::to_string(rc));
+    }
+}
+
+void ChipWorker::run_prepared(int32_t callable_id, TaskArgsView args, const CallConfig &config) {
     ChipStorageTaskArgs chip_storage = view_to_chip_storage(args);
-    run(reinterpret_cast<const void *>(callable), &chip_storage, config);
+    run_prepared(callable_id, &chip_storage, config);
 }
 
-void ChipWorker::run(const void *callable, const void *args, const CallConfig &config) {
+void ChipWorker::run_prepared(int32_t callable_id, const void *args, const CallConfig &config) {
     config.validate();
     if (!device_set_) {
         throw std::runtime_error("ChipWorker device not set; call set_device() first");
@@ -277,14 +305,38 @@ void ChipWorker::run(const void *callable, const void *args, const CallConfig &c
 
     void *rt = runtime_buf_.data();
 
-    int rc = run_runtime_fn_(
-        device_ctx_, rt, callable, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(),
+    int rc = run_prepared_fn_(
+        device_ctx_, rt, callable_id, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(),
         aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_l2_swimlane,
         config.enable_dump_tensor, config.enable_pmu, config.output_prefix
     );
     if (rc != 0) {
-        throw std::runtime_error("run_runtime failed with code " + std::to_string(rc));
+        throw std::runtime_error("run_prepared failed with code " + std::to_string(rc));
+    }
+}
+
+void ChipWorker::unregister_callable(int32_t callable_id) {
+    if (!device_set_) {
+        throw std::runtime_error("ChipWorker device not set; call set_device() first");
+    }
+    int rc = unregister_callable_fn_(device_ctx_, callable_id);
+    if (rc != 0) {
+        throw std::runtime_error("unregister_callable failed with code " + std::to_string(rc));
+    }
+}
+
+size_t ChipWorker::aicpu_dlopen_count() const {
+    if (!device_set_) {
+        return 0;
+    }
+    return get_aicpu_dlopen_count_fn_(device_ctx_);
+}
+
+size_t ChipWorker::host_dlopen_count() const {
+    if (!device_set_) {
+        return 0;
     }
+    return get_host_dlopen_count_fn_(device_ctx_);
 }
 
 uint64_t ChipWorker::malloc(size_t size) {
diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h
index 3e529a511..c08b1c618 100644
--- a/src/common/worker/chip_worker.h
+++ b/src/common/worker/chip_worker.h
@@ -52,14 +52,28 @@ class ChipWorker : public IWorker {
     /// Terminal — the object cannot be reused after this.
     void finalize();
 
-    // IWorker: build a ChipStorageTaskArgs POD from `args` and execute the
-    // runtime synchronously. `callable` is a ChipCallable buffer pointer
-    // cast to uint64.
+    // IWorker: dispatch the cid `callable` (packed into uint64 by the
+    // hierarchical layer) by delegating to run_prepared. The cid must
+    // already have been prepared via prepare_callable.
void run(uint64_t callable, TaskArgsView args, const CallConfig &config) override; - // Direct invocation (used by Python wrapper and internal tests) — bypasses - // the TaskArgsView path and takes a ready-made ChipStorageTaskArgs POD. - void run(const void *callable, const void *args, const CallConfig &config); + // Per-callable_id preparation. Requires set_device() and a callable_id + // in [0, MAX_REGISTERED_CALLABLE_IDS) (cap 64). + void prepare_callable(int32_t callable_id, const void *callable); + void run_prepared(int32_t callable_id, TaskArgsView args, const CallConfig &config); + void run_prepared(int32_t callable_id, const void *args, const CallConfig &config); + void unregister_callable(int32_t callable_id); + + /// Number of distinct callable_ids the AICPU has been asked to dlopen for + /// on the bound device. Returns 0 when no device is set or the runtime + /// variant has no per-cid registration support. Used by tests to assert + /// that prepare_callable + repeated run_prepared do not trigger redundant + /// AICPU dlopens. + size_t aicpu_dlopen_count() const; + + /// Number of host-side dlopens (host_build_graph variant). Mirrors + /// `aicpu_dlopen_count` for the trb path; returns 0 on device-orch variants. + size_t host_dlopen_count() const; uint64_t malloc(size_t size); void free(uint64_t ptr); @@ -102,11 +116,15 @@ class ChipWorker : public IWorker { using CopyToDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using CopyFromDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using GetRuntimeSizeFn = size_t (*)(); - using RunRuntimeFn = int (*)( - void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, - int, int, int, const char * - ); using SimplerInitFn = void (*)(void *, int, int); + using PrepareCallableFn = + int (*)(void *, int32_t, const void *, int, const uint8_t *, size_t, const uint8_t *, size_t); + using RunPreparedFn = int (*)( + void *, void *, int32_t, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, + int, int, const char * + ); + using UnregisterCallableFn = int (*)(void *, int32_t); + using GetAicpuDlopenCountFn = size_t (*)(void *); using FinalizeDeviceFn = int (*)(void *); using EnsureAclReadyFn = int (*)(void *, int); using CreateCommStreamFn = void *(*)(void *); @@ -127,8 +145,12 @@ class ChipWorker : public IWorker { CopyToDeviceCtxFn copy_to_device_ctx_fn_ = nullptr; CopyFromDeviceCtxFn copy_from_device_ctx_fn_ = nullptr; GetRuntimeSizeFn get_runtime_size_fn_ = nullptr; - RunRuntimeFn run_runtime_fn_ = nullptr; SimplerInitFn simpler_init_fn_ = nullptr; + PrepareCallableFn prepare_callable_fn_ = nullptr; + RunPreparedFn run_prepared_fn_ = nullptr; + UnregisterCallableFn unregister_callable_fn_ = nullptr; + GetAicpuDlopenCountFn get_aicpu_dlopen_count_fn_ = nullptr; + GetAicpuDlopenCountFn get_host_dlopen_count_fn_ = nullptr; FinalizeDeviceFn finalize_device_fn_ = nullptr; EnsureAclReadyFn ensure_acl_ready_fn_ = nullptr; CreateCommStreamFn create_comm_stream_fn_ = nullptr; diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index b6588dc45..0ef16a13c 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -17,11 +17,12 @@ * * Public API — resolved by ChipWorker via dlsym: * create_device_context, destroy_device_context, - * get_runtime_size, set_device, run_runtime, finalize_device, - * device_malloc_ctx, device_free_ctx, copy_to_device_ctx, copy_from_device_ctx 
+ * get_runtime_size, set_device, finalize_device, + * device_malloc_ctx, device_free_ctx, copy_to_device_ctx, copy_from_device_ctx, + * prepare_callable, run_prepared, unregister_callable * * Memory management: caller allocates a buffer of get_runtime_size() bytes - * and passes it to run_runtime(). Error codes: 0 = success, negative = error. + * and passes it to run_prepared(). Error codes: 0 = success, negative = error. */ #ifndef SRC_COMMON_WORKER_PTO_RUNTIME_C_API_H_ @@ -57,7 +58,7 @@ void destroy_device_context(DeviceContextHandle ctx); /** Return sizeof(Runtime) for caller buffer allocation. */ size_t get_runtime_size(void); -/** Set the target device. Must be called before the first run_runtime(). */ +/** Set the target device. Must be called before the first run_prepared(). */ int set_device(DeviceContextHandle ctx, int device_id); /** Allocate device memory in the given device context. */ @@ -72,42 +73,10 @@ int copy_to_device_ctx(DeviceContextHandle ctx, void *dev_ptr, const void *host_ /** Copy device memory to a host pointer within the given device context. */ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *dev_ptr, size_t size); -/** - * Build the task graph, execute on device, copy results back, and clean up. - * - * @param ctx Device context from create_device_context() - * @param runtime Caller-allocated buffer (size from get_runtime_size()) - * @param callable Opaque ChipCallable pointer (orchestration + kernel binaries) - * @param args Opaque ChipStorageTaskArgs pointer (tensor/scalar arguments) - * @param block_dim Number of AICore blocks - * @param aicpu_thread_num Number of AICPU scheduler threads - * @param device_id Target device - * @param aicpu_binary AICPU executor binary blob - * @param aicpu_size Size of AICPU binary - * @param aicore_binary AICore executor binary blob - * @param aicore_size Size of AICore binary - * @param enable_l2_swimlane 1 to enable perf swimlane collection, 0 to disable - * @param enable_dump_tensor 1 to enable tensor dump, 0 to disable - * @param enable_pmu 0 = PMU disabled; >0 = enabled, value selects event type - * @param output_prefix NUL-terminated directory path under which diagnostic - * artifacts (l2_perf_records.json / tensor_dump/ / - * pmu.csv) are written. Required (non-empty) whenever - * any diagnostic flag is enabled; ignored otherwise. - * - * Log configuration is applied separately via simpler_init() at ChipWorker - * init time and read from runner state when populating KernelArgs. - * @return 0 on success, negative on error - */ -int run_runtime( - DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, - int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix -); - /** * One-shot platform-side log init. Called once by ChipWorker::init() right * after dlopen, before any other entry. Pushes the user's chosen severity + - * INFO verbosity into HostLogger and into runner state (which run_runtime + * INFO verbosity into HostLogger and into runner state (which run_prepared * later forwards to AICPU via KernelArgs). 
* * On onboard, also calls dlog_setlevel(-1, log_level, 0) so CANN's runtime @@ -127,6 +96,84 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v); */ int finalize_device(DeviceContextHandle ctx); +/* =========================================================================== + * Per-callable_id preparation + * + * The triplet below decouples the one-shot prep work (kernel upload + orch SO + * H2D + caching keyed by `callable_id`) from each `run_prepared` invocation, + * so the per-run cost shrinks to "rebuild Runtime args + launch". Callers + * keep a stable small-int `callable_id` per ChipCallable; the platform side + * caches the prepared state in a fixed-size table (cap 64, see + * MAX_REGISTERED_CALLABLE_IDS in the AICPU executor) and rejects ids outside + * `[0, 64)`. Lifetime: caller must `unregister_callable` before + * `finalize_device` to release the device-side orch SO buffer; kernels stay + * resident until finalize regardless. + * =========================================================================== */ + +/** + * Stage a callable for repeated cheap launches under the given `callable_id`. + * + * Uploads child kernels into the DeviceRunner's func_id-keyed cache and + * copies the orchestration SO bytes into a device-resident buffer keyed by + * the SO's ELF Build-ID hash (so two callable_ids with identical SO share + * one buffer). Subsequent `run_prepared(callable_id, ...)` calls reuse this + * state. + * + * @return 0 on success, negative on error (NULL ctx, callable_id out of + * range, or upload/copy failure). + */ +int prepare_callable( + DeviceContextHandle ctx, int32_t callable_id, const void *callable, int device_id, const uint8_t *aicpu_binary, + size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size +); + +/** + * Launch a callable previously staged via `prepare_callable`. + * + * Looks up the prepared state by `callable_id`, restores the kernel func_id ↔ + * dev_addr table onto a fresh Runtime, and dispatches without re-uploading + * kernels or re-copying the orch SO. The AICPU side dispatches via + * `orch_so_table_[callable_id]` (see runtime.h::set_active_callable_id). The + * first run for a given callable_id sets `register_new_callable_id_` so the + * AICPU does its one-time dlopen. + * + * @return 0 on success, negative on error (no prep state, NULL ctx, etc.). + */ +int run_prepared( + DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, const char *output_prefix +); + +/** + * Drop the prepared state for `callable_id` and release the per-id share of + * the device orch SO buffer. The buffer itself is freed only when its + * hash-keyed refcount drops to zero (different callable_ids with identical + * SO share one allocation). + * + * Kernel binaries uploaded by `prepare_callable` remain resident — they are + * shared across callables by func_id and only released by `finalize_device`. + * + * @return 0 on success or if callable_id was not registered, negative on error. + */ +int unregister_callable(DeviceContextHandle ctx, int32_t callable_id); + +/** + * Number of distinct callable_ids the AICPU has been asked to dlopen for on + * the device bound to `ctx`. Returns 0 on runtime variants without per-cid + * registration support. 
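The lifetime rules above (one shared device buffer per distinct SO, hash-keyed refcount, kernels outliving cids) can be modeled in a few lines. Illustrative Python only; `build_id_hash`, `copy_to_device` and `free_device` are stand-ins for the real ELF Build-ID hashing and device memory calls:

```python
# Model of the orch SO dedup contract: one device buffer per distinct SO
# binary, refcounted by the callable_ids that reference it.
so_buffers = {}    # so_hash -> [refcount, device_buffer]
cid_to_hash = {}   # callable_id -> so_hash

def prepare(cid, so_bytes):
    h = build_id_hash(so_bytes)                       # stand-in for Build-ID hash
    if h not in so_buffers:
        so_buffers[h] = [0, copy_to_device(so_bytes)]  # single H2D per distinct SO
    so_buffers[h][0] += 1
    cid_to_hash[cid] = h

def unregister(cid):
    h = cid_to_hash.pop(cid)
    so_buffers[h][0] -= 1
    if so_buffers[h][0] == 0:                          # last cid sharing this SO
        free_device(so_buffers[h][1])
        del so_buffers[h]
```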
Used by tests to assert that `prepare_callable` + + * repeated `run_prepared` calls do not trigger redundant AICPU dlopens. + */ +size_t get_aicpu_dlopen_count(DeviceContextHandle ctx); + +/** + * Number of host-side dlopens triggered by `prepare_callable` on the host + * orchestration variants (host_build_graph). Mirrors `get_aicpu_dlopen_count` + * for the trb path. Returns 0 on runtime variants whose orchestration runs on + * the device. + */ +size_t get_host_dlopen_count(DeviceContextHandle ctx); + #ifdef __cplusplus } #endif diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/conftest.py b/tests/st/a2a3/host_build_graph/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a2a3/host_build_graph/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. +""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py new file mode 100644 index 000000000..00a658cc6 --- /dev/null +++ b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""End-to-end test for ChipWorker.prepare_callable / run_prepared on host_build_graph. + +Mirrors tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable for the hbg +variant: instead of the AICPU dlopening the orch SO once per cid, hbg dlopens +on the host inside prepare_callable and replays the cached handle/fn pointer +on every run_prepared. The dlopen counter to assert is `host_dlopen_count`, +not `aicpu_dlopen_count` (which stays 0 — AICPU never sees the orch SO). +""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +_VECTOR_KERNELS = "../vector_example/kernels" + +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + + +@scene_test(level=2, runtime="host_build_graph") +class TestPreparedCallableHbg(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable on hbg. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. 
+ """ + + CALLABLE = { + "orchestration": { + "source": f"{_VECTOR_KERNELS}/orchestration/example_orch.cpp", + "function_name": "build_example_graph", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 3, "block_dim": 3} + _PLATFORMS = ["a2a3sim", "a2a3"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + # vector_example orchestration computes (a + b + 1) * (a + b + 2) + a, b = args.a, args.b + args.f[:] = (a + b + 1) * (a + b + 2) + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) + + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) + + # ------------------------------------------------------------------ + # host_dlopen_count assertions (hbg path). + # + # hbg increments host_dlopen_count on every register_prepared_callable_host_orch + # invocation (i.e. each `prepare_callable` call), independent of how many + # times run_prepared is invoked afterwards. AICPU never dlopens the orch + # SO on this variant, so aicpu_dlopen_count stays at 0. 
+ # ------------------------------------------------------------------ + + def _setup_dlopen_count_test(self, st_worker, st_platform): + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + """prepare(primary) + run × 5 → host_dlopen delta == 1, aicpu == 0.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1, ( + f"expected exactly 1 new host dlopen for 5 runs of primary cid, " + f"got delta {st_worker.host_dlopen_count - baseline}" + ) + assert st_worker.aicpu_dlopen_count == baseline_aicpu, "hbg must not trigger any AICPU orch SO dlopens" + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + """prepare(primary)+prepare(secondary) + alternating runs × 5 → host_dlopen delta == 2.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2, ( + f"expected exactly 2 new host dlopens for two cids interleaved, " + f"got delta {st_worker.host_dlopen_count - baseline}" + ) + assert st_worker.aicpu_dlopen_count == baseline_aicpu + finally: + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + """prepare(primary) twice → second call raises RuntimeError.""" + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + """prepare+run+unregister+prepare+run on the same cid → host_dlopen delta == 2. + + Counter is monotonic — re-prepare always counts a fresh dlopen. 
+ """ + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + registered = False + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1 + st_worker.unregister_callable(_CID_PRIMARY) + registered = False + after_unreg = st_worker.host_dlopen_count + assert after_unreg - baseline == 1, ( + f"unregister must NOT decrement the host dlopen counter; baseline={baseline}, after_unreg={after_unreg}" + ) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2, ( + f"after re-prepare expected counter +2 (two distinct host dlopens), " + f"got delta {st_worker.host_dlopen_count - baseline}" + ) + finally: + if registered: + st_worker.unregister_callable(_CID_PRIMARY) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/conftest.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. 
+""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py new file mode 100644 index 000000000..62ced849b --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""End-to-end test for ChipWorker.prepare_callable / run_prepared / unregister_callable. + +Reuses the vector_example orchestration + AIV kernels. Exercises: + - prepare_callable once, then run_prepared twice (second run proves the + AICPU-side dlopen cache / host-side orch SO dedup is working — no re-upload). + - Two distinct callable_ids sharing the same orch SO binary: verifies both + produce correct output independently. + - unregister_callable after runs complete: should not raise. + - aicpu_dlopen_count assertions covering: same-cid repeat, multi-cid + interleaving, double-prepare rejection, and unregister + re-prepare. +""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +_VECTOR_KERNELS = "../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels" + +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestPreparedCallable(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable ABI. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. 
+ """ + + CALLABLE = { + "orchestration": { + "source": f"{_VECTOR_KERNELS}/orchestration/example_orchestration.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3} + _PLATFORMS = ["a2a3sim", "a2a3"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + # 1) prepare two callable_ids with the SAME callable (shared orch SO) + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) + + # 2) run_prepared primary cid twice (second run proves dedup/cache hit) + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 3) run_prepared secondary cid — different slot, same SO, must also work + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 4) unregister both — should not raise + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) + + # ------------------------------------------------------------------ + # aicpu_dlopen_count assertions. + # + # The class-scope L2 worker is shared across test methods in this + # class (see ./conftest.py), so the counter can be non-zero on entry + # from prior methods. Each test below snapshots the counter on entry, + # asserts the *delta* introduced by the scenario, then unregisters + # everything it staged. unregister_callable does NOT decrement the + # counter (the counter is monotonic — see test_dlopen_count_unregister_re_prepare). 
+ # ------------------------------------------------------------------ + + def _setup_dlopen_count_test(self, st_worker, st_platform): + """Common fixture: build callable + config, return (callable, config, case).""" + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + """Case A: prepare(primary) + run × 5 → dlopen_count delta == 1.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1, ( + f"expected exactly 1 new dlopen for 5 runs of primary cid, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + """Case B: prepare(primary)+prepare(secondary) + alternating runs × 5 → delta == 2.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"expected exactly 2 new dlopens for two cids interleaved, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + """Case C: prepare(primary) + prepare(primary) → second call raises RuntimeError.""" + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + """Case D: prepare+run+unregister+prepare+run on the same cid → delta == 2. + + unregister erases the cid from aicpu_seen_callable_ids_, so the second + prepare/run pair sets register_new_callable_id_ again and the AICPU + does a fresh dlopen. The counter is monotonic (does NOT decrement on + unregister), so the delta after the second cycle is 2. 
+ """ + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + registered = False + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1 + st_worker.unregister_callable(_CID_PRIMARY) + registered = False + after_unreg = st_worker.aicpu_dlopen_count + assert after_unreg - baseline == 1, ( + f"unregister must NOT decrement the dlopen counter; baseline={baseline}, after_unreg={after_unreg}" + ) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"after re-prepare expected counter +2 (two distinct AICPU dlopens), " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + if registered: + st_worker.unregister_callable(_CID_PRIMARY) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a5/host_build_graph/prepared_callable/conftest.py b/tests/st/a5/host_build_graph/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a5/host_build_graph/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. 
+""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add.cpp b/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add.cpp new file mode 100644 index 000000000..8e2094807 --- /dev/null +++ b/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ float *src0 = reinterpret_cast<__gm__ float *>(args[0]); + __gm__ float *src1 = reinterpret_cast<__gm__ float *>(args[1]); + __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]); + + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = pto::Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(vRows, vCols); + TileData src1Tile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + + pipe_sync(); +} diff --git a/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add_scalar_inplace.cpp b/tests/st/a5/host_build_graph/prepared_callable/kernels/aiv/kernel_add_scalar_inplace.cpp new file mode 100644 index 000000000..056442e21 --- /dev/null +++ 
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include
+#include
+
+using namespace pto;
+
+#include "pipe_sync.h"
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    __gm__ float *inout = reinterpret_cast<__gm__ float *>(args[0]);
+
+    union {
+        uint64_t u64;
+        float f32;
+    } converter;
+    converter.u64 = args[1];
+    float scalar = converter.f32;
+
+    constexpr int kTRows_ = 128;
+    constexpr int kTCols_ = 128;
+    constexpr int vRows = 128;
+    constexpr int vCols = 128;
+
+    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
+    using DynStridDim5 = pto::Stride<1, 1, 1, kTCols_, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_>;
+
+    TileData srcTile(vRows, vCols);
+    TileData dstTile(vRows, vCols);
+    TASSIGN(srcTile, 0x0);
+    TASSIGN(dstTile, 0x10000);
+
+    GlobalData inoutGlobal(inout);
+
+    TLOAD(srcTile, inoutGlobal);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    TADDS(dstTile, srcTile, scalar);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(inoutGlobal, dstTile);
+
+    pipe_sync();
+}
diff --git a/tests/st/a5/host_build_graph/prepared_callable/kernels/orchestration/dump_tensor_orch.cpp b/tests/st/a5/host_build_graph/prepared_callable/kernels/orchestration/dump_tensor_orch.cpp
new file mode 100644
index 000000000..8c8d807c4
--- /dev/null
+++ b/tests/st/a5/host_build_graph/prepared_callable/kernels/orchestration/dump_tensor_orch.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Dump-tensor interface demo for host_build_graph.
+ *
+ * Demonstrates the two ways to register tensor metadata for dump:
+ *   Task 0 (add): add_task() + set_tensor_info_to_task()
+ *   Task 1 (add_scalar_inplace): add_task_with_tensor_info()
+ *
+ * Computation: f = (a + b) + 1 (a=2, b=3 → f=6)
+ */
+
+#include "orchestration_api.h"  // NOLINT(build/include_subdir)
+
+extern "C" {
+
+int build_dump_tensor_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
+    void *host_a = orch_args.tensor(0).data_as<void>();
+    void *host_b = orch_args.tensor(1).data_as<void>();
+    void *host_f = orch_args.tensor(2).data_as<void>();
+    size_t size_a = orch_args.tensor(0).nbytes();
+    size_t size_b = orch_args.tensor(1).nbytes();
+    size_t size_f = orch_args.tensor(2).nbytes();
+    uint32_t size = orch_args.tensor(0).shapes[0];
+
+    TensorInfo ext_a_info = make_tensor_info_from_tensor_arg(orch_args.tensor(0));
+    TensorInfo ext_b_info = make_tensor_info_from_tensor_arg(orch_args.tensor(1));
+    TensorInfo ext_f_info = make_tensor_info_from_tensor_arg(orch_args.tensor(2));
+
+    void *dev_a = device_malloc(runtime, size_a);
+    copy_to_device(runtime, dev_a, host_a, size_a);
+
+    void *dev_b = device_malloc(runtime, size_b);
+    copy_to_device(runtime, dev_b, host_b, size_b);
+
+    void *dev_f = device_malloc(runtime, size_f);
+    record_tensor_pair(runtime, host_f, dev_f, size_f);
+
+    // Task 0: a + b → f (add_task + set_tensor_info_to_task)
+    uint64_t args_t0[4] = {
+        reinterpret_cast<uint64_t>(dev_a),
+        reinterpret_cast<uint64_t>(dev_b),
+        reinterpret_cast<uint64_t>(dev_f),
+        size,
+    };
+    int t0 = add_task(runtime, args_t0, 4, 0, CoreType::AIV);
+    TensorInfo t0_info[] = {ext_a_info, ext_b_info, ext_f_info};
+    set_tensor_info_to_task(runtime, t0, t0_info, 3);
+
+    // Task 1: f += 1.0 (add_task_with_tensor_info)
+    union {
+        float f32;
+        uint64_t u64;
+    } sc;
+    sc.f32 = 1.0f;
+    uint64_t args_t1[3] = {reinterpret_cast<uint64_t>(dev_f), sc.u64, size};
+    TensorInfo t1_info[] = {ext_f_info};
+    int t1 = add_task_with_tensor_info(runtime, args_t1, 3, 1, CoreType::AIV, t1_info, 1);
+
+    add_successor(runtime, t0, t1);
+
+    return 0;
+}
+
+}  // extern "C"
diff --git a/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py
new file mode 100644
index 000000000..1efd00806
--- /dev/null
+++ b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""End-to-end test for ChipWorker.prepare_callable / run_prepared on a5/host_build_graph.
+
+Mirrors tests/st/a2a3/host_build_graph/prepared_callable for the a5 variant.
+Reuses the dump_tensor example kernels (a + b + 1) since a5/hbg has no
+vector_example today and dump_tensor already runs cleanly on a5sim.
+""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + + +@scene_test(level=2, runtime="host_build_graph") +class TestPreparedCallableHbgA5(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable on a5/hbg. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. + """ + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/dump_tensor_orch.cpp", + "function_name": "build_dump_tensor_graph", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/kernel_add_scalar_inplace.cpp", + "core_type": "aiv", + "signature": [D.INOUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 3, "block_dim": 3} + _PLATFORMS = ["a5sim", "a5"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + # dump_tensor orchestration computes f = (a + b) + 1 + args.f[:] = (args.a + args.b) + 1 + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) + + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) + + def _setup_dlopen_count_test(self, st_worker, st_platform): + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, 
config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1 + assert st_worker.aicpu_dlopen_count == baseline_aicpu + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + baseline_aicpu = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2 + assert st_worker.aicpu_dlopen_count == baseline_aicpu + finally: + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.host_dlopen_count + registered = False + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 1 + st_worker.unregister_callable(_CID_PRIMARY) + registered = False + assert st_worker.host_dlopen_count - baseline == 1, "unregister must NOT decrement the host dlopen counter" + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.host_dlopen_count - baseline == 2 + finally: + if registered: + st_worker.unregister_callable(_CID_PRIMARY) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/conftest.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/conftest.py new file mode 100644 index 000000000..2a4ed2406 --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/conftest.py @@ -0,0 +1,61 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Isolated L2 worker for prepared_callable white-box tests. + +The default ``st_worker`` (root conftest) is shared across L2 ST classes +in a session-scoped pool — correct for ordinary business tests but not +for prepared_callable, which asserts on the worker's internal cid table +(``aicpu_dlopen_count`` / ``host_dlopen_count`` deltas, double-prepare +``RuntimeError``, SO cache hits). Sharing the worker breaks those +assertions: other tests' ``register()`` calls leave residue on the +hard-coded cids 0/1. + +Override ``st_worker`` here as class-scope, building a fresh L2 worker +that does **not** enter ``_l2_worker_pool``. Cost: one extra init/close +per prepared_callable test class. + +The 4 prepared_callable directories (a2a3/a5 × tensormap_and_ringbuffer/ +host_build_graph) share identical conftest content — keep them in sync. +""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(scope="class") +def st_worker(request, st_platform, device_pool): + cls = request.node.cls + if cls is None or not hasattr(cls, "_st_runtime"): + pytest.skip("isolated st_worker requires a SceneTestCase subclass") + + runtime = cls._st_runtime + build = request.config.getoption("--build", default=False) + + ids = device_pool.allocate(1) + if not ids: + pytest.fail("no devices available for isolated L2 worker") + dev_id = ids[0] + try: + from simpler.worker import Worker # noqa: PLC0415 + + w = Worker( + level=2, + device_id=dev_id, + platform=st_platform, + runtime=runtime, + build=build, + ) + w.init() + try: + yield w + finally: + w.close() + finally: + device_pool.release(ids) diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py new file mode 100644 index 000000000..a8a7cedf2 --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""End-to-end test for ChipWorker.prepare_callable / run_prepared / unregister_callable on a5/trb. 
+ +Mirrors tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable. Reuses the +vector_example orchestration + AIV kernels. Exercises: + - prepare_callable once, then run_prepared twice (second run proves the + AICPU-side dlopen cache / host-side orch SO dedup is working — no re-upload). + - Two distinct callable_ids sharing the same orch SO binary: verifies both + produce correct output independently. + - unregister_callable after runs complete: should not raise. + - aicpu_dlopen_count assertions covering: same-cid repeat, multi-cid + interleaving, double-prepare rejection, and unregister + re-prepare. +""" + +import pytest +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _build_chip_task_args, _compare_outputs + +_VECTOR_KERNELS = "../../../../../examples/a5/tensormap_and_ringbuffer/vector_example/kernels" + +# White-box cids: this class owns the entire cid table of its isolated +# Worker (see ./conftest.py), so picking 0 and 1 directly is intentional — +# they signify "the first two slots in a fresh table" rather than "any +# free cid". Naming them makes that intent explicit. +_CID_PRIMARY = 0 +_CID_SECONDARY = 1 + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestPreparedCallable(SceneTestCase): + """Exercise prepare_callable / run_prepared / unregister_callable ABI on a5/trb. + + Requires an isolated L2 ``Worker`` (cid table starts empty); this is + provided by the directory-local ``conftest.py`` overriding ``st_worker`` + with a class-scope fixture. + """ + + CALLABLE = { + "orchestration": { + "source": f"{_VECTOR_KERNELS}/orchestration/example_orchestration.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{_VECTOR_KERNELS}/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3} + _PLATFORMS = ["a5sim", "a5"] + + CASES = [ + { + "name": "prepare_run_twice", + "platforms": _PLATFORMS, + "config": _COMMON_CONFIG, + "params": {"a": 2.0, "b": 3.0}, + }, + ] + + def generate_args(self, params): + size = 128 * 128 + a, b = params["a"], params["b"] + return TaskArgsBuilder( + Tensor("a", torch.full((size,), a, dtype=torch.float32)), + Tensor("b", torch.full((size,), b, dtype=torch.float32)), + Tensor("f", torch.zeros(size, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) + + def _run_and_validate_l2( + self, + worker, + callable_obj, + case, + rounds=1, + skip_golden=False, + enable_l2_swimlane=False, + enable_dump_tensor=False, + enable_pmu=0, + output_prefix="", + ): + params = case.get("params", {}) + config_dict = case.get("config", {}) + orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) + + config = self._build_config(config_dict) + + # 1) prepare two callable_ids with the SAME callable (shared orch SO) + worker.prepare_callable(_CID_PRIMARY, callable_obj) + worker.prepare_callable(_CID_SECONDARY, callable_obj) + + # 2) run_prepared primary cid twice 
(second run proves dedup/cache hit) + for _ in range(2): + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_PRIMARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 3) run_prepared secondary cid — different slot, same SO, must also work + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + + worker.run_prepared(_CID_SECONDARY, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + # 4) unregister both — should not raise + worker.unregister_callable(_CID_PRIMARY) + worker.unregister_callable(_CID_SECONDARY) + + # ------------------------------------------------------------------ + # aicpu_dlopen_count assertions. + # + # The class-scope L2 worker is shared across test methods in this + # class (see ./conftest.py), so the counter can be non-zero on entry + # from prior methods. Each test below snapshots the counter on entry, + # asserts the *delta* introduced by the scenario, then unregisters + # everything it staged. unregister_callable does NOT decrement the + # counter (the counter is monotonic — see test_dlopen_count_unregister_re_prepare). + # ------------------------------------------------------------------ + + def _setup_dlopen_count_test(self, st_worker, st_platform): + """Common fixture: build callable + config, return (callable, config, case).""" + case = self.CASES[0] + callable_obj = self.build_callable(st_platform) + config = self._build_config(case["config"]) + return callable_obj, config, case + + def _run_one(self, worker, cid, callable_obj, config, case): + params = case["params"] + orch_sig = self.CALLABLE["orchestration"]["signature"] + test_args = self.generate_args(params) + chip_args, output_names = _build_chip_task_args(test_args, orch_sig) + golden_args = test_args.clone() + self.compute_golden(golden_args, params) + worker.run_prepared(cid, chip_args, config=config) + _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + + def test_dlopen_count_same_cid_repeated_runs(self, st_platform, st_worker): + """Case A: prepare(primary) + run × 5 → dlopen_count delta == 1.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1, ( + f"expected exactly 1 new dlopen for 5 runs of primary cid, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_two_cids_alternating(self, st_platform, st_worker): + """Case B: prepare(primary)+prepare(secondary) + alternating runs × 5 → delta == 2.""" + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + st_worker.prepare_callable(_CID_SECONDARY, callable_obj) + for _ in range(5): + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + self._run_one(st_worker, _CID_SECONDARY, callable_obj, 
config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"expected exactly 2 new dlopens for two cids interleaved, " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + st_worker.unregister_callable(_CID_SECONDARY) + + def test_dlopen_count_double_prepare_raises(self, st_platform, st_worker): + """Case C: prepare(primary) + prepare(primary) → second call raises RuntimeError.""" + callable_obj, _config, _case = self._setup_dlopen_count_test(st_worker, st_platform) + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + with pytest.raises(RuntimeError): + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + finally: + st_worker.unregister_callable(_CID_PRIMARY) + + def test_dlopen_count_unregister_re_prepare(self, st_platform, st_worker): + """Case D: prepare+run+unregister+prepare+run on the same cid → delta == 2. + + unregister erases the cid from aicpu_seen_callable_ids_, so the second + prepare/run pair sets register_new_callable_id_ again and the AICPU + does a fresh dlopen. The counter is monotonic (does NOT decrement on + unregister), so the delta after the second cycle is 2. + """ + callable_obj, config, case = self._setup_dlopen_count_test(st_worker, st_platform) + baseline = st_worker.aicpu_dlopen_count + registered = False + try: + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 1 + st_worker.unregister_callable(_CID_PRIMARY) + registered = False + after_unreg = st_worker.aicpu_dlopen_count + assert after_unreg - baseline == 1, ( + f"unregister must NOT decrement the dlopen counter; baseline={baseline}, after_unreg={after_unreg}" + ) + st_worker.prepare_callable(_CID_PRIMARY, callable_obj) + registered = True + self._run_one(st_worker, _CID_PRIMARY, callable_obj, config, case) + assert st_worker.aicpu_dlopen_count - baseline == 2, ( + f"after re-prepare expected counter +2 (two distinct AICPU dlopens), " + f"got delta {st_worker.aicpu_dlopen_count - baseline}" + ) + finally: + if registered: + st_worker.unregister_callable(_CID_PRIMARY) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/explicit_fatal/test_explicit_fatal.py b/tests/st/explicit_fatal/test_explicit_fatal.py index 8a88f0f41..f6c8a34c5 100644 --- a/tests/st/explicit_fatal/test_explicit_fatal.py +++ b/tests/st/explicit_fatal/test_explicit_fatal.py @@ -42,12 +42,13 @@ def test_explicit_fatal_reports(st_platform, st_device_ids): chip_callable = _build_chip_callable(st_platform) worker = Worker(level=2, platform=st_platform, runtime=RUNTIME, device_id=int(st_device_ids[0])) + cid = worker.register(chip_callable) worker.init() try: config = CallConfig() config.block_dim = 24 config.aicpu_thread_num = 4 - with pytest.raises(RuntimeError, match=r"run_runtime failed with code -9"): - worker.run(chip_callable, ChipStorageTaskArgs(), config) + with pytest.raises(RuntimeError, match=r"(run_runtime|run_prepared) failed with code -9"): + worker.run(cid, ChipStorageTaskArgs(), config) finally: worker.close() diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index b3caacd97..5ad49cd52 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -246,6 +246,26 @@ add_task_interface_test(test_child_memory types/test_child_memory.cpp) add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp) 
add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp) +# Per-callable_id orch SO file naming regression (see rtStreamSynchronize +# 507018 root cause). Compiles the a2a3 onboard `create_orch_so_file` +# against the test source so it runs on no-hw runners too. +add_executable(test_orch_so_file + common/test_orch_so_file.cpp + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/onboard/aicpu/orch_so_file.cpp +) +target_include_directories(test_orch_so_file PRIVATE + ${GTEST_INCLUDE_DIRS} + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include +) +target_compile_options(test_orch_so_file PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0) +target_link_libraries(test_orch_so_file PRIVATE + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread +) +add_test(NAME test_orch_so_file COMMAND test_orch_so_file) +set_tests_properties(test_orch_so_file PROPERTIES LABELS "no_hardware") + # --------------------------------------------------------------------------- # A2A3 tests (src/a2a3/runtime/tensormap_and_ringbuffer/) # --------------------------------------------------------------------------- diff --git a/tests/ut/cpp/common/test_orch_so_file.cpp b/tests/ut/cpp/common/test_orch_so_file.cpp new file mode 100644 index 000000000..6e1b32bd6 --- /dev/null +++ b/tests/ut/cpp/common/test_orch_so_file.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Regression test for the per-callable_id orch SO file naming contract. +// +// The onboard variants of `create_orch_so_file` (src/{a2a3,a5}/platform/ +// onboard/aicpu/orch_so_file.cpp) historically used pid-only naming, which +// silently broke once multi-callable dispatch was introduced on the same +// device process: the second cid's `O_TRUNC` open +// shredded the first cid's already-dlopen'd SO image and the next launch +// on cid=0 SIGBUS'd inside the AICPU executor (manifesting as +// `rtStreamSynchronize (AICPU) failed: 507018` on the host). +// +// The fix is to embed `callable_id` in the file name when cid >= 0. This +// test exercises the contract directly: distinct cids must produce distinct +// paths, and the legacy cid=-1 path must remain pid-only (no behavioural +// change for variants that never adopt per-cid dispatch). 
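+//
+// A minimal sketch of the contract under test (the per-cid file name shown
+// is illustrative only; the tests assert just distinctness for cid >= 0 and
+// the exact pid-only format for cid == -1):
+//
+//   create_orch_so_file(dir, /*callable_id=*/0,  p0, n);  // e.g. <dir>/libdevice_orch_<pid>_0.so
+//   create_orch_so_file(dir, /*callable_id=*/1,  p1, n);  // must differ from p0
+//   create_orch_so_file(dir, /*callable_id=*/-1, p,  n);  // <dir>/libdevice_orch_<pid>.so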
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include <unistd.h>
+
+#include "aicpu/orch_so_file.h"
+
+namespace {
+
+std::string mkscratch_dir() {
+    char templ[] = "/tmp/orch_so_file_ut_XXXXXX";
+    const char *dir = mkdtemp(templ);
+    if (dir == nullptr) {
+        std::abort();
+    }
+    return std::string(dir);
+}
+
+void rmtree(const std::string &dir) {
+    std::string cmd = "rm -rf '" + dir + "'";
+    (void)std::system(cmd.c_str());
+}
+
+}  // namespace
+
+TEST(OrchSoFile, DistinctCallableIdsProduceDistinctPaths) {
+    // Repro for the 507018 SIGBUS bug: with pid-only naming, cid=0 and
+    // cid=1 collide on `libdevice_orch_<pid>.so` and the second
+    // O_TRUNC open silently shreds the first cid's already-dlopen'd
+    // image. Embedding the cid restores per-callable file isolation.
+    const std::string dir = mkscratch_dir();
+    char path0[256] = {};
+    char path1[256] = {};
+
+    int32_t fd0 = create_orch_so_file(dir.c_str(), /*callable_id=*/0, path0, sizeof(path0));
+    ASSERT_GE(fd0, 0) << "create_orch_so_file(cid=0) failed";
+    close(fd0);
+
+    int32_t fd1 = create_orch_so_file(dir.c_str(), /*callable_id=*/1, path1, sizeof(path1));
+    ASSERT_GE(fd1, 0) << "create_orch_so_file(cid=1) failed";
+    close(fd1);
+
+    EXPECT_STRNE(path0, path1) << "Distinct cids must yield distinct file paths "
+                                  "(otherwise O_TRUNC would corrupt the first SO).";
+
+    rmtree(dir);
+}
+
+TEST(OrchSoFile, LegacySentinelKeepsPidOnlyNaming) {
+    // Variants that never adopt per-cid dispatch pass cid=-1; the file
+    // name must remain pid-only so existing callers see no change.
+    const std::string dir = mkscratch_dir();
+    char path[256] = {};
+
+    int32_t fd = create_orch_so_file(dir.c_str(), /*callable_id=*/-1, path, sizeof(path));
+    ASSERT_GE(fd, 0);
+    close(fd);
+
+    char expected[256];
+    std::snprintf(expected, sizeof(expected), "%s/libdevice_orch_%d.so", dir.c_str(), getpid());
+    EXPECT_STREQ(path, expected) << "Legacy (cid=-1) path must remain pid-only";
+
+    rmtree(dir);
+}
diff --git a/tests/ut/cpp/hierarchical/test_orchestrator.cpp b/tests/ut/cpp/hierarchical/test_orchestrator.cpp
index 7c0d45978..59371c6da 100644
--- a/tests/ut/cpp/hierarchical/test_orchestrator.cpp
+++ b/tests/ut/cpp/hierarchical/test_orchestrator.cpp
@@ -70,7 +70,7 @@ struct OrchestratorFixture : public ::testing::Test {
 TEST_F(OrchestratorFixture, IndependentTaskIsImmediatelyReady) {
   auto a = single_tensor_args(0xCAFE, TensorArgType::OUTPUT);
-  auto res = orch.submit_next_level(/*callable=*/0xDEAD, a, cfg);
+  auto res = orch.submit_next_level(/*callable_id=*/42, a, cfg);
   EXPECT_NE(res.task_slot, INVALID_SLOT);
   TaskSlot slot;
@@ -82,13 +82,13 @@ TEST_F(OrchestratorFixture, IndependentTaskIsImmediatelyReady) {
 TEST_F(OrchestratorFixture, DependentTaskIsPending) {
   // Task A produces an OUTPUT at key 0xBEEF
   auto args_a = single_tensor_args(0xBEEF, TensorArgType::OUTPUT);
-  auto a = orch.submit_next_level(0xDEAD, args_a, cfg);
+  auto a = orch.submit_next_level(42, args_a, cfg);
   TaskSlot a_slot;
   rq.try_pop(a_slot);
   // Task B reads INPUT at the same key -- depends on A
   auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT);
-  auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
+  auto b = orch.submit_next_level(42, args_b, cfg);
   EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING);
   EXPECT_EQ(S(b.task_slot).fanin_count, 1);
@@ -98,7 +98,7 @@ TEST_F(OrchestratorFixture, TensorMapTracksProducer) {
   auto args_a = single_tensor_args(0x1234, TensorArgType::OUTPUT);
-  auto a = orch.submit_next_level(0xDEAD, args_a,
cfg); + auto a = orch.submit_next_level(42, args_a, cfg); TaskSlot drain_slot; rq.try_pop(drain_slot); @@ -107,7 +107,7 @@ TEST_F(OrchestratorFixture, TensorMapTracksProducer) { TEST_F(OrchestratorFixture, OnConsumedCleansUpTensorMap) { auto args_a = single_tensor_args(0x42, TensorArgType::OUTPUT); - auto a = orch.submit_next_level(0xDEAD, args_a, cfg); + auto a = orch.submit_next_level(42, args_a, cfg); TaskSlot slot; rq.try_pop(slot); @@ -123,7 +123,7 @@ TEST_F(OrchestratorFixture, OnConsumedCleansUpTensorMap) { TEST_F(OrchestratorFixture, ScopeRegistersAndReleasesRef) { orch.scope_begin(); auto args_a = single_tensor_args(0x77, TensorArgType::OUTPUT); - auto a = orch.submit_next_level(0xDEAD, args_a, cfg); + auto a = orch.submit_next_level(42, args_a, cfg); TaskSlot slot; rq.try_pop(slot); @@ -147,13 +147,13 @@ TEST_F(OrchestratorFixture, ScopeRegistersAndReleasesRef) { TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) { // OUTPUT-tagged input registers a producer auto args_a = single_tensor_args(0xAAAA, TensorArgType::OUTPUT); - auto a = orch.submit_next_level(0xDEAD, args_a, cfg); + auto a = orch.submit_next_level(42, args_a, cfg); TaskSlot drain_slot; rq.try_pop(drain_slot); // Second task references same key but tagged NO_DEP -- should be independent auto args_b = single_tensor_args(0xAAAA, TensorArgType::NO_DEP); - auto b = orch.submit_next_level(0xDEAD, args_b, cfg); + auto b = orch.submit_next_level(42, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::READY); EXPECT_EQ(S(b.task_slot).fanin_count, 0); } @@ -161,7 +161,7 @@ TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) { TEST_F(OrchestratorFixture, GroupTaskStoresArgsListPerMember) { TaskArgs a0 = single_tensor_args(0xA0, TensorArgType::OUTPUT); TaskArgs a1 = single_tensor_args(0xA1, TensorArgType::OUTPUT); - auto res = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg); + auto res = orch.submit_next_level_group(42, {a0, a1}, cfg); EXPECT_NE(res.task_slot, INVALID_SLOT); EXPECT_TRUE(S(res.task_slot).is_group()); @@ -179,7 +179,7 @@ TEST_F(OrchestratorFixture, GroupTaskStoresArgsListPerMember) { TEST_F(OrchestratorFixture, SingleTaskStoresTaskArgsDirectly) { TaskArgs a0 = single_tensor_args(0xC0, TensorArgType::OUTPUT); - auto res = orch.submit_next_level(0xDEAD, a0, cfg); + auto res = orch.submit_next_level(42, a0, cfg); ASSERT_NE(res.task_slot, INVALID_SLOT); EXPECT_FALSE(S(res.task_slot).is_group()); EXPECT_EQ(S(res.task_slot).group_size(), 1); @@ -200,7 +200,7 @@ TEST_F(OrchestratorFixture, OutputAutoAllocsFromHeapRing) { t.dtype = DataType::UINT8; args.add_tensor(t, TensorArgType::OUTPUT); - auto res = orch.submit_next_level(0xDEAD, args, cfg); + auto res = orch.submit_next_level(42, args, cfg); ASSERT_NE(res.task_slot, INVALID_SLOT); uint64_t data = S(res.task_slot).task_args.tensor(0).data; @@ -220,7 +220,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) { // the alloc-slot (so its HeapRing slab stays live while they write) // must tag the buffer INOUT. 
auto creator_args = single_tensor_args(0xFEED, TensorArgType::OUTPUT); - auto creator = orch.submit_next_level(0xDEAD, creator_args, cfg); + auto creator = orch.submit_next_level(42, creator_args, cfg); TaskSlot drain; rq.try_pop(drain); // Mark the creator COMPLETED so the new submit mimics the alloc-slot @@ -228,7 +228,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) { S(creator.task_slot).state.store(TaskState::COMPLETED, std::memory_order_relaxed); auto writer_args = single_tensor_args(0xFEED, TensorArgType::INOUT); - auto writer = orch.submit_next_level(0xDEAD, writer_args, cfg); + auto writer = orch.submit_next_level(42, writer_args, cfg); TaskSlot writer_slot; rq.try_pop(writer_slot); @@ -259,13 +259,13 @@ TEST_F(OrchestratorFixture, OutputAndOutputExistingAreInsertOnly) { }; for (Case c : {Case{0xABCD, TensorArgType::OUTPUT}, Case{0xBEEF, TensorArgType::OUTPUT_EXISTING}}) { auto prior_args = single_tensor_args(c.key, TensorArgType::OUTPUT); - auto prior = orch.submit_next_level(0xDEAD, prior_args, cfg); + auto prior = orch.submit_next_level(42, prior_args, cfg); TaskSlot drain; rq.try_pop(drain); S(prior.task_slot).state.store(TaskState::COMPLETED, std::memory_order_relaxed); auto writer_args = single_tensor_args(c.key, c.tag); - auto writer = orch.submit_next_level(0xDEAD, writer_args, cfg); + auto writer = orch.submit_next_level(42, writer_args, cfg); EXPECT_EQ(tm.lookup(TensorKey{c.key, -1}), writer.task_slot); EXPECT_EQ(S(writer.task_slot).fanin_count, 0); diff --git a/tests/ut/cpp/hierarchical/test_scheduler.cpp b/tests/ut/cpp/hierarchical/test_scheduler.cpp index a66dcfd27..2fc7ba8c1 100644 --- a/tests/ut/cpp/hierarchical/test_scheduler.cpp +++ b/tests/ut/cpp/hierarchical/test_scheduler.cpp @@ -267,13 +267,13 @@ struct SchedulerFixture : public ::testing::Test { TEST_F(SchedulerFixture, IndependentTaskDispatchedAndConsumed) { auto args_a = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); - auto res = orch.submit_next_level(0xDEAD, args_a, cfg); + auto res = orch.submit_next_level(42, args_a, cfg); TaskSlot slot = res.task_slot; mock_worker.wait_running(); ASSERT_GE(mock_worker.dispatched_count(), 1); EXPECT_EQ(mock_worker.dispatched[0].tensor_key, 0xCAFEu); - EXPECT_EQ(mock_worker.dispatched[0].callable, 0xDEADu); + EXPECT_EQ(mock_worker.dispatched[0].callable, 42u); mock_worker.complete(); wait_consumed(slot); @@ -281,14 +281,14 @@ TEST_F(SchedulerFixture, IndependentTaskDispatchedAndConsumed) { TEST_F(SchedulerFixture, DependentTaskDispatchedAfterProducerCompletes) { auto args_a = single_tensor_args(0xBEEF, TensorArgType::OUTPUT); - auto a = orch.submit_next_level(0xAA, args_a, cfg); + auto a = orch.submit_next_level(10, args_a, cfg); auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT); - auto b = orch.submit_next_level(0xBB, args_b, cfg); + auto b = orch.submit_next_level(11, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING); mock_worker.wait_running(); - EXPECT_EQ(mock_worker.dispatched[0].callable, 0xAAu); + EXPECT_EQ(mock_worker.dispatched[0].callable, 10u); mock_worker.complete(); // A done auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(300); @@ -296,7 +296,7 @@ TEST_F(SchedulerFixture, DependentTaskDispatchedAfterProducerCompletes) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } ASSERT_GE(mock_worker.dispatched_count(), 2); - EXPECT_EQ(mock_worker.dispatched[1].callable, 0xBBu); + EXPECT_EQ(mock_worker.dispatched[1].callable, 11u); mock_worker.complete(); // B done 
wait_consumed(b.task_slot); @@ -375,7 +375,7 @@ TEST_F(GroupSchedulerFixture, GroupDispatchesToNWorkers) { TaskArgs a0 = single_tensor_args(0xA0, TensorArgType::OUTPUT); TaskArgs a1 = single_tensor_args(0xA1, TensorArgType::OUTPUT); - auto res = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg); + auto res = orch.submit_next_level_group(42, {a0, a1}, cfg); TaskSlot slot = res.task_slot; worker_a.wait_running(); @@ -400,7 +400,7 @@ TEST_F(GroupSchedulerFixture, GroupDispatchesToNWorkers) { TEST_F(GroupSchedulerFixture, GroupCompletesOnlyWhenAllDone) { TaskArgs a0 = single_tensor_args(0xB0, TensorArgType::OUTPUT); TaskArgs a1 = single_tensor_args(0xB1, TensorArgType::OUTPUT); - auto res = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg); + auto res = orch.submit_next_level_group(42, {a0, a1}, cfg); TaskSlot slot = res.task_slot; worker_a.wait_running(); @@ -491,7 +491,7 @@ TEST_F(MixedTypeSchedulerFixture, SubTaskDispatchesWhileNextLevelPoolSaturated) // Submit a next-level task; the only chip worker begins running it and // stays blocked until we call complete() on it. auto chip_args = single_tensor_args(0xAAA, TensorArgType::OUTPUT); - auto chip = orch.submit_next_level(0xCDCD, chip_args, cfg); + auto chip = orch.submit_next_level(20, chip_args, cfg); next_level_worker.wait_running(); ASSERT_TRUE(next_level_worker.is_running.load()); @@ -522,10 +522,10 @@ TEST_F(GroupSchedulerFixture, GroupDependencyChain) { // Task B reads INPUT at the same key -- depends on group A. TaskArgs a0 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); TaskArgs a1 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); - auto a = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg); + auto a = orch.submit_next_level_group(42, {a0, a1}, cfg); auto args_b = single_tensor_args(0xCAFE, TensorArgType::INPUT); - auto b = orch.submit_next_level(0xDEAD, args_b, cfg); + auto b = orch.submit_next_level(42, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING); worker_a.wait_running(); diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 520254cc5..d6489dc09 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -68,19 +68,6 @@ def test_initial_state(self): assert worker.device_set is False assert worker.device_id == -1 - def test_run_before_set_device_raises(self): - from _task_interface import ChipCallable, ChipStorageTaskArgs # noqa: PLC0415 - - worker = _ChipWorker() - config = CallConfig() - args = ChipStorageTaskArgs() - - # Build a minimal ChipCallable for the test - callable_obj = ChipCallable.build(signature=[], func_name="test", binary=b"\x00", children=[]) - - with pytest.raises(RuntimeError, match="device not set"): - worker.run(callable_obj, args, config) - def test_set_device_before_init_raises(self): worker = _ChipWorker() with pytest.raises(RuntimeError, match="not initialized"): @@ -110,6 +97,28 @@ def test_init_with_nonexistent_lib_raises(self): with pytest.raises(RuntimeError, match="dlopen"): worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "/nonexistent/libsimpler_log.so") + def test_prepare_callable_before_set_device_raises(self): + from _task_interface import ChipCallable # noqa: PLC0415 + + worker = _ChipWorker() + callable_obj = ChipCallable.build(signature=[], func_name="test", binary=b"\x00", children=[]) + with pytest.raises(RuntimeError, match="device not set"): + worker.prepare_callable(0, callable_obj) + + def test_run_prepared_before_set_device_raises(self): + from _task_interface import 
ChipStorageTaskArgs # noqa: PLC0415 + + worker = _ChipWorker() + config = CallConfig() + args = ChipStorageTaskArgs() + with pytest.raises(RuntimeError, match="device not set"): + worker.run_prepared(0, args, config) + + def test_unregister_callable_before_set_device_raises(self): + worker = _ChipWorker() + with pytest.raises(RuntimeError, match="device not set"): + worker.unregister_callable(0) + # ============================================================================ # Python-level ChipWorker wrapper tests