diff --git a/docs/chip-level-arch.md b/docs/chip-level-arch.md index 3657b0244..06dd64445 100644 --- a/docs/chip-level-arch.md +++ b/docs/chip-level-arch.md @@ -113,7 +113,7 @@ runner.finalize(); ```c DeviceContextHandle ctx = create_device_context(); -set_device(ctx, device_id); +simpler_init(ctx, device_id, log_level, log_info_v); // attach + log config size_t size = get_runtime_size(); run_runtime(ctx, runtime, callable, args, block_dim, aicpu_thread_num, device_id, @@ -129,8 +129,8 @@ destroy_device_context(ctx); from simpler.task_interface import ChipWorker, ChipCallable, ChipStorageTaskArgs, CallConfig worker = ChipWorker() -worker.init(host_lib_path, aicpu_path, aicore_path, sim_context_lib_path="") -worker.set_device(device_id) +worker.init(host_lib_path, aicpu_path, aicore_path, simpler_log_lib_path, + device_id, sim_context_lib_path="") config = CallConfig() config.block_dim = 24 @@ -171,20 +171,21 @@ Python test_*.py (SceneTestCase) ├─→ KernelCompiler(platform).compile_orchestration(runtime, source) → orch .so │ └─→ ChipWorker() - └─→ init(host_path, aicpu_path, aicore_path) - └─→ dlopen(host.so) → resolve C API symbols via dlsym + └─→ init(host_path, aicpu_path, aicore_path, simpler_log_path, device_id) + ├─→ dlopen(host.so) → resolve C API symbols via dlsym + ├─→ create_device_context() → DeviceContextHandle + └─→ simpler_init(ctx, device_id, log_level, log_info_v) + └─→ DeviceRunner::attach_current_thread(device_id) + ├─→ rtSetDevice(device_id) on onboard + └─→ pto_cpu_sim_bind+acquire on sim ``` ### 2. Initialization Phase -```text -worker.set_device(device_id) - │ - └─→ create_device_context() → DeviceContextHandle - └─→ set_device(ctx, device_id) - ├─→ Initialize device (CANN on hardware, no-op on sim) - └─→ Allocate device streams -``` +The thread that called `init()` is now attached to `device_id`. Streams are +created lazily on the first `run()` call (`prepare_run_context`). 
Subsequent +device-ops (`malloc`, `copy_to`, `copy_from`, `free`) reuse that per-thread +binding — they must be called from the same thread that called `init()`. ### 3. Execution Phase diff --git a/docs/dynamic-linking.md b/docs/dynamic-linking.md index 2c622cf3d..63a63b09c 100644 --- a/docs/dynamic-linking.md +++ b/docs/dynamic-linking.md @@ -255,17 +255,18 @@ different tasks have different configurations. ## Execution Lifecycle -### Simulation (in-process, per-task init/reset) +### Simulation (in-process, per-task) ```text -ChipWorker.init(host_path, aicpu_path, aicore_path) +ChipWorker.init(host_path, aicpu_path, aicore_path, device_id) dlopen(host_runtime.so, RTLD_GLOBAL) - dlsym: create_device_context, destroy_device_context, set_device, + dlsym: create_device_context, destroy_device_context, simpler_init, get_runtime_size, run_runtime, finalize_device - -ChipWorker.set_device(device_id) create_device_context() → DeviceContextHandle - set_device(ctx, device_id) + simpler_init(ctx, device_id, log_level, log_info_v) + DeviceRunner::attach_current_thread(device_id) + pto_cpu_sim_bind_device(device_id) + pto_cpu_sim_acquire_device(device_id) ChipWorker.run(callable, args, config) run_runtime(ctx, buf, callable, args, ...) 
@@ -280,12 +281,9 @@ ChipWorker.run(callable, args, config) validate_runtime_impl(r) copy results, remove kernels r->~Runtime() -ChipWorker.reset_device() +ChipWorker.finalize() finalize_device(ctx) destroy_device_context(ctx) - -ChipWorker.finalize() - reset_device() (if needed) dlclose(host_runtime.so) -fno-gnu-unique ensures real unload ``` @@ -294,11 +292,11 @@ ChipWorker.finalize() ```text device_worker_main(device_id) for each runtime_group: - ChipWorker.init(host_path, aicpu_path, aicore_path) + ChipWorker.init(host_path, aicpu_path, aicore_path, device_id) dlopen(host_runtime.so, RTLD_GLOBAL) - ChipWorker.set_device(device_id) create_device_context() - set_device(ctx, device_id) rtSetDevice() + simpler_init(ctx, device_id, log_level, log_info_v) + DeviceRunner::attach_current_thread(device_id) rtSetDevice() for each task in group: ChipWorker.run(callable, args, config) @@ -312,10 +310,8 @@ device_worker_main(device_id) launch_aicore_kernel() rtRegisterAllKernel + rtKernelLaunch validate_runtime_impl() rtMemcpy results back to host - ChipWorker.reset_device() + ChipWorker.finalize() finalize_device(ctx) rtDeviceReset() destroy_device_context(ctx) - - ChipWorker.finalize() dlclose(host_runtime.so) ``` diff --git a/docs/getting-started.md b/docs/getting-started.md index a7232d2bf..b0b2fe413 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -159,18 +159,15 @@ from simpler_setup.runtime_builder import RuntimeBuilder builder = RuntimeBuilder(platform="a2a3sim") binaries = builder.get_binaries("tensormap_and_ringbuffer") -# Create worker and initialize with platform binaries +# Create worker and initialize with platform binaries (attaches the calling +# thread to device 0 internally — no separate set_device step required) worker = ChipWorker() -worker.init(host_path=str(binaries.host_path), - aicpu_path=str(binaries.aicpu_path), - aicore_path=str(binaries.aicore_path)) -worker.set_device(device_id=0) +worker.init(device_id=0, bins=binaries) # 
Execute callable on device worker.run(chip_callable, orch_args, block_dim=24) # Cleanup -worker.reset_device() worker.finalize() ``` diff --git a/examples/workers/l2/hello_worker/main.py b/examples/workers/l2/hello_worker/main.py index fd5b56b9f..bf85efbf4 100644 --- a/examples/workers/l2/hello_worker/main.py +++ b/examples/workers/l2/hello_worker/main.py @@ -14,7 +14,7 @@ 1. Your venv can import ``simpler.Worker`` (i.e. the nanobind extension is built). 2. Pre-built runtime binaries exist under ``build/lib//tensormap_and_ringbuffer/`` so that ``RuntimeBuilder`` can find them on ``Worker.init()``. - 3. ``set_device()`` + ACL init on the chosen platform works end-to-end. + 3. ``ChipWorker.init(device_id)`` + ACL init on the chosen platform works end-to-end. If this example runs cleanly, moving on to ``vector_add/`` (which adds a real kernel, TaskArgs, and a golden check) is safe. diff --git a/examples/workers/l2/worker_malloc/README.md b/examples/workers/l2/worker_malloc/README.md index 615f337c8..9ea6b1d7c 100644 --- a/examples/workers/l2/worker_malloc/README.md +++ b/examples/workers/l2/worker_malloc/README.md @@ -12,12 +12,12 @@ the `Worker` API in isolation: There is **no `worker.run()` call** anywhere — that's deliberate. On real hardware the CANN device context is per-thread, so `rtMalloc` only succeeds -on a thread previously bound by `rtSetDevice`. `Worker.init()` is the only -thing that performs that bind for the Python caller thread; if its `set_device` -path is broken, `worker.malloc()` fails with CANN error 107002 *before* any -kernel ever runs. Every example that does `init() -> run() -> ...` accidentally -masks that bug because the run path re-binds the device on the same thread -just before allocations happen. This example doesn't. +on a thread previously bound by `rtSetDevice`. 
`Worker.init(...)` is the +only thing that performs that bind for the Python caller thread; if that +path is broken, `worker.malloc()` fails with CANN error 107002 *before* +any kernel ever runs. Every example that does `init() -> run() -> ...` +accidentally masks that bug because the run path re-binds the device on the +same thread just before allocations happen. This example doesn't. ## Run @@ -45,6 +45,6 @@ Same for `a5sim` / `a5`. If you see `rtMalloc failed: 107002` on `a2a3` / `a5` (but the same example passes on `a2a3sim` / `a5sim`), the per-thread `rtSetDevice` is not happening -during `Worker.init()` — see `src/{arch}/platform/onboard/host/pto_runtime_c_api.cpp` -and confirm the C-API `set_device` actually calls -`DeviceRunner::attach_current_thread`. +during `Worker.init()` — see `simpler_init` in +`src/{arch}/platform/onboard/host/pto_runtime_c_api.cpp` and confirm it +forwards to `DeviceRunner::attach_current_thread`. diff --git a/examples/workers/l2/worker_malloc/main.py b/examples/workers/l2/worker_malloc/main.py index 98b0b787c..e6d7ec2cd 100644 --- a/examples/workers/l2/worker_malloc/main.py +++ b/examples/workers/l2/worker_malloc/main.py @@ -19,13 +19,14 @@ Why a standalone example for these? On real hardware (a2a3 / a5 onboard) the CANN device context is per-thread, so ``rtMalloc`` only succeeds on a thread -that previously executed ``rtSetDevice``. Until you call ``worker.run(...)`` -the only thing that has bound the device on the calling Python thread is -``Worker.init() -> ChipWorker::set_device(...)``. If that path is broken, -this example fails at the first ``worker.malloc`` with CANN error 107002. -``vector_add`` happens to mask that bug because its first malloc lands on -the same thread that ``run()`` later attaches; this example doesn't ``run`` -at all, so it's a focused regression check for the standalone alloc path. +that previously executed ``rtSetDevice``. 
``Worker.init(...)`` is now the +single point that performs that bind for the Python caller thread (folded +down from the previous explicit ``ChipWorker::set_device``). If that path +breaks, this example fails at the first ``worker.malloc`` with CANN error +107002. ``vector_add`` happens to mask such a bug because its first malloc +lands on the same thread that ``run()`` later attaches; this example doesn't +``run`` at all, so it's a focused regression check for the standalone alloc +path. Run: python examples/workers/l2/worker_malloc/main.py -p a2a3sim -d 0 diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 344758b78..857f146ee 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -616,11 +616,9 @@ NB_MODULE(_task_interface, m) { .def(nb::init<>()) .def( "init", &ChipWorker::init, nb::arg("host_lib_path"), nb::arg("aicpu_path"), nb::arg("aicore_path"), - nb::arg("simpler_log_lib_path"), nb::arg("sim_context_lib_path") = "", nb::arg("log_level") = 1, - nb::arg("log_info_v") = 5 + nb::arg("simpler_log_lib_path"), nb::arg("device_id"), nb::arg("sim_context_lib_path") = "", + nb::arg("log_level") = 1, nb::arg("log_info_v") = 5 ) - .def("set_device", &ChipWorker::set_device, nb::arg("device_id")) - .def("reset_device", &ChipWorker::reset_device) .def("finalize", &ChipWorker::finalize) .def( "run", @@ -650,7 +648,6 @@ NB_MODULE(_task_interface, m) { ) .def_prop_ro("device_id", &ChipWorker::device_id) .def_prop_ro("initialized", &ChipWorker::initialized) - .def_prop_ro("device_set", &ChipWorker::device_set) .def("malloc", &ChipWorker::malloc, nb::arg("size")) .def("free", &ChipWorker::free, nb::arg("ptr")) .def("copy_to", &ChipWorker::copy_to, nb::arg("dst"), nb::arg("src"), nb::arg("size")) diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index 71ac81122..adb9b90db 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -230,30 
+230,29 @@ class ChipContext: class ChipWorker: """Unified execution interface wrapping the host runtime C API. - The runtime library is bound once via init() and cannot be changed. - Devices can be set and reset independently. + The runtime library and target device are bound once via init() and + cannot be changed. Usage:: worker = ChipWorker() - worker.init(host_path="build/lib/.../host.so", - aicpu_path="build/lib/.../aicpu.so", - aicore_path="build/lib/.../aicore.o") - worker.set_device(device_id=0) + worker.init(device_id=0, bins=bins) worker.run(chip_callable, orch_args, block_dim=24) - worker.reset_device() worker.finalize() """ def __init__(self): self._impl = _ChipWorker() - def init(self, bins, log_level=None, log_info_v=None): - """Load host runtime library and cache platform binaries. + def init(self, device_id, bins, log_level=None, log_info_v=None): + """Attach the calling thread to ``device_id``, load the host runtime + library, and cache platform binaries. - Can only be called once — the runtime cannot be changed. + Can only be called once — the runtime and device cannot be changed + after init. Args: + device_id: NPU device ID to attach the calling thread to. bins: A `simpler_setup.runtime_builder.RuntimeBinaries` (or any object exposing host_path / aicpu_path / aicore_path / simpler_log_path / sim_context_path). @@ -279,25 +278,12 @@ def init(self, bins, log_level=None, log_info_v=None): str(bins.aicpu_path), str(bins.aicore_path), str(bins.simpler_log_path), + int(device_id), str(bins.sim_context_path) if bins.sim_context_path else "", log_level, log_info_v, ) - def set_device(self, device_id): - """Set the target NPU device. - - Requires init() first. Can be called after reset_device() to switch devices. - - Args: - device_id: NPU device ID. - """ - self._impl.set_device(device_id) - - def reset_device(self): - """Release device resources. 
The runtime binding remains intact.""" - self._impl.reset_device() - def finalize(self): """Tear down everything: device resources and runtime library. @@ -389,9 +375,13 @@ def bootstrap_context( # noqa: PLR0912 -- config validation + comm setup + wind cfg: ChipBootstrapConfig, channel: Optional[ChipBootstrapChannel] = None, ) -> ChipBootstrapResult: - """One-shot per-chip bootstrap: set device, build communicator, slice window, + """One-shot per-chip bootstrap: build communicator, slice window, stage inputs from host shared memory, and (optionally) publish the result. + The target device must already be attached via ``init(device_id, bins)`` + before invoking this method; ``device_id`` is supplied here only to + catch a caller that wired up the wrong device on the wrong worker. + Runs inside a forked chip child. If ``channel`` is provided (the Worker-orchestrated integration path), the result is written as SUCCESS or — on any exception — as ERROR (code=1, @@ -428,7 +418,11 @@ def bootstrap_context( # noqa: PLR0912 -- config validation + comm setup + wind f"matching HostBufferStaging in host_outputs; none found" ) from None - self.set_device(device_id) + if self.device_id != device_id: + raise RuntimeError( + f"bootstrap_context(device_id={device_id}) called on a ChipWorker " + f"already initialized for device_id={self.device_id}" + ) device_ctx = 0 local_base = 0 @@ -517,7 +511,3 @@ def device_id(self): @property def initialized(self): return self._impl.initialized - - @property - def device_set(self): - return self._impl.device_set diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 073084dc6..a26c58bd8 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -276,8 +276,7 @@ def _chip_process_loop( try: cw = ChipWorker() - cw.init(bins, log_level=log_level, log_info_v=log_info_v) - cw.set_device(device_id) + cw.init(device_id, bins, log_level=log_level, log_info_v=log_info_v) except Exception as e: _tb.print_exc() # Write the 
message so any parent reader that *does* inspect this @@ -365,7 +364,7 @@ def _chip_process_loop_with_bootstrap( # noqa: PLR0912 cw = ChipWorker() try: - cw.init(bins, log_level=log_level, log_info_v=log_info_v) + cw.init(device_id, bins, log_level=log_level, log_info_v=log_info_v) except Exception as e: # noqa: BLE001 traceback.print_exc() channel.write_error(1, f"{type(e).__name__}: chip_worker.init: {e}") @@ -662,8 +661,7 @@ def _init_level2(self) -> None: binaries = builder.get_binaries(runtime, build=self._config.get("build", False)) self._chip_worker = ChipWorker() - self._chip_worker.init(binaries) - self._chip_worker.set_device(device_id) + self._chip_worker.init(device_id, binaries) def _init_hierarchical(self) -> None: device_ids = self._config.get("device_ids", []) @@ -687,8 +685,8 @@ def _init_hierarchical(self) -> None: binaries = builder.get_binaries(runtime, build=self._config.get("build", False)) # Stash the full RuntimeBinaries so forked chip children can - # construct a ChipWorker with one call (`cw.init(bins)`) instead - # of taking ~10 path strings via positional args. Forked-child + # construct a ChipWorker with one call (`cw.init(device_id, bins)`) + # instead of taking ~10 path strings via positional args. Forked-child # invocation is `os.fork()` + direct function call, so no pickle # barrier — the bins object is just a Python value passed through. 
self._l3_bins = binaries diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index 54c6519ef..89c06fb0f 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -808,8 +808,7 @@ def _create_worker(cls, platform, device_id=0, build=False): bins = cls._get_binaries(platform, build=build) w = ChipWorker() - w.init(bins) - w.set_device(device_id) + w.init(device_id, bins) return w # ------------------------------------------------------------------ diff --git a/src/a2a3/platform/include/host/profiling_common/profiler_base.h b/src/a2a3/platform/include/host/profiling_common/profiler_base.h index 040bb9722..2c3c18d2f 100644 --- a/src/a2a3/platform/include/host/profiling_common/profiler_base.h +++ b/src/a2a3/platform/include/host/profiling_common/profiler_base.h @@ -306,12 +306,16 @@ class ProfilerBase { using ReadyEntry = typename Module::ReadyEntry; using ReadyBufferInfo = typename Module::ReadyBufferInfo; - ProfilerBase() = default; - ~ProfilerBase() = default; - ProfilerBase(const ProfilerBase &) = delete; ProfilerBase &operator=(const ProfilerBase &) = delete; +private: + // CRTP base — only the Derived class may construct/destruct. + friend Derived; + ProfilerBase() = default; + ~ProfilerBase() = default; + +public: /** * Stash the memory context produced by Derived::init(). Must be called on * the init() success path; if init aborts before this, start(tf) is a diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index b43e5c5d7..a9ba584ad 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -1002,7 +1002,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data return 0; } - // Device must be set first (set_device() must be called before upload_kernel_binary()) + // Run context (streams) must be prepared first. 
if (stream_aicpu_ == nullptr) { LOG_ERROR("Run context not prepared before upload_kernel_binary()"); return 0; diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 8dd4dc816..117a8fa1c 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -421,7 +421,10 @@ class DeviceRunner { void release_run_context(); private: - // Internal state + // Internal state. device_id_ is set once in attach_current_thread() (called + // from simpler_init during ChipWorker::init) and read on every subsequent + // op. All ChipWorker callers run on the same thread that called init, so + // plain int + the init→user happens-before edge is sufficient. int device_id_{-1}; int block_dim_{0}; int cores_per_blockdim_{PLATFORM_CORES_PER_BLOCKDIM}; diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index e4b7d3b20..72d4e08b6 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -119,15 +119,6 @@ void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx)->attach_current_thread(device_id); - } catch (...) { - return -1; - } -} - int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) { if (ctx == NULL) return -1; try { @@ -279,8 +270,20 @@ void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, si r->record_tensor_pair(host_ptr, dev_ptr, size); } -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { - if (ctx == NULL) return; +int simpler_init(DeviceContextHandle ctx, int device_id, int log_level, int log_info_v) { + if (ctx == NULL) return -1; + + // Attach FIRST so that an attach failure does not leave process-wide side + // effects (CANN dlog level, HostLogger singleton) mutated. Subsequent + // logger writes only happen on the success path. 
+ DeviceRunner *runner = static_cast(ctx); + int rc; + try { + rc = runner->attach_current_thread(device_id); + } catch (...) { + return -1; + } + if (rc != 0) return rc; // CANN dlog: derive from simpler logger choice unless ASCEND_GLOBAL_LOG_LEVEL // is externally configured. @@ -291,9 +294,9 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { HostLogger::get_instance().set_level(static_cast(log_level)); HostLogger::get_instance().set_info_v(log_info_v); - DeviceRunner *runner = static_cast(ctx); runner->set_log_level(log_level); runner->set_log_info_v(log_info_v); + return 0; } } // extern "C" diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 26db1e3d6..fa33e5eb7 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -112,10 +112,33 @@ std::thread DeviceRunner::create_thread(std::function fn) { }); } +int DeviceRunner::attach_current_thread(int device_id) { + if (device_id < 0) { + LOG_ERROR("Invalid device_id: %d", device_id); + return -1; + } + if (device_id_ != -1 && device_id_ != device_id) { + LOG_ERROR( + "DeviceRunner already initialized on device %d; finalize before switching to device %d", device_id_, + device_id + ); + return -1; + } + + // Per-thread bind so sim hooks (TPUSH/TPOP, identity helpers) route through + // the correct context. acquire is process-wide and idempotent (no-op after + // first call for a given device_id), so it is safe to fold in here. 
+ pto_cpu_sim_bind_device(device_id); + pto_cpu_sim_acquire_device(device_id); + device_id_ = device_id; + return 0; +} + int DeviceRunner::ensure_device_initialized( int device_id, const std::vector &aicpu_so_binary, const std::vector &aicore_kernel_binary ) { - device_id_ = device_id; + int rc = attach_current_thread(device_id); + if (rc != 0) return rc; return ensure_binaries_loaded(aicpu_so_binary, aicore_kernel_binary); } diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 210aeb9ba..0c0172ab7 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -15,7 +15,7 @@ * It provides a compatible interface with the onboard DeviceRunner for the * core operations (allocate, copy, run, finalize, upload/remove kernel). * The onboard version exposes additional low-level methods (launch_aicpu_kernel, - * launch_aicore_kernel, ensure_device_set) for custom workflows. + * launch_aicore_kernel, ensure_device_initialized) for custom workflows. * * Key differences from onboard: * - Uses host memory instead of device memory @@ -169,6 +169,21 @@ class DeviceRunner { void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; } const std::string &output_prefix() const { return output_prefix_; } + /** + * Attach the calling thread to the simulated device. + * + * Mirrors the onboard contract: binds the caller's TLS to `device_id` + * (so sim hooks routing through `pto_cpu_sim_get_bound_device()` see the + * right context) and idempotently acquires the process-wide sim device + * registry entry. Called from `simpler_init` and re-invoked at the top + * of every device-op so any caller thread becomes the bound thread for + * the op without requiring an explicit pre-attach step. + * + * @param device_id Device ID (>= 0). + * @return 0 on success, negative on invalid id / device-id mismatch. 
+ */ + int attach_current_thread(int device_id); + /** * Print handshake results */ @@ -211,7 +226,10 @@ class DeviceRunner { void remove_kernel_binary(int func_id); private: - // Configuration + // Configuration. device_id_ is set once in attach_current_thread() during + // simpler_init and read by run() / create_thread() afterward — single- + // threaded with respect to the user's call sequence, so plain int is + // sufficient. int device_id_{-1}; int block_dim_{0}; int cores_per_blockdim_{PLATFORM_CORES_PER_BLOCKDIM}; diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index b8315b31a..974cc7bbb 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -115,13 +115,6 @@ void destroy_device_context(DeviceContextHandle ctx) { delete static_castrecord_tensor_pair(host_ptr, dev_ptr, size); } -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { - if (ctx == NULL) return; +int simpler_init(DeviceContextHandle ctx, int device_id, int log_level, int log_info_v) { + if (ctx == NULL) return -1; + + // Attach FIRST so that an attach failure (e.g. invalid device_id) does not + // leave the process-wide HostLogger singleton mutated. + DeviceRunner *runner = static_cast(ctx); + int rc; + try { + rc = runner->attach_current_thread(device_id); + } catch (...) { + return -1; + } + if (rc != 0) return rc; + // No CANN dlog on sim. 
HostLogger::get_instance().set_level(static_cast(log_level)); HostLogger::get_instance().set_info_v(log_info_v); - DeviceRunner *runner = static_cast(ctx); runner->set_log_level(log_level); runner->set_log_info_v(log_info_v); + return 0; } } // extern "C" diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 149feb7da..27ad3a4e1 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -811,7 +811,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data return 0; } - // Device must be set first (set_device() must be called before upload_kernel_binary()) + // Run context (streams) must be prepared first. if (stream_aicpu_ == nullptr) { LOG_ERROR("Run context not prepared before upload_kernel_binary()"); return 0; diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 4c5fab748..e9b41f3d1 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -371,7 +371,10 @@ class DeviceRunner { void release_run_context(); private: - // Internal state + // Internal state. device_id_ is set once in attach_current_thread() (called + // from simpler_init during ChipWorker::init) and read on every subsequent + // op. All ChipWorker callers run on the same thread that called init, so + // plain int + the init→user happens-before edge is sufficient. 
int device_id_{-1}; int block_dim_{0}; int cores_per_blockdim_{PLATFORM_CORES_PER_BLOCKDIM}; diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index fa151b1ab..d9de639eb 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -119,15 +119,6 @@ void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx)->attach_current_thread(device_id); - } catch (...) { - return -1; - } -} - void *device_malloc_ctx(DeviceContextHandle ctx, size_t size) { if (ctx == NULL) return NULL; try { @@ -311,8 +302,20 @@ void record_tensor_pair(RuntimeHandle runtime, void *host_ptr, void *dev_ptr, si r->record_tensor_pair(host_ptr, dev_ptr, size); } -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { - if (ctx == NULL) return; +int simpler_init(DeviceContextHandle ctx, int device_id, int log_level, int log_info_v) { + if (ctx == NULL) return -1; + + // Attach FIRST so that an attach failure does not leave process-wide side + // effects (CANN dlog level, HostLogger singleton) mutated. Subsequent + // logger writes only happen on the success path. + DeviceRunner *runner = static_cast(ctx); + int rc; + try { + rc = runner->attach_current_thread(device_id); + } catch (...) { + return -1; + } + if (rc != 0) return rc; // CANN dlog: derive from simpler logger choice unless ASCEND_GLOBAL_LOG_LEVEL // is externally configured. 
dlog_setlevel mutates libunified_dlog.so's @@ -326,9 +329,9 @@ void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { HostLogger::get_instance().set_info_v(log_info_v); // Snapshot into runner — read by run_runtime when populating KernelArgs - DeviceRunner *runner = static_cast(ctx); runner->set_log_level(log_level); runner->set_log_info_v(log_info_v); + return 0; } } // extern "C" diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 015419665..76bd462c7 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -107,10 +107,30 @@ std::thread DeviceRunner::create_thread(std::function fn) { }); } +int DeviceRunner::attach_current_thread(int device_id) { + if (device_id < 0) { + LOG_ERROR("Invalid device_id: %d", device_id); + return -1; + } + if (device_id_ != -1 && device_id_ != device_id) { + LOG_ERROR( + "DeviceRunner already initialized on device %d; finalize before switching to device %d", device_id_, + device_id + ); + return -1; + } + + pto_cpu_sim_bind_device(device_id); + pto_cpu_sim_acquire_device(device_id); + device_id_ = device_id; + return 0; +} + int DeviceRunner::ensure_device_initialized( int device_id, const std::vector &aicpu_so_binary, const std::vector &aicore_kernel_binary ) { - device_id_ = device_id; + int rc = attach_current_thread(device_id); + if (rc != 0) return rc; return ensure_binaries_loaded(aicpu_so_binary, aicore_kernel_binary); } diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 636149f18..4dd982e00 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -167,6 +167,20 @@ class DeviceRunner { void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? 
prefix : ""; } const std::string &output_prefix() const { return output_prefix_; } + /** + * Attach the calling thread to the simulated device. + * + * Mirrors the onboard contract: binds the caller's TLS to `device_id` + * and idempotently acquires the process-wide sim device registry entry. + * Called from `simpler_init` and re-invoked at the top of every device-op + * so any caller thread becomes the bound thread for the op without + * requiring an explicit pre-attach step. + * + * @param device_id Device ID (>= 0). + * @return 0 on success, negative on invalid id / device-id mismatch. + */ + int attach_current_thread(int device_id); + /** * Print handshake results */ @@ -209,7 +223,10 @@ class DeviceRunner { void remove_kernel_binary(int func_id); private: - // Configuration + // Configuration. device_id_ is set once in attach_current_thread() during + // simpler_init and read by run() / create_thread() afterward — single- + // threaded with respect to the user's call sequence, so plain int is + // sufficient. int device_id_{-1}; int block_dim_{0}; int cores_per_blockdim_{PLATFORM_CORES_PER_BLOCKDIM}; diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index cd16e3734..46a83b2c0 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -115,13 +115,6 @@ void destroy_device_context(DeviceContextHandle ctx) { delete static_castrecord_tensor_pair(host_ptr, dev_ptr, size); } -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v) { - if (ctx == NULL) return; +int simpler_init(DeviceContextHandle ctx, int device_id, int log_level, int log_info_v) { + if (ctx == NULL) return -1; + + // Attach FIRST so that an attach failure (e.g. invalid device_id) does not + // leave the process-wide HostLogger singleton mutated. 
+ DeviceRunner *runner = static_cast(ctx); + int rc; + try { + rc = runner->attach_current_thread(device_id); + } catch (...) { + return -1; + } + if (rc != 0) return rc; + // No CANN dlog on sim; only HostLogger + runner state. HostLogger::get_instance().set_level(static_cast(log_level)); HostLogger::get_instance().set_info_v(log_info_v); - DeviceRunner *runner = static_cast(ctx); runner->set_log_level(log_level); runner->set_log_info_v(log_info_v); + return 0; } } // extern "C" diff --git a/src/common/sim_context/cpu_sim_context.h b/src/common/sim_context/cpu_sim_context.h index ebcb875d3..92c23eb16 100644 --- a/src/common/sim_context/cpu_sim_context.h +++ b/src/common/sim_context/cpu_sim_context.h @@ -20,10 +20,11 @@ * thread's device_id (set via pto_cpu_sim_bind_device). * * Invariant: each simulated device_id has a single owner ChipWorker per - * process. The owner calls acquire_device() at set_device() time and - * release_device() at finalize_device() time, after all worker threads - * for that device have been joined. Concurrent access from multiple - * ChipWorkers to the same device_id is undefined behavior. + * process. The owner calls acquire_device() inside DeviceRunner's + * attach_current_thread() (driven by ChipWorker::init) and release_device() + * at finalize_device() time, after all worker threads for that device + * have been joined. Concurrent access from multiple ChipWorkers to the + * same device_id is undefined behavior. 
*/ #pragma once diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 38680e77a..6bbe8f577 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -100,7 +100,8 @@ ChipWorker::~ChipWorker() { finalize(); } void ChipWorker::init( const std::string &host_lib_path, const std::string &aicpu_path, const std::string &aicore_path, - const std::string &simpler_log_lib_path, const std::string &sim_context_lib_path, int log_level, int log_info_v + const std::string &simpler_log_lib_path, int device_id, const std::string &sim_context_lib_path, int log_level, + int log_info_v ) { if (finalized_) { throw std::runtime_error("ChipWorker already finalized; cannot reinitialize"); @@ -108,6 +109,9 @@ void ChipWorker::init( if (initialized_) { throw std::runtime_error("ChipWorker already initialized; runtime cannot be changed"); } + if (device_id < 0) { + throw std::runtime_error("ChipWorker::init requires a non-negative device_id"); + } // Load libsimpler_log FIRST with RTLD_GLOBAL so that subsequent host_runtime // / cpu_sim_context / sim aicore .so loads can resolve their HostLogger and @@ -142,7 +146,6 @@ void ChipWorker::init( try { create_device_context_fn_ = load_symbol(handle, "create_device_context"); destroy_device_context_fn_ = load_symbol(handle, "destroy_device_context"); - set_device_fn_ = load_symbol(handle, "set_device"); device_malloc_ctx_fn_ = load_symbol(handle, "device_malloc_ctx"); device_free_ctx_fn_ = load_symbol(handle, "device_free_ctx"); copy_to_device_ctx_fn_ = load_symbol(handle, "copy_to_device_ctx"); @@ -185,35 +188,49 @@ void ChipWorker::init( runtime_buf_.resize(get_runtime_size_fn_()); - // One-shot platform-side log init: pushes user's simpler-logger choice - // into HostLogger + runner state, and (onboard) into CANN dlog. 
- simpler_init_fn_(device_ctx_, log_level, log_info_v); - - initialized_ = true; -} - -void ChipWorker::set_device(int device_id) { - if (!initialized_) { - throw std::runtime_error("ChipWorker not initialized; call init() first"); - } - if (device_set_) { - throw std::runtime_error("Device already set; call reset_device() before switching devices"); + // One-shot platform-side init: attach the calling thread to `device_id` + // (rtSetDevice on onboard, sim bind+acquire on sim) and push the user's + // simpler-logger choice into HostLogger + runner state (and CANN dlog + // onboard). Subsequent device-ops re-attach their caller threads + // idempotently against the recorded device id. + int init_rc = simpler_init_fn_(device_ctx_, device_id, log_level, log_info_v); + if (init_rc != 0) { + // Symmetric teardown: drop the device context, clear all dlsym'd + // function pointers, dlclose, and discard cached binaries so the + // ChipWorker is back to its zero-initialized state. Mirror finalize() + // exactly minus finalize_device_fn_ (we never reached the + // initialized_=true point, so device-side teardown is unnecessary). 
+ destroy_device_context_fn_(device_ctx_); + device_ctx_ = nullptr; + dlclose(handle); + lib_handle_ = nullptr; + create_device_context_fn_ = nullptr; + destroy_device_context_fn_ = nullptr; + device_malloc_ctx_fn_ = nullptr; + device_free_ctx_fn_ = nullptr; + copy_to_device_ctx_fn_ = nullptr; + copy_from_device_ctx_fn_ = nullptr; + get_runtime_size_fn_ = nullptr; + run_runtime_fn_ = nullptr; + simpler_init_fn_ = nullptr; + finalize_device_fn_ = nullptr; + ensure_acl_ready_fn_ = nullptr; + create_comm_stream_fn_ = nullptr; + destroy_comm_stream_fn_ = nullptr; + comm_init_fn_ = nullptr; + comm_alloc_windows_fn_ = nullptr; + comm_get_local_window_base_fn_ = nullptr; + comm_get_window_size_fn_ = nullptr; + comm_barrier_fn_ = nullptr; + comm_destroy_fn_ = nullptr; + runtime_buf_.clear(); + aicpu_binary_.clear(); + aicore_binary_.clear(); + throw std::runtime_error("simpler_init failed with code " + std::to_string(init_rc)); } - int rc = set_device_fn_(device_ctx_, device_id); - if (rc != 0) { - throw std::runtime_error("set_device failed with code " + std::to_string(rc)); - } device_id_ = device_id; - device_set_ = true; -} - -void ChipWorker::reset_device() { - if (device_set_ && finalize_device_fn_) { - finalize_device_fn_(device_ctx_); - } - device_id_ = -1; - device_set_ = false; + initialized_ = true; } void ChipWorker::finalize() { @@ -225,7 +242,9 @@ void ChipWorker::finalize() { } comm_stream_ = nullptr; - reset_device(); + if (device_ctx_ != nullptr && finalize_device_fn_ != nullptr && initialized_) { + finalize_device_fn_(device_ctx_); + } if (device_ctx_ != nullptr && destroy_device_context_fn_ != nullptr) { destroy_device_context_fn_(device_ctx_); device_ctx_ = nullptr; @@ -236,7 +255,6 @@ void ChipWorker::finalize() { lib_handle_ = nullptr; create_device_context_fn_ = nullptr; destroy_device_context_fn_ = nullptr; - set_device_fn_ = nullptr; device_malloc_ctx_fn_ = nullptr; device_free_ctx_fn_ = nullptr; copy_to_device_ctx_fn_ = nullptr; @@ -257,6 +275,7 
@@ void ChipWorker::finalize() { aicpu_binary_.clear(); aicore_binary_.clear(); initialized_ = false; + device_id_ = -1; finalized_ = true; } @@ -271,8 +290,8 @@ void ChipWorker::run(uint64_t callable, TaskArgsView args, const CallConfig &con void ChipWorker::run(const void *callable, const void *args, const CallConfig &config) { config.validate(); - if (!device_set_) { - throw std::runtime_error("ChipWorker device not set; call set_device() first"); + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); } void *rt = runtime_buf_.data(); @@ -288,8 +307,8 @@ void ChipWorker::run(const void *callable, const void *args, const CallConfig &c } uint64_t ChipWorker::malloc(size_t size) { - if (!device_set_) { - throw std::runtime_error("ChipWorker device not set; call set_device() first"); + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); } void *ptr = device_malloc_ctx_fn_(device_ctx_, size); if (ptr == nullptr) { @@ -299,15 +318,15 @@ uint64_t ChipWorker::malloc(size_t size) { } void ChipWorker::free(uint64_t ptr) { - if (!device_set_) { - throw std::runtime_error("ChipWorker device not set; call set_device() first"); + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); } device_free_ctx_fn_(device_ctx_, reinterpret_cast(ptr)); } void ChipWorker::copy_to(uint64_t dst, uint64_t src, size_t size) { - if (!device_set_) { - throw std::runtime_error("ChipWorker device not set; call set_device() first"); + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); } int rc = copy_to_device_ctx_fn_(device_ctx_, reinterpret_cast(dst), reinterpret_cast(src), size); @@ -317,8 +336,8 @@ void ChipWorker::copy_to(uint64_t dst, uint64_t src, size_t size) { } void ChipWorker::copy_from(uint64_t dst, uint64_t src, size_t size) { - if (!device_set_) { - throw std::runtime_error("ChipWorker device not set; 
call set_device() first"); + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); } int rc = copy_from_device_ctx_fn_(device_ctx_, reinterpret_cast(dst), reinterpret_cast(src), size); @@ -328,8 +347,8 @@ void ChipWorker::copy_from(uint64_t dst, uint64_t src, size_t size) { } uint64_t ChipWorker::comm_init(int rank, int nranks, const std::string &rootinfo_path) { - if (!device_set_) { - throw std::runtime_error("ChipWorker device not set; call set_device() first"); + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); } if (comm_stream_ != nullptr) { throw std::runtime_error("comm_init: a comm session is already active on this ChipWorker"); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index 3e529a511..422313f0e 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -28,26 +28,20 @@ class ChipWorker : public IWorker { ChipWorker(const ChipWorker &) = delete; ChipWorker &operator=(const ChipWorker &) = delete; - /// Bind the runtime library and cache platform binaries. - /// Can only be called once per lifetime — the runtime cannot be changed. + /// Bind the runtime library, cache platform binaries, and attach the + /// calling thread to `device_id`. Can only be called once per lifetime — + /// the runtime and device cannot be changed after init. + /// /// `log_level` (0=DEBUG..4=NUL) and `log_info_v` (0..9) are pushed into /// HostLogger + runner state and (onboard) into CANN dlog at this point; /// they reflect the user's `simpler` Python logger at Worker.init() time /// and are then fixed for this ChipWorker's lifetime. 
void init( const std::string &host_lib_path, const std::string &aicpu_path, const std::string &aicore_path, - const std::string &simpler_log_lib_path, const std::string &sim_context_lib_path = "", int log_level = 1, - int log_info_v = 5 + const std::string &simpler_log_lib_path, int device_id, const std::string &sim_context_lib_path = "", + int log_level = 1, int log_info_v = 5 ); - /// Set the target NPU device. Requires init() first. - /// Can be called after reset_device() to switch to a different device. - void set_device(int device_id); - - /// Release device resources only. The runtime binding remains intact. - /// After this, set_device() can be called again with a new device ID. - void reset_device(); - /// Tear down everything: device resources and runtime library. /// Terminal — the object cannot be reused after this. void finalize(); @@ -91,12 +85,10 @@ class ChipWorker : public IWorker { int device_id() const { return device_id_; } bool initialized() const { return initialized_; } - bool device_set() const { return device_set_; } private: using CreateDeviceContextFn = void *(*)(); using DestroyDeviceContextFn = void (*)(void *); - using SetDeviceFn = int (*)(void *, int); using DeviceMallocCtxFn = void *(*)(void *, size_t); using DeviceFreeCtxFn = void (*)(void *, void *); using CopyToDeviceCtxFn = int (*)(void *, void *, const void *, size_t); @@ -106,7 +98,7 @@ class ChipWorker : public IWorker { void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int, int, int, const char * ); - using SimplerInitFn = void (*)(void *, int, int); + using SimplerInitFn = int (*)(void *, int, int, int); using FinalizeDeviceFn = int (*)(void *); using EnsureAclReadyFn = int (*)(void *, int); using CreateCommStreamFn = void *(*)(void *); @@ -121,7 +113,6 @@ class ChipWorker : public IWorker { void *lib_handle_ = nullptr; CreateDeviceContextFn create_device_context_fn_ = nullptr; DestroyDeviceContextFn 
destroy_device_context_fn_ = nullptr; - SetDeviceFn set_device_fn_ = nullptr; DeviceMallocCtxFn device_malloc_ctx_fn_ = nullptr; DeviceFreeCtxFn device_free_ctx_fn_ = nullptr; CopyToDeviceCtxFn copy_to_device_ctx_fn_ = nullptr; @@ -149,9 +140,13 @@ class ChipWorker : public IWorker { std::vector runtime_buf_; std::vector aicpu_binary_; std::vector aicore_binary_; + // device_id_ is set once in init() and never modified afterward. All + // ChipWorker callers run on the thread that called init() (the same + // thread is the only one that subsequently calls malloc / copy_to / + // run / finalize), so plain `int` is sufficient — no cross-thread + // synchronization required. int device_id_ = -1; bool initialized_ = false; - bool device_set_ = false; bool finalized_ = false; }; diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index b6588dc45..dd2d4e663 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -15,10 +15,20 @@ * Both the ChipWorker (consumer, resolves public symbols via dlsym) and the * platform implementations (producers, define all symbols) include this file. 
* - * Public API — resolved by ChipWorker via dlsym: - * create_device_context, destroy_device_context, - * get_runtime_size, set_device, run_runtime, finalize_device, - * device_malloc_ctx, device_free_ctx, copy_to_device_ctx, copy_from_device_ctx + * Public API — resolved by ChipWorker via dlsym (every host_runtime.so must + * export ALL of these; runtimes without a real backend ship not-supported + * stubs rather than omitting symbols, so ChipWorker can dlsym the full set + * unconditionally without per-symbol probing): + * - lifecycle: create_device_context, destroy_device_context, + * simpler_init, finalize_device + * - sizing: get_runtime_size + * - device-mem: device_malloc_ctx, device_free_ctx, + * copy_to_device_ctx, copy_from_device_ctx + * - run: run_runtime + * - ACL/stream: ensure_acl_ready_ctx, create_comm_stream_ctx, + * destroy_comm_stream_ctx + * - comm: comm_init, comm_alloc_windows, comm_get_local_window_base, + * comm_get_window_size, comm_barrier, comm_destroy * * Memory management: caller allocates a buffer of get_runtime_size() bytes * and passes it to run_runtime(). Error codes: 0 = success, negative = error. @@ -57,9 +67,6 @@ void destroy_device_context(DeviceContextHandle ctx); /** Return sizeof(Runtime) for caller buffer allocation. */ size_t get_runtime_size(void); -/** Set the target device. Must be called before the first run_runtime(). */ -int set_device(DeviceContextHandle ctx, int device_id); - /** Allocate device memory in the given device context. */ void *device_malloc_ctx(DeviceContextHandle ctx, size_t size); @@ -105,10 +112,17 @@ int run_runtime( ); /** - * One-shot platform-side log init. Called once by ChipWorker::init() right - * after dlopen, before any other entry. Pushes the user's chosen severity + - * INFO verbosity into HostLogger and into runner state (which run_runtime - * later forwards to AICPU via KernelArgs). + * One-shot platform-side init. 
Called once by ChipWorker::init() after + * create_device_context(), before any device-op entry. Two responsibilities: + * + * 1. Attach the calling thread to `device_id` (rtSetDevice on onboard, + * pto_cpu_sim_bind_device + pto_cpu_sim_acquire_device on sim) and + * record the device id on the DeviceRunner so subsequent device-ops + * can re-attach their own caller threads idempotently. + * + * 2. Push the user's chosen severity + INFO verbosity into HostLogger + * and into runner state (which run_runtime later forwards to AICPU + * via KernelArgs). * * On onboard, also calls dlog_setlevel(-1, log_level, 0) so CANN's runtime * filter matches the simpler logger — unless ASCEND_GLOBAL_LOG_LEVEL was @@ -118,8 +132,10 @@ int run_runtime( * * `log_level` is CANN-aligned: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL. * `log_info_v` ∈ [0, 9]; only meaningful when severity is INFO. + * + * Returns 0 on success, negative on attach failure. */ -void simpler_init(DeviceContextHandle ctx, int log_level, int log_info_v); +int simpler_init(DeviceContextHandle ctx, int device_id, int log_level, int log_info_v); /** * Release all device resources held by the context. 
diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 520254cc5..eca089524 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -65,10 +65,9 @@ class TestChipWorkerStateMachine: def test_initial_state(self): worker = _ChipWorker() assert worker.initialized is False - assert worker.device_set is False assert worker.device_id == -1 - def test_run_before_set_device_raises(self): + def test_run_before_init_raises(self): from _task_interface import ChipCallable, ChipStorageTaskArgs # noqa: PLC0415 worker = _ChipWorker() @@ -78,20 +77,8 @@ def test_run_before_set_device_raises(self): # Build a minimal ChipCallable for the test callable_obj = ChipCallable.build(signature=[], func_name="test", binary=b"\x00", children=[]) - with pytest.raises(RuntimeError, match="device not set"): - worker.run(callable_obj, args, config) - - def test_set_device_before_init_raises(self): - worker = _ChipWorker() with pytest.raises(RuntimeError, match="not initialized"): - worker.set_device(0) - - def test_reset_device_idempotent(self): - worker = _ChipWorker() - # reset_device() on an uninitialized worker should not raise - worker.reset_device() - worker.reset_device() - assert worker.device_set is False + worker.run(callable_obj, args, config) def test_finalize_idempotent(self): worker = _ChipWorker() @@ -103,12 +90,21 @@ def test_init_after_finalize_raises(self): worker = _ChipWorker() worker.finalize() with pytest.raises(RuntimeError, match="finalized"): - worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "/nonexistent/libsimpler_log.so") + worker.init( + "/nonexistent/libfoo.so", "/dev/null", "/dev/null", "/nonexistent/libsimpler_log.so", device_id=0 + ) def test_init_with_nonexistent_lib_raises(self): worker = _ChipWorker() with pytest.raises(RuntimeError, match="dlopen"): - worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "/nonexistent/libsimpler_log.so") + worker.init( + 
"/nonexistent/libfoo.so", "/dev/null", "/dev/null", "/nonexistent/libsimpler_log.so", device_id=0 + ) + + def test_init_with_negative_device_id_raises(self): + worker = _ChipWorker() + with pytest.raises(RuntimeError, match="device_id"): + worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "/nonexistent/libsimpler_log.so", -1) # ============================================================================ @@ -125,5 +121,4 @@ def test_import(self): worker = ChipWorker() assert worker.initialized is False - assert worker.device_set is False assert isinstance(PyCallConfig(), CallConfig) diff --git a/tests/ut/py/test_worker/test_bootstrap_context_hw.py b/tests/ut/py/test_worker/test_bootstrap_context_hw.py index 9d4397580..d0fed6f70 100644 --- a/tests/ut/py/test_worker/test_bootstrap_context_hw.py +++ b/tests/ut/py/test_worker/test_bootstrap_context_hw.py @@ -58,7 +58,7 @@ def _bootstrap_rank_entry( # noqa: PLR0913 ) worker = ChipWorker() - worker.init(bins) + worker.init(device_id, bins) result["stage"] = "init" cfg = ChipBootstrapConfig( diff --git a/tests/ut/py/test_worker/test_bootstrap_context_sim.py b/tests/ut/py/test_worker/test_bootstrap_context_sim.py index 90b01dd81..2f58757f3 100644 --- a/tests/ut/py/test_worker/test_bootstrap_context_sim.py +++ b/tests/ut/py/test_worker/test_bootstrap_context_sim.py @@ -87,7 +87,7 @@ def _rank_entry( # noqa: PLR0913 ) worker = ChipWorker() - worker.init(bins) + worker.init(rank, bins) result["stage"] = "init" cfg = ChipBootstrapConfig( @@ -343,7 +343,7 @@ def _store_rank_entry( # noqa: PLR0913 ) worker = ChipWorker() - worker.init(bins) + worker.init(rank, bins) cfg = ChipBootstrapConfig( comm=ChipCommBootstrapConfig( @@ -545,7 +545,7 @@ def _missing_output_staging_rank_entry( ) worker = ChipWorker() - worker.init(bins) + worker.init(0, bins) shm = SharedMemory(name=channel_shm_name) try: diff --git a/tests/ut/py/test_worker/test_platform_comm.py b/tests/ut/py/test_worker/test_platform_comm.py index 
f62093154..0144025a7 100644 --- a/tests/ut/py/test_worker/test_platform_comm.py +++ b/tests/ut/py/test_worker/test_platform_comm.py @@ -12,7 +12,7 @@ This is the Python twin of tests/ut/cpp/test_hccl_comm.cpp. It drives the full comm lifecycle entirely through ChipWorker's public Python API: - ChipWorker.init → set_device → comm_init → comm_alloc_windows + ChipWorker.init(device_id) → comm_init → comm_alloc_windows → comm_get_local_window_base → comm_get_window_size → copy_from (reads back CommContext) → comm_barrier (known-issue tolerant) → comm_destroy → finalize @@ -84,12 +84,9 @@ def _rank_entry( from simpler.task_interface import ChipWorker worker = ChipWorker() - worker.init(bins) + worker.init(device_id, bins) result["stage"] = "init" - worker.set_device(device_id) - result["stage"] = "set_device" - # ChipWorker.comm_init owns ACL bring-up and aclrtStream creation # internally — Python never touches aclInit / aclrtSetDevice / # aclrtCreateStream. This matches the L2-boundary contract in