Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions docs/chip-level-arch.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ runner.finalize();

```c
DeviceContextHandle ctx = create_device_context();
set_device(ctx, device_id);
simpler_init(ctx, device_id, log_level, log_info_v); // attach + log config
size_t size = get_runtime_size();
run_runtime(ctx, runtime, callable, args, block_dim,
aicpu_thread_num, device_id,
Expand All @@ -129,8 +129,8 @@ destroy_device_context(ctx);
from simpler.task_interface import ChipWorker, ChipCallable, ChipStorageTaskArgs, CallConfig

worker = ChipWorker()
worker.init(host_lib_path, aicpu_path, aicore_path, sim_context_lib_path="")
worker.set_device(device_id)
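# bins: RuntimeBinaries exposing host_path / aicpu_path / aicore_path /
# simpler_log_path / sim_context_path (e.g. from RuntimeBuilder.get_binaries).
# init() also attaches the calling thread to device_id.
worker.init(device_id, bins)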

config = CallConfig()
config.block_dim = 24
Expand Down Expand Up @@ -171,20 +171,21 @@ Python test_*.py (SceneTestCase)
├─→ KernelCompiler(platform).compile_orchestration(runtime, source) → orch .so
└─→ ChipWorker()
└─→ init(host_path, aicpu_path, aicore_path)
└─→ dlopen(host.so) → resolve C API symbols via dlsym
└─→ init(host_path, aicpu_path, aicore_path, simpler_log_path, device_id)
├─→ dlopen(host.so) → resolve C API symbols via dlsym
├─→ create_device_context() → DeviceContextHandle
└─→ simpler_init(ctx, device_id, log_level, log_info_v)
└─→ DeviceRunner::attach_current_thread(device_id)
├─→ rtSetDevice(device_id) on onboard
└─→ pto_cpu_sim_bind+acquire on sim
```

### 2. Initialization Phase

```text
worker.set_device(device_id)
└─→ create_device_context() → DeviceContextHandle
└─→ set_device(ctx, device_id)
├─→ Initialize device (CANN on hardware, no-op on sim)
└─→ Allocate device streams
```
The thread that called `init()` is now attached to `device_id`. Streams are
created lazily on the first `run()` call (`prepare_run_context`). Subsequent
device-ops (`malloc`, `copy_to`, `copy_from`, `free`) reuse that per-thread
binding — they must be called from the same thread that called `init()`.

### 3. Execution Phase

Expand Down
28 changes: 12 additions & 16 deletions docs/dynamic-linking.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,17 +255,18 @@ different tasks have different configurations.

## Execution Lifecycle

### Simulation (in-process, per-task init/reset)
### Simulation (in-process, per-task)

```text
ChipWorker.init(host_path, aicpu_path, aicore_path)
ChipWorker.init(host_path, aicpu_path, aicore_path, simpler_log_path, device_id)
dlopen(host_runtime.so, RTLD_GLOBAL)
dlsym: create_device_context, destroy_device_context, set_device,
dlsym: create_device_context, destroy_device_context, simpler_init,
get_runtime_size, run_runtime, finalize_device

ChipWorker.set_device(device_id)
create_device_context() → DeviceContextHandle
set_device(ctx, device_id)
simpler_init(ctx, device_id, log_level, log_info_v)
DeviceRunner::attach_current_thread(device_id)
pto_cpu_sim_bind_device(device_id)
pto_cpu_sim_acquire_device(device_id)

ChipWorker.run(callable, args, config)
run_runtime(ctx, buf, callable, args, ...)
Expand All @@ -280,12 +281,9 @@ ChipWorker.run(callable, args, config)
validate_runtime_impl(r) copy results, remove kernels
r->~Runtime()

ChipWorker.reset_device()
ChipWorker.finalize()
finalize_device(ctx)
destroy_device_context(ctx)

ChipWorker.finalize()
reset_device() (if needed)
dlclose(host_runtime.so) -fno-gnu-unique ensures real unload
```

Expand All @@ -294,11 +292,11 @@ ChipWorker.finalize()
```text
device_worker_main(device_id)
for each runtime_group:
ChipWorker.init(host_path, aicpu_path, aicore_path)
ChipWorker.init(host_path, aicpu_path, aicore_path, simpler_log_path, device_id)
dlopen(host_runtime.so, RTLD_GLOBAL)
ChipWorker.set_device(device_id)
create_device_context()
set_device(ctx, device_id) rtSetDevice()
simpler_init(ctx, device_id, log_level, log_info_v)
DeviceRunner::attach_current_thread(device_id) rtSetDevice()

for each task in group:
ChipWorker.run(callable, args, config)
Expand All @@ -312,10 +310,8 @@ device_worker_main(device_id)
launch_aicore_kernel() rtRegisterAllKernel + rtKernelLaunch
validate_runtime_impl() rtMemcpy results back to host

ChipWorker.reset_device()
ChipWorker.finalize()
finalize_device(ctx) rtDeviceReset()
destroy_device_context(ctx)

ChipWorker.finalize()
dlclose(host_runtime.so)
```
9 changes: 3 additions & 6 deletions docs/getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,18 +159,15 @@ from simpler_setup.runtime_builder import RuntimeBuilder
builder = RuntimeBuilder(platform="a2a3sim")
binaries = builder.get_binaries("tensormap_and_ringbuffer")

# Create worker and initialize with platform binaries
# Create worker and initialize with platform binaries (attaches the calling
# thread to device 0 internally — no separate set_device step required)
worker = ChipWorker()
worker.init(host_path=str(binaries.host_path),
aicpu_path=str(binaries.aicpu_path),
aicore_path=str(binaries.aicore_path))
worker.set_device(device_id=0)
worker.init(device_id=0, bins=binaries)

# Execute callable on device
worker.run(chip_callable, orch_args, block_dim=24)

# Cleanup
worker.reset_device()
worker.finalize()
```

Expand Down
2 changes: 1 addition & 1 deletion examples/workers/l2/hello_worker/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
1. Your venv can import ``simpler.Worker`` (i.e. the nanobind extension is built).
2. Pre-built runtime binaries exist under ``build/lib/<platform>/tensormap_and_ringbuffer/``
so that ``RuntimeBuilder`` can find them on ``Worker.init()``.
3. ``set_device()`` + ACL init on the chosen platform works end-to-end.
3. ``ChipWorker.init(device_id)`` + ACL init on the chosen platform works end-to-end.
If this example runs cleanly, moving on to ``vector_add/`` (which adds a real
kernel, TaskArgs, and a golden check) is safe.
Expand Down
18 changes: 9 additions & 9 deletions examples/workers/l2/worker_malloc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ the `Worker` API in isolation:

There is **no `worker.run()` call** anywhere — that's deliberate. On real
hardware the CANN device context is per-thread, so `rtMalloc` only succeeds
on a thread previously bound by `rtSetDevice`. `Worker.init()` is the only
thing that performs that bind for the Python caller thread; if its `set_device`
path is broken, `worker.malloc()` fails with CANN error 107002 *before* any
kernel ever runs. Every example that does `init() -> run() -> ...` accidentally
masks that bug because the run path re-binds the device on the same thread
just before allocations happen. This example doesn't.
on a thread previously bound by `rtSetDevice`. `Worker.init(...)` is the
only thing that performs that bind for the Python caller thread; if that
path is broken, `worker.malloc()` fails with CANN error 107002 *before*
any kernel ever runs. Every example that does `init() -> run() -> ...`
accidentally masks that bug because the run path re-binds the device on the
same thread just before allocations happen. This example doesn't.

## Run

Expand Down Expand Up @@ -45,6 +45,6 @@ Same for `a5sim` / `a5`.

If you see `rtMalloc failed: 107002` on `a2a3` / `a5` (but the same example
passes on `a2a3sim` / `a5sim`), the per-thread `rtSetDevice` is not happening
during `Worker.init()` — see `src/{arch}/platform/onboard/host/pto_runtime_c_api.cpp`
and confirm the C-API `set_device` actually calls
`DeviceRunner::attach_current_thread`.
during `Worker.init()` — see `simpler_init` in
`src/{arch}/platform/onboard/host/pto_runtime_c_api.cpp` and confirm it
forwards to `DeviceRunner::attach_current_thread`.
15 changes: 8 additions & 7 deletions examples/workers/l2/worker_malloc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@

Why a standalone example for these? On real hardware (a2a3 / a5 onboard) the
CANN device context is per-thread, so ``rtMalloc`` only succeeds on a thread
that previously executed ``rtSetDevice``. Until you call ``worker.run(...)``
the only thing that has bound the device on the calling Python thread is
``Worker.init() -> ChipWorker::set_device(...)``. If that path is broken,
this example fails at the first ``worker.malloc`` with CANN error 107002.
``vector_add`` happens to mask that bug because its first malloc lands on
the same thread that ``run()`` later attaches; this example doesn't ``run``
at all, so it's a focused regression check for the standalone alloc path.
that previously executed ``rtSetDevice``. ``Worker.init(...)`` is now the
single point that performs that bind for the Python caller thread (folded
down from the previous explicit ``ChipWorker::set_device``). If that path
breaks, this example fails at the first ``worker.malloc`` with CANN error
107002. ``vector_add`` happens to mask such a bug because its first malloc
lands on the same thread that ``run()`` later attaches; this example doesn't
``run`` at all, so it's a focused regression check for the standalone alloc
path.

Run:
python examples/workers/l2/worker_malloc/main.py -p a2a3sim -d 0
Expand Down
7 changes: 2 additions & 5 deletions python/bindings/task_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -616,11 +616,9 @@ NB_MODULE(_task_interface, m) {
.def(nb::init<>())
.def(
"init", &ChipWorker::init, nb::arg("host_lib_path"), nb::arg("aicpu_path"), nb::arg("aicore_path"),
nb::arg("simpler_log_lib_path"), nb::arg("sim_context_lib_path") = "", nb::arg("log_level") = 1,
nb::arg("log_info_v") = 5
nb::arg("simpler_log_lib_path"), nb::arg("device_id"), nb::arg("sim_context_lib_path") = "",
nb::arg("log_level") = 1, nb::arg("log_info_v") = 5
)
.def("set_device", &ChipWorker::set_device, nb::arg("device_id"))
.def("reset_device", &ChipWorker::reset_device)
.def("finalize", &ChipWorker::finalize)
.def(
"run",
Expand Down Expand Up @@ -650,7 +648,6 @@ NB_MODULE(_task_interface, m) {
)
.def_prop_ro("device_id", &ChipWorker::device_id)
.def_prop_ro("initialized", &ChipWorker::initialized)
.def_prop_ro("device_set", &ChipWorker::device_set)
.def("malloc", &ChipWorker::malloc, nb::arg("size"))
.def("free", &ChipWorker::free, nb::arg("ptr"))
.def("copy_to", &ChipWorker::copy_to, nb::arg("dst"), nb::arg("src"), nb::arg("size"))
Expand Down
50 changes: 20 additions & 30 deletions python/simpler/task_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,30 +230,29 @@ class ChipContext:
class ChipWorker:
"""Unified execution interface wrapping the host runtime C API.

The runtime library is bound once via init() and cannot be changed.
Devices can be set and reset independently.
The runtime library and target device are bound once via init() and
cannot be changed.

Usage::

worker = ChipWorker()
worker.init(host_path="build/lib/.../host.so",
aicpu_path="build/lib/.../aicpu.so",
aicore_path="build/lib/.../aicore.o")
worker.set_device(device_id=0)
worker.init(device_id=0, bins=bins)
worker.run(chip_callable, orch_args, block_dim=24)
worker.reset_device()
worker.finalize()
"""

def __init__(self):
self._impl = _ChipWorker()

def init(self, bins, log_level=None, log_info_v=None):
"""Load host runtime library and cache platform binaries.
def init(self, device_id, bins, log_level=None, log_info_v=None):
"""Attach the calling thread to ``device_id``, load the host runtime
library, and cache platform binaries.

Can only be called once — the runtime cannot be changed.
Can only be called once — the runtime and device cannot be changed
after init.

Args:
device_id: NPU device ID to attach the calling thread to.
bins: A `simpler_setup.runtime_builder.RuntimeBinaries` (or any
object exposing host_path / aicpu_path / aicore_path /
simpler_log_path / sim_context_path).
Expand All @@ -279,25 +278,12 @@ def init(self, bins, log_level=None, log_info_v=None):
str(bins.aicpu_path),
str(bins.aicore_path),
str(bins.simpler_log_path),
int(device_id),
str(bins.sim_context_path) if bins.sim_context_path else "",
log_level,
log_info_v,
)

def set_device(self, device_id):
"""Set the target NPU device.

Requires init() first. Can be called after reset_device() to switch devices.

Args:
device_id: NPU device ID.
"""
self._impl.set_device(device_id)

def reset_device(self):
"""Release device resources. The runtime binding remains intact."""
self._impl.reset_device()

def finalize(self):
"""Tear down everything: device resources and runtime library.

Expand Down Expand Up @@ -389,9 +375,13 @@ def bootstrap_context( # noqa: PLR0912 -- config validation + comm setup + wind
cfg: ChipBootstrapConfig,
channel: Optional[ChipBootstrapChannel] = None,
) -> ChipBootstrapResult:
"""One-shot per-chip bootstrap: set device, build communicator, slice window,
"""One-shot per-chip bootstrap: build communicator, slice window,
stage inputs from host shared memory, and (optionally) publish the result.

The target device must already be attached via ``init(device_id, bins)``
before invoking this method; ``device_id`` is supplied here only to
catch a caller that wired up the wrong device on the wrong worker.

Runs inside a forked chip child. If ``channel`` is provided (the
Worker-orchestrated integration path), the result is written as
SUCCESS or — on any exception — as ERROR (code=1,
Expand Down Expand Up @@ -428,7 +418,11 @@ def bootstrap_context( # noqa: PLR0912 -- config validation + comm setup + wind
f"matching HostBufferStaging in host_outputs; none found"
) from None

self.set_device(device_id)
if self.device_id != device_id:
raise RuntimeError(
f"bootstrap_context(device_id={device_id}) called on a ChipWorker "
f"already initialized for device_id={self.device_id}"
)

device_ctx = 0
local_base = 0
Expand Down Expand Up @@ -517,7 +511,3 @@ def device_id(self):
@property
def initialized(self):
return self._impl.initialized

@property
def device_set(self):
return self._impl.device_set
12 changes: 5 additions & 7 deletions python/simpler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,7 @@ def _chip_process_loop(

try:
cw = ChipWorker()
cw.init(bins, log_level=log_level, log_info_v=log_info_v)
cw.set_device(device_id)
cw.init(device_id, bins, log_level=log_level, log_info_v=log_info_v)
except Exception as e:
_tb.print_exc()
# Write the message so any parent reader that *does* inspect this
Expand Down Expand Up @@ -365,7 +364,7 @@ def _chip_process_loop_with_bootstrap( # noqa: PLR0912

cw = ChipWorker()
try:
cw.init(bins, log_level=log_level, log_info_v=log_info_v)
cw.init(device_id, bins, log_level=log_level, log_info_v=log_info_v)
except Exception as e: # noqa: BLE001
traceback.print_exc()
channel.write_error(1, f"{type(e).__name__}: chip_worker.init: {e}")
Expand Down Expand Up @@ -662,8 +661,7 @@ def _init_level2(self) -> None:
binaries = builder.get_binaries(runtime, build=self._config.get("build", False))

self._chip_worker = ChipWorker()
self._chip_worker.init(binaries)
self._chip_worker.set_device(device_id)
self._chip_worker.init(device_id, binaries)

def _init_hierarchical(self) -> None:
device_ids = self._config.get("device_ids", [])
Expand All @@ -687,8 +685,8 @@ def _init_hierarchical(self) -> None:
binaries = builder.get_binaries(runtime, build=self._config.get("build", False))

# Stash the full RuntimeBinaries so forked chip children can
# construct a ChipWorker with one call (`cw.init(bins)`) instead
# of taking ~10 path strings via positional args. Forked-child
# construct a ChipWorker with one call (`cw.init(device_id, bins)`)
# instead of taking ~10 path strings via positional args. Forked-child
# invocation is `os.fork()` + direct function call, so no pickle
# barrier — the bins object is just a Python value passed through.
self._l3_bins = binaries
Expand Down
3 changes: 1 addition & 2 deletions simpler_setup/scene_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,8 +808,7 @@ def _create_worker(cls, platform, device_id=0, build=False):

bins = cls._get_binaries(platform, build=build)
w = ChipWorker()
w.init(bins)
w.set_device(device_id)
w.init(device_id, bins)
return w

# ------------------------------------------------------------------
Expand Down
10 changes: 7 additions & 3 deletions src/a2a3/platform/include/host/profiling_common/profiler_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,12 +306,16 @@ class ProfilerBase {
using ReadyEntry = typename Module::ReadyEntry;
using ReadyBufferInfo = typename Module::ReadyBufferInfo;

ProfilerBase() = default;
~ProfilerBase() = default;

ProfilerBase(const ProfilerBase &) = delete;
ProfilerBase &operator=(const ProfilerBase &) = delete;

private:
// CRTP base — only the Derived class may construct/destruct.
friend Derived;
ProfilerBase() = default;
~ProfilerBase() = default;

public:
/**
* Stash the memory context produced by Derived::init(). Must be called on
* the init() success path; if init aborts before this, start(tf) is a
Expand Down
Loading
Loading