From 5cdc58fbf0698d3bd650f44f675bc03cde195b2f Mon Sep 17 00:00:00 2001 From: chenshengxin Date: Sat, 9 May 2026 19:38:04 +0800 Subject: [PATCH] feat(frontend): allow local_slot_num on gm_slot_tensor pipe init Previously `pto.aic_initialize_pipe` / `pto.aiv_initialize_pipe` rejected `local_slot_num` whenever they used the address-based `gm_slot_tensor` operand: 'pto.aic_initialize_pipe' op globaltensor pipe init does not use 'local_slot_num' That made it impossible for kernels to mirror the manual-flash-attention TPipe instantiation. The C++ template default in `include/pto/npu/a2a3/TPush.hpp` is `LocalSlotNum=2`, which is also what `kernels/manual/common/flash_atten/fa_performance_kernel.cpp` uses on QK/PV/P pipes. Without an IR-level override, ptoas's address-based pipe lowering ends up emitting `TPipe<..., 8, 8, false>` (LocalSlotNum=SlotNum) because `buildTPipeTokenFromInitOp` in `PTOToEmitC.cpp` falls back to `getSlotNum()` when no `local_slot_num` attribute is present. The mismatch inflates the FFTS event multiplex (8 local x 8 global per pipe instead of 2 local x 8 global) and exhausts `--enable-insert-sync`'s 8-event pool at long sequences, e.g. flash-attention `S1>=4096` on a3. Changes: 1. `lib/PTO/IR/PTO.cpp` - frontend verifier - Drop the blanket "globaltensor pipe init does not use 'local_slot_num'" rejection in `verifyFrontendInitCommon`. - Validate the attribute the same way the legacy `gm_slot_buffer` branch does: must be > 0, must be <= 8 for dir_mask=1/2 or 4 for dir_mask=3. 2. `lib/PTO/IR/PTO.cpp` - InitializeL2G2LPipeOp::verify - Allow `local_slot_num` on the no-`local_addr` path when the `gm_addr` operand is a `!pto.tensor_view<...>` (i.e. the gm_slot_tensor form). The previous rule "'local_slot_num' is only allowed when 'local_addr' is present" was tied to the legacy local-FIFO form and predates the address-based slot model. The `localSlotNum > 0 && <= slot_num` bounds check still runs in both branches. 3. 
`lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp` - In `createFrontendPipe`, propagate `initOp.getLocalSlotNumAttr()` into the lowered `InitializeL2G2LPipeOp` for the gm_slot_tensor branch (was hard-coded to `IntegerAttr{}`). Combined with the existing `buildTPipeTokenFromInitOp` logic in `PTOToEmitC.cpp`, this makes `local_slot_num=N` on the IR flow through to `TPipe<..., N, ...>` in the generated C++. 4. `test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto` - New lit test exercising `local_slot_num=2` on both `aic_initialize_pipe` and `aiv_initialize_pipe` with the gm_slot_tensor form, asserting the lowered TPipe carries `..., 8, 2, ...`. Backwards compatibility: when `local_slot_num` is absent the existing fallback (LocalSlotNum=SlotNum=8 on the address-based path, LocalSlotNum=SlotNum on the legacy path) is preserved, so existing tests under `test/lit/pto/tpush_tpop_globaltensor_*.pto` continue to pass unchanged. Note: the wider question of whether the *default* LocalSlotNum on the address-based path should be 2 (matching the C++ TPipe template default) instead of SlotNum is left open. That change touches several existing lit-test expectations and is more invasive; surfacing the attribute is sufficient to unblock the manual-parity flash-attention kernel in hw-native-sys/pto-isa#117. Motivating downstream: - hw-native-sys/pto-isa#117 - PTO-DSL Flash Attention performance kernel consumed via PTO-ISA/pto-dsl#8 (companion frontend PR exposing `local_slot_num=` on the Python wrapper). 
--- lib/PTO/IR/PTO.cpp | 30 +++++++-- .../PTOLowerFrontendPipeOpsPass.cpp | 6 +- ...sh_tpop_globaltensor_local_slot_num_a3.pto | 66 +++++++++++++++++++ 3 files changed, 94 insertions(+), 8 deletions(-) create mode 100644 test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 4047bc7a6..874ba04e6 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -10677,9 +10677,17 @@ static LogicalResult verifyFrontendInitCommon(InitOpT op, "globaltensor pipe init expects only 'gm_slot_tensor' and no " "'gm_slot_buffer', 'c2v_consumer_buf', or 'v2c_consumer_buf'"); } - if (op.getLocalSlotNumAttr()) - return op.emitOpError( - "globaltensor pipe init does not use 'local_slot_num'"); + if (auto localSlotNumAttr = op.getLocalSlotNumAttr()) { + int32_t localSlotNum = localSlotNumAttr.getInt(); + if (localSlotNum <= 0) + return op.emitOpError("expects 'local_slot_num' to be greater than 0"); + int32_t loweredSlotNum = dirMask == 3 ? 4 : 8; + if (localSlotNum > loweredSlotNum) { + return op.emitOpError() + << "expects 'local_slot_num' to be less than or equal to " + << loweredSlotNum << " for dir_mask = " << static_cast<int>(dirMask); + } + } if (getTargetArch(op.getOperation()) == PTOArch::A5) { return op.emitOpError( "globaltensor pipe entries are supported for a2/a3 l2g2l pipes"); @@ -11437,12 +11445,24 @@ LogicalResult InitializeL2G2LPipeOp::verify() { : std::nullopt))) return failure(); + bool hasGmSlotTensor = + getGmAddr() && isa<TensorViewType>(getGmAddr().getType()); + if (!getLocalAddr()) { if (getPeerLocalAddr()) return emitOpError("'peer_local_addr' requires 'local_addr'"); - if (getLocalSlotNumAttr()) + if (getLocalSlotNumAttr() && !hasGmSlotTensor) return emitOpError( - "'local_slot_num' is only allowed when 'local_addr' is present"); + "'local_slot_num' is only allowed when 'local_addr' is present " + "or the pipe init uses a globaltensor slot"); + if (auto localSlotNumAttr = getLocalSlotNumAttr()) { + int32_t localSlotNum 
= localSlotNumAttr.getInt(); + if (localSlotNum <= 0) + return emitOpError("expects 'local_slot_num' to be greater than 0"); + if (static_cast<uint32_t>(localSlotNum) > getSlotNum()) + return emitOpError( + "expects 'local_slot_num' to be less than or equal to slot_num"); + } return success(); } diff --git a/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp b/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp index 162e7e9b5..fe00dd457 100644 --- a/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp +++ b/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp @@ -126,9 +126,9 @@ static FailureOr<Value> createFrontendPipe(InitOpT initOp, IRRewriter &rewriter, "globaltensor pipe entries are supported for a2/a3 l2g2l pipes"); auto pipe = rewriter.create<InitializeL2G2LPipeOp>( - loc, pipeTy, dirAttr, slotSizeAttr, slotNumAttr, IntegerAttr{}, - IntegerAttr{}, noSplitAttr, initOp.getGmSlotTensor(), Value{}, - Value{}); + loc, pipeTy, dirAttr, slotSizeAttr, slotNumAttr, + initOp.getLocalSlotNumAttr(), IntegerAttr{}, noSplitAttr, + initOp.getGmSlotTensor(), Value{}, Value{}); propagateFrontendIdAttr(initOp, pipe.getOperation(), rewriter); return pipe.getPipe(); } diff --git a/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto b/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto new file mode 100644 index 000000000..22132bb9b --- /dev/null +++ b/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto @@ -0,0 +1,66 @@ +// RUN: ptoas --pto-arch=a3 %s 2>&1 | FileCheck %s + +// Verify that `local_slot_num` is accepted on the gm_slot_tensor (address-based) +// form of pto.aic_initialize_pipe / pto.aiv_initialize_pipe and that it flows +// through to the lowered TPipe<..., LocalSlotNum, ...> template instantiation. +// +// Without an override, ptoas keeps the current default of LocalSlotNum=SlotNum. 
+// With local_slot_num=2, the lowered TPipe must use 2, which matches the +// `LocalSlotNum=2` template default in +// `include/pto/npu/a2a3/TPush.hpp` and the manual flash-attention kernel in +// `kernels/manual/common/flash_atten/fa_performance_kernel.cpp`. + +module { + func.func @cube_kernel( + %gm_slot_buffer : !pto.ptr<f32>, + %src : !pto.tile_buf<16x16xf32>) + attributes {pto.kernel_kind = #pto.kernel_kind<aic>} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %gm_slots = pto.make_tensor_view %gm_slot_buffer, + shape = [%c16, %c16], strides = [%c16, %c1] + : !pto.tensor_view<16x16xf32> + pto.aic_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2} + (gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>) + + %entry = pto.talloc_to_aiv {id = 0, split = 0} + -> !pto.tensor_view<16x16xf32> + %entry_partition = pto.partition_view %entry, + offsets = [%c0, %c0], sizes = [%c16, %c16] + : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32> + pto.tstore ins(%src : !pto.tile_buf<16x16xf32>) + outs(%entry_partition : !pto.partition_tensor_view<16x16xf32>) + pto.tpush_to_aiv(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0} + func.return + } + + func.func @vector_kernel( + %gm_slot_buffer : !pto.ptr<f32>, + %dst : !pto.tile_buf<16x16xf32>) + attributes {pto.kernel_kind = #pto.kernel_kind<aiv>} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %gm_slots = pto.make_tensor_view %gm_slot_buffer, + shape = [%c16, %c16], strides = [%c16, %c1] + : !pto.tensor_view<16x16xf32> + pto.aiv_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2} + (gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>) + + %entry = pto.tpop_from_aic {id = 0, split = 0} + -> !pto.tensor_view<16x16xf32> + %entry_partition = pto.partition_view %entry, + offsets = [%c0, %c0], sizes = [%c16, %c16] + : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32> + 
pto.tload ins(%entry_partition : !pto.partition_tensor_view<16x16xf32>) + outs(%dst : !pto.tile_buf<16x16xf32>) + pto.tfree_from_aic(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0} + func.return + } +} + +// CHECK-LABEL: AICORE void cube_kernel +// CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 2, +// CHECK-LABEL: AICORE void vector_kernel +// CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 2,