From 5cdc58fbf0698d3bd650f44f675bc03cde195b2f Mon Sep 17 00:00:00 2001 From: chenshengxin Date: Sat, 9 May 2026 19:38:04 +0800 Subject: [PATCH] feat(frontend): allow local_slot_num on gm_slot_tensor pipe init Previously `pto.aic_initialize_pipe` / `pto.aiv_initialize_pipe` rejected `local_slot_num` whenever they used the address-based `gm_slot_tensor` operand: 'pto.aic_initialize_pipe' op globaltensor pipe init does not use 'local_slot_num' That made it impossible for kernels to mirror the manual-flash-attention TPipe instantiation. The C++ template default in `include/pto/npu/a2a3/TPush.hpp` is `LocalSlotNum=2`, which is also what `kernels/manual/common/flash_atten/fa_performance_kernel.cpp` uses on QK/PV/P pipes. Without an IR-level override, ptoas's address-based pipe lowering ends up emitting `TPipe<..., 8, 8, false>` (LocalSlotNum=SlotNum) because `buildTPipeTokenFromInitOp` in `PTOToEmitC.cpp` falls back to `getSlotNum()` when no `local_slot_num` attribute is present. The mismatch inflates the FFTS event multiplex (8 local x 8 global per pipe instead of 2 local x 8 global) and exhausts `--enable-insert-sync`'s 8-event pool at long sequences, e.g. flash-attention `S1>=4096` on a3. Changes: 1. `lib/PTO/IR/PTO.cpp` - frontend verifier - Drop the blanket "globaltensor pipe init does not use 'local_slot_num'" rejection in `verifyFrontendInitCommon`. - Validate the attribute the same way the legacy `gm_slot_buffer` branch does: must be > 0, must be <= 8 for dir_mask=1/2 or 4 for dir_mask=3. 2. `lib/PTO/IR/PTO.cpp` - InitializeL2G2LPipeOp::verify - Allow `local_slot_num` on the no-`local_addr` path when the `gm_addr` operand is a `!pto.tensor_view<...>` (i.e. the gm_slot_tensor form). The previous rule "'local_slot_num' is only allowed when 'local_addr' is present" was tied to the legacy local-FIFO form and predates the address-based slot model. The `localSlotNum > 0 && <= slot_num` bounds check still runs in both branches. 3. 
`lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp` - In `createFrontendPipe`, propagate `initOp.getLocalSlotNumAttr()` into the lowered `InitializeL2G2LPipeOp` for the gm_slot_tensor branch (was hard-coded to `IntegerAttr{}`). Combined with the existing `buildTPipeTokenFromInitOp` logic in `PTOToEmitC.cpp`, this makes `local_slot_num=N` on the IR flow through to `TPipe<..., N, ...>` in the generated C++. 4. `test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto` - New lit test exercising `local_slot_num=2` on both `aic_initialize_pipe` and `aiv_initialize_pipe` with the gm_slot_tensor form, asserting the lowered TPipe carries `..., 8, 2, ...`. Backwards compatibility: when `local_slot_num` is absent the existing fallback (LocalSlotNum=SlotNum=8 on the address-based path, LocalSlotNum=SlotNum on the legacy path) is preserved, so existing tests under `test/lit/pto/tpush_tpop_globaltensor_*.pto` continue to pass unchanged. Note: the wider question of whether the *default* LocalSlotNum on the address-based path should be 2 (matching the C++ TPipe template default) instead of SlotNum is left open. That change touches several existing lit-test expectations and is more invasive; surfacing the attribute is sufficient to unblock the manual-parity flash-attention kernel in hw-native-sys/pto-isa#117. Motivating downstream: - hw-native-sys/pto-isa#117 - PTO-DSL Flash Attention performance kernel consumed via PTO-ISA/pto-dsl#8 (companion frontend PR exposing `local_slot_num=` on the Python wrapper). 
--- lib/PTO/IR/PTO.cpp | 30 +++++++-- .../PTOLowerFrontendPipeOpsPass.cpp | 6 +- ...sh_tpop_globaltensor_local_slot_num_a3.pto | 66 +++++++++++++++++++ 3 files changed, 94 insertions(+), 8 deletions(-) create mode 100644 test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 4047bc7a6..874ba04e6 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -10677,9 +10677,17 @@ static LogicalResult verifyFrontendInitCommon(InitOpT op, "globaltensor pipe init expects only 'gm_slot_tensor' and no " "'gm_slot_buffer', 'c2v_consumer_buf', or 'v2c_consumer_buf'"); } - if (op.getLocalSlotNumAttr()) - return op.emitOpError( - "globaltensor pipe init does not use 'local_slot_num'"); + if (auto localSlotNumAttr = op.getLocalSlotNumAttr()) { + int32_t localSlotNum = localSlotNumAttr.getInt(); + if (localSlotNum <= 0) + return op.emitOpError("expects 'local_slot_num' to be greater than 0"); + int32_t loweredSlotNum = dirMask == 3 ? 4 : 8; + if (localSlotNum > loweredSlotNum) { + return op.emitOpError() + << "expects 'local_slot_num' to be less than or equal to " + << loweredSlotNum << " for dir_mask = " << static_cast<int>(dirMask); + } + } if (getTargetArch(op.getOperation()) == PTOArch::A5) { return op.emitOpError( "globaltensor pipe entries are supported for a2/a3 l2g2l pipes"); @@ -11437,12 +11445,24 @@ LogicalResult InitializeL2G2LPipeOp::verify() { : std::nullopt))) return failure(); + bool hasGmSlotTensor = + getGmAddr() && isa<TensorViewType>(getGmAddr().getType()); + if (!getLocalAddr()) { if (getPeerLocalAddr()) return emitOpError("'peer_local_addr' requires 'local_addr'"); - if (getLocalSlotNumAttr()) + if (getLocalSlotNumAttr() && !hasGmSlotTensor) return emitOpError( - "'local_slot_num' is only allowed when 'local_addr' is present"); + "'local_slot_num' is only allowed when 'local_addr' is present " + "or the pipe init uses a globaltensor slot"); + if (auto localSlotNumAttr = getLocalSlotNumAttr()) { + int32_t localSlotNum 
= localSlotNumAttr.getInt(); + if (localSlotNum <= 0) + return emitOpError("expects 'local_slot_num' to be greater than 0"); + if (static_cast<uint32_t>(localSlotNum) > getSlotNum()) + return emitOpError( + "expects 'local_slot_num' to be less than or equal to slot_num"); + } return success(); } diff --git a/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp b/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp index 162e7e9b5..fe00dd457 100644 --- a/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp +++ b/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp @@ -126,9 +126,9 @@ static FailureOr<Value> createFrontendPipe(InitOpT initOp, IRRewriter &rewriter, "globaltensor pipe entries are supported for a2/a3 l2g2l pipes"); auto pipe = rewriter.create<InitializeL2G2LPipeOp>( - loc, pipeTy, dirAttr, slotSizeAttr, slotNumAttr, IntegerAttr{}, - IntegerAttr{}, noSplitAttr, initOp.getGmSlotTensor(), Value{}, - Value{}); + loc, pipeTy, dirAttr, slotSizeAttr, slotNumAttr, + initOp.getLocalSlotNumAttr(), IntegerAttr{}, noSplitAttr, + initOp.getGmSlotTensor(), Value{}, Value{}); propagateFrontendIdAttr(initOp, pipe.getOperation(), rewriter); return pipe.getPipe(); } diff --git a/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto b/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto new file mode 100644 index 000000000..22132bb9b --- /dev/null +++ b/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto @@ -0,0 +1,66 @@ +// RUN: ptoas --pto-arch=a3 %s 2>&1 | FileCheck %s + +// Verify that `local_slot_num` is accepted on the gm_slot_tensor (address-based) +// form of pto.aic_initialize_pipe / pto.aiv_initialize_pipe and that it flows +// through to the lowered TPipe<..., LocalSlotNum, ...> template instantiation. +// +// Without an override, ptoas keeps the current default of LocalSlotNum=SlotNum. 
+// With local_slot_num=2, the lowered TPipe must use 2, which matches the +// `LocalSlotNum=2` template default in +// `include/pto/npu/a2a3/TPush.hpp` and the manual flash-attention kernel in +// `kernels/manual/common/flash_atten/fa_performance_kernel.cpp`. + +module { + func.func @cube_kernel( + %gm_slot_buffer : !pto.ptr<f32>, + %src : !pto.tile_buf<16x16xf32>) + attributes {pto.kernel_kind = #pto.kernel_kind<aic>} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %gm_slots = pto.make_tensor_view %gm_slot_buffer, + shape = [%c16, %c16], strides = [%c16, %c1] + : !pto.tensor_view<16x16xf32> + pto.aic_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2} + (gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>) + + %entry = pto.talloc_to_aiv {id = 0, split = 0} + -> !pto.tensor_view<16x16xf32> + %entry_partition = pto.partition_view %entry, + offsets = [%c0, %c0], sizes = [%c16, %c16] + : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32> + pto.tstore ins(%src : !pto.tile_buf<16x16xf32>) + outs(%entry_partition : !pto.partition_tensor_view<16x16xf32>) + pto.tpush_to_aiv(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0} + func.return + } + + func.func @vector_kernel( + %gm_slot_buffer : !pto.ptr<f32>, + %dst : !pto.tile_buf<16x16xf32>) + attributes {pto.kernel_kind = #pto.kernel_kind<aiv>} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %gm_slots = pto.make_tensor_view %gm_slot_buffer, + shape = [%c16, %c16], strides = [%c16, %c1] + : !pto.tensor_view<16x16xf32> + pto.aiv_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2} + (gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>) + + %entry = pto.tpop_from_aic {id = 0, split = 0} + -> !pto.tensor_view<16x16xf32> + %entry_partition = pto.partition_view %entry, + offsets = [%c0, %c0], sizes = [%c16, %c16] + : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32> + 
pto.tload ins(%entry_partition : !pto.partition_tensor_view<16x16xf32>) + outs(%dst : !pto.tile_buf<16x16xf32>) + pto.tfree_from_aic(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0} + func.return + } +} + +// CHECK-LABEL: AICORE void cube_kernel +// CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 2, +// CHECK-LABEL: AICORE void vector_kernel +// CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 2,