Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions lib/PTO/IR/PTO.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10677,9 +10677,17 @@ static LogicalResult verifyFrontendInitCommon(InitOpT op,
"globaltensor pipe init expects only 'gm_slot_tensor' and no "
"'gm_slot_buffer', 'c2v_consumer_buf', or 'v2c_consumer_buf'");
}
if (op.getLocalSlotNumAttr())
return op.emitOpError(
"globaltensor pipe init does not use 'local_slot_num'");
if (auto localSlotNumAttr = op.getLocalSlotNumAttr()) {
int32_t localSlotNum = localSlotNumAttr.getInt();
if (localSlotNum <= 0)
return op.emitOpError("expects 'local_slot_num' to be greater than 0");
int32_t loweredSlotNum = dirMask == 3 ? 4 : 8;
if (localSlotNum > loweredSlotNum) {
return op.emitOpError()
<< "expects 'local_slot_num' to be less than or equal to "
<< loweredSlotNum << " for dir_mask = " << static_cast<int>(dirMask);
}
}
Comment on lines +10680 to +10690
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The validation logic for local_slot_num (including the bounds check and the calculation of loweredSlotNum) is duplicated between the hasGlobalSlotTensor branch and the local pipe branch (lines 10709-10719). This logic should be refactored into a common block at the beginning of the function to improve maintainability and ensure consistency.

if (getTargetArch(op.getOperation()) == PTOArch::A5) {
return op.emitOpError(
"globaltensor pipe entries are supported for a2/a3 l2g2l pipes");
Expand Down Expand Up @@ -11437,12 +11445,24 @@ LogicalResult InitializeL2G2LPipeOp::verify() {
: std::nullopt)))
return failure();

bool hasGmSlotTensor =
getGmAddr() && isa<TensorViewType>(getGmAddr().getType());

if (!getLocalAddr()) {
if (getPeerLocalAddr())
return emitOpError("'peer_local_addr' requires 'local_addr'");
if (getLocalSlotNumAttr())
if (getLocalSlotNumAttr() && !hasGmSlotTensor)
return emitOpError(
"'local_slot_num' is only allowed when 'local_addr' is present");
"'local_slot_num' is only allowed when 'local_addr' is present "
"or the pipe init uses a globaltensor slot");
if (auto localSlotNumAttr = getLocalSlotNumAttr()) {
int32_t localSlotNum = localSlotNumAttr.getInt();
if (localSlotNum <= 0)
return emitOpError("expects 'local_slot_num' to be greater than 0");
if (static_cast<uint32_t>(localSlotNum) > getSlotNum())
return emitOpError(
"expects 'local_slot_num' to be less than or equal to slot_num");
}
Comment on lines +11458 to +11465
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The bounds check for local_slot_num is duplicated. It is added here for the case where local_addr is absent but gm_slot_tensor is present, and it already exists later in the function (lines 11469-11476) for the case where local_addr is present. This logic should be unified before the if (!getLocalAddr()) block to avoid redundancy.

return success();
}

Expand Down
6 changes: 3 additions & 3 deletions lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,9 @@ static FailureOr<Value> createFrontendPipe(InitOpT initOp, IRRewriter &rewriter,
"globaltensor pipe entries are supported for a2/a3 l2g2l pipes");

auto pipe = rewriter.create<InitializeL2G2LPipeOp>(
loc, pipeTy, dirAttr, slotSizeAttr, slotNumAttr, IntegerAttr{},
IntegerAttr{}, noSplitAttr, initOp.getGmSlotTensor(), Value{},
Value{});
loc, pipeTy, dirAttr, slotSizeAttr, slotNumAttr,
initOp.getLocalSlotNumAttr(), IntegerAttr{}, noSplitAttr,
initOp.getGmSlotTensor(), Value{}, Value{});
propagateFrontendIdAttr(initOp, pipe.getOperation(), rewriter);
return pipe.getPipe();
}
Expand Down
66 changes: 66 additions & 0 deletions test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// RUN: ptoas --pto-arch=a3 %s 2>&1 | FileCheck %s

// Verify that `local_slot_num` is accepted on the gm_slot_tensor (address-based)
// form of pto.aic_initialize_pipe / pto.aiv_initialize_pipe and that it flows
// through to the lowered TPipe<..., LocalSlotNum, ...> template instantiation.
//
// Without an override, ptoas keeps the current default of LocalSlotNum=SlotNum.
// With local_slot_num=2, the lowered TPipe must use 2, which matches the
// `LocalSlotNum=2` template default in
// `include/pto/npu/a2a3/TPush.hpp` and the manual flash-attention kernel in
// `kernels/manual/common/flash_atten/fa_performance_kernel.cpp`.

module {
// Cube-side kernel: producer of the C2V pipe. Allocates a pipe entry,
// stores a tile into it, and pushes it toward the AIV (vector) side.
func.func @cube_kernel(
%gm_slot_buffer : !pto.ptr<f32>,
%src : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>)
attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
// Wrap the raw GM pointer as a 16x16 row-major tensor view; this is the
// globaltensor slot backing store handed to the pipe init below.
%gm_slots = pto.make_tensor_view %gm_slot_buffer,
shape = [%c16, %c16], strides = [%c16, %c1]
: !pto.tensor_view<16x16xf32>
// The case under test: gm_slot_tensor (address-based) pipe init carrying an
// explicit local_slot_num = 2 override (dir_mask = 1 selects the C2V pipe).
pto.aic_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2}
(gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>)

// Producer sequence: acquire an entry, fill a 16x16 partition of it from
// %src, then publish the entry to the vector side.
%entry = pto.talloc_to_aiv {id = 0, split = 0}
-> !pto.tensor_view<16x16xf32>
%entry_partition = pto.partition_view %entry,
offsets = [%c0, %c0], sizes = [%c16, %c16]
: !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32>
pto.tstore ins(%src : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>)
outs(%entry_partition : !pto.partition_tensor_view<16x16xf32>)
pto.tpush_to_aiv(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0}
func.return
}

// Vector-side kernel: consumer of the same pipe. Mirrors @cube_kernel with
// the aiv init form plus the matching pop/load/free sequence.
func.func @vector_kernel(
%gm_slot_buffer : !pto.ptr<f32>,
%dst : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>)
attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%gm_slots = pto.make_tensor_view %gm_slot_buffer,
shape = [%c16, %c16], strides = [%c16, %c1]
: !pto.tensor_view<16x16xf32>
// Same attribute set as the cube side; both inits must agree on the
// local_slot_num override for the lowered TPipe instantiations to match.
pto.aiv_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2}
(gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>)

// Consumer sequence: pop a published entry, copy a 16x16 partition of it
// into %dst, then return the entry to the producer side.
%entry = pto.tpop_from_aic {id = 0, split = 0}
-> !pto.tensor_view<16x16xf32>
%entry_partition = pto.partition_view %entry,
offsets = [%c0, %c0], sizes = [%c16, %c16]
: !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32>
pto.tload ins(%entry_partition : !pto.partition_tensor_view<16x16xf32>)
outs(%dst : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>)
pto.tfree_from_aic(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0}
func.return
}
}

// CHECK-LABEL: AICORE void cube_kernel
// CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 2,
// CHECK-LABEL: AICORE void vector_kernel
// CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 2,
Loading