diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 4047bc7a6..874ba04e6 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -10677,9 +10677,17 @@ static LogicalResult verifyFrontendInitCommon(InitOpT op, "globaltensor pipe init expects only 'gm_slot_tensor' and no " "'gm_slot_buffer', 'c2v_consumer_buf', or 'v2c_consumer_buf'"); } - if (op.getLocalSlotNumAttr()) - return op.emitOpError( - "globaltensor pipe init does not use 'local_slot_num'"); + if (auto localSlotNumAttr = op.getLocalSlotNumAttr()) { + int32_t localSlotNum = localSlotNumAttr.getInt(); + if (localSlotNum <= 0) + return op.emitOpError("expects 'local_slot_num' to be greater than 0"); + int32_t loweredSlotNum = dirMask == 3 ? 4 : 8; + if (localSlotNum > loweredSlotNum) { + return op.emitOpError() + << "expects 'local_slot_num' to be less than or equal to " + << loweredSlotNum << " for dir_mask = " << static_cast<int>(dirMask); + } + } if (getTargetArch(op.getOperation()) == PTOArch::A5) { return op.emitOpError( "globaltensor pipe entries are supported for a2/a3 l2g2l pipes"); @@ -11437,12 +11445,24 @@ LogicalResult InitializeL2G2LPipeOp::verify() { : std::nullopt))) return failure(); + bool hasGmSlotTensor = + getGmAddr() && isa<TensorViewType>(getGmAddr().getType()); + if (!getLocalAddr()) { if (getPeerLocalAddr()) return emitOpError("'peer_local_addr' requires 'local_addr'"); - if (getLocalSlotNumAttr()) + if (getLocalSlotNumAttr() && !hasGmSlotTensor) return emitOpError( - "'local_slot_num' is only allowed when 'local_addr' is present"); + "'local_slot_num' is only allowed when 'local_addr' is present " + "or the pipe init uses a globaltensor slot"); + if (auto localSlotNumAttr = getLocalSlotNumAttr()) { + int32_t localSlotNum = localSlotNumAttr.getInt(); + if (localSlotNum <= 0) + return emitOpError("expects 'local_slot_num' to be greater than 0"); + if (static_cast<uint32_t>(localSlotNum) > getSlotNum()) + return emitOpError( + "expects 'local_slot_num' to be less than or equal to slot_num"); + } return 
success(); } diff --git a/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp b/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp index 162e7e9b5..fe00dd457 100644 --- a/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp +++ b/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp @@ -126,9 +126,9 @@ static FailureOr<Value> createFrontendPipe(InitOpT initOp, IRRewriter &rewriter, "globaltensor pipe entries are supported for a2/a3 l2g2l pipes"); auto pipe = rewriter.create<PipeOpT>( - loc, pipeTy, dirAttr, slotSizeAttr, slotNumAttr, IntegerAttr{}, - IntegerAttr{}, noSplitAttr, initOp.getGmSlotTensor(), Value{}, - Value{}); + loc, pipeTy, dirAttr, slotSizeAttr, slotNumAttr, + initOp.getLocalSlotNumAttr(), IntegerAttr{}, noSplitAttr, + initOp.getGmSlotTensor(), Value{}, Value{}); propagateFrontendIdAttr(initOp, pipe.getOperation(), rewriter); return pipe.getPipe(); } diff --git a/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto b/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto new file mode 100644 index 000000000..22132bb9b --- /dev/null +++ b/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto @@ -0,0 +1,66 @@ +// RUN: ptoas --pto-arch=a3 %s 2>&1 | FileCheck %s + +// Verify that `local_slot_num` is accepted on the gm_slot_tensor (address-based) +// form of pto.aic_initialize_pipe / pto.aiv_initialize_pipe and that it flows +// through to the lowered TPipe<..., LocalSlotNum, ...> template instantiation. +// +// Without an override, ptoas keeps the current default of LocalSlotNum=SlotNum. +// With local_slot_num=2, the lowered TPipe must use 2, which matches the +// `LocalSlotNum=2` template default in +// `include/pto/npu/a2a3/TPush.hpp` and the manual flash-attention kernel in +// `kernels/manual/common/flash_atten/fa_performance_kernel.cpp`.
+ +module { + func.func @cube_kernel( + %gm_slot_buffer : !pto.ptr, + %src : !pto.tile_buf) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %gm_slots = pto.make_tensor_view %gm_slot_buffer, + shape = [%c16, %c16], strides = [%c16, %c1] + : !pto.tensor_view<16x16xf32> + pto.aic_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2} + (gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>) + + %entry = pto.talloc_to_aiv {id = 0, split = 0} + -> !pto.tensor_view<16x16xf32> + %entry_partition = pto.partition_view %entry, + offsets = [%c0, %c0], sizes = [%c16, %c16] + : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32> + pto.tstore ins(%src : !pto.tile_buf) + outs(%entry_partition : !pto.partition_tensor_view<16x16xf32>) + pto.tpush_to_aiv(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0} + func.return + } + + func.func @vector_kernel( + %gm_slot_buffer : !pto.ptr, + %dst : !pto.tile_buf) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %gm_slots = pto.make_tensor_view %gm_slot_buffer, + shape = [%c16, %c16], strides = [%c16, %c1] + : !pto.tensor_view<16x16xf32> + pto.aiv_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2} + (gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>) + + %entry = pto.tpop_from_aic {id = 0, split = 0} + -> !pto.tensor_view<16x16xf32> + %entry_partition = pto.partition_view %entry, + offsets = [%c0, %c0], sizes = [%c16, %c16] + : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32> + pto.tload ins(%entry_partition : !pto.partition_tensor_view<16x16xf32>) + outs(%dst : !pto.tile_buf) + pto.tfree_from_aic(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0} + func.return + } +} + +// CHECK-LABEL: AICORE void cube_kernel +// CHECK: 
TPipe<0, Direction::DIR_C2V, 1024, 8, 2, +// CHECK-LABEL: AICORE void vector_kernel +// CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 2,