-
Notifications
You must be signed in to change notification settings - Fork 49
feat(frontend): allow local_slot_num on gm_slot_tensor pipe init #650
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10677,9 +10677,17 @@ static LogicalResult verifyFrontendInitCommon(InitOpT op, | |
| "globaltensor pipe init expects only 'gm_slot_tensor' and no " | ||
| "'gm_slot_buffer', 'c2v_consumer_buf', or 'v2c_consumer_buf'"); | ||
| } | ||
| if (op.getLocalSlotNumAttr()) | ||
| return op.emitOpError( | ||
| "globaltensor pipe init does not use 'local_slot_num'"); | ||
| if (auto localSlotNumAttr = op.getLocalSlotNumAttr()) { | ||
| int32_t localSlotNum = localSlotNumAttr.getInt(); | ||
| if (localSlotNum <= 0) | ||
| return op.emitOpError("expects 'local_slot_num' to be greater than 0"); | ||
| int32_t loweredSlotNum = dirMask == 3 ? 4 : 8; | ||
| if (localSlotNum > loweredSlotNum) { | ||
| return op.emitOpError() | ||
| << "expects 'local_slot_num' to be less than or equal to " | ||
| << loweredSlotNum << " for dir_mask = " << static_cast<int>(dirMask); | ||
| } | ||
| } | ||
| if (getTargetArch(op.getOperation()) == PTOArch::A5) { | ||
| return op.emitOpError( | ||
| "globaltensor pipe entries are supported for a2/a3 l2g2l pipes"); | ||
|
|
@@ -11437,12 +11445,24 @@ LogicalResult InitializeL2G2LPipeOp::verify() { | |
| : std::nullopt))) | ||
| return failure(); | ||
|
|
||
| bool hasGmSlotTensor = | ||
| getGmAddr() && isa<TensorViewType>(getGmAddr().getType()); | ||
|
|
||
| if (!getLocalAddr()) { | ||
| if (getPeerLocalAddr()) | ||
| return emitOpError("'peer_local_addr' requires 'local_addr'"); | ||
| if (getLocalSlotNumAttr()) | ||
| if (getLocalSlotNumAttr() && !hasGmSlotTensor) | ||
| return emitOpError( | ||
| "'local_slot_num' is only allowed when 'local_addr' is present"); | ||
| "'local_slot_num' is only allowed when 'local_addr' is present " | ||
| "or the pipe init uses a globaltensor slot"); | ||
| if (auto localSlotNumAttr = getLocalSlotNumAttr()) { | ||
| int32_t localSlotNum = localSlotNumAttr.getInt(); | ||
| if (localSlotNum <= 0) | ||
| return emitOpError("expects 'local_slot_num' to be greater than 0"); | ||
| if (static_cast<uint32_t>(localSlotNum) > getSlotNum()) | ||
| return emitOpError( | ||
| "expects 'local_slot_num' to be less than or equal to slot_num"); | ||
| } | ||
|
Comment on lines
+11458
to
+11465
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The bounds check for |
||
| return success(); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| // RUN: ptoas --pto-arch=a3 %s 2>&1 | FileCheck %s | ||
|
|
||
| // Verify that `local_slot_num` is accepted on the gm_slot_tensor (address-based) | ||
| // form of pto.aic_initialize_pipe / pto.aiv_initialize_pipe and that it flows | ||
| // through to the lowered TPipe<..., LocalSlotNum, ...> template instantiation. | ||
| // | ||
| // Without an override, ptoas keeps the current default of LocalSlotNum=SlotNum. | ||
| // With local_slot_num=2, the lowered TPipe must use 2, which matches the | ||
| // `LocalSlotNum=2` template default in | ||
| // `include/pto/npu/a2a3/TPush.hpp` and the manual flash-attention kernel in | ||
| // `kernels/manual/common/flash_atten/fa_performance_kernel.cpp`. | ||
|
|
||
| module { | ||
| func.func @cube_kernel( | ||
| %gm_slot_buffer : !pto.ptr<f32>, | ||
| %src : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>) | ||
| attributes {pto.kernel_kind = #pto.kernel_kind<cube>} { | ||
| %c0 = arith.constant 0 : index | ||
| %c1 = arith.constant 1 : index | ||
| %c16 = arith.constant 16 : index | ||
| %gm_slots = pto.make_tensor_view %gm_slot_buffer, | ||
| shape = [%c16, %c16], strides = [%c16, %c1] | ||
| : !pto.tensor_view<16x16xf32> | ||
| pto.aic_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2} | ||
| (gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>) | ||
|
|
||
| %entry = pto.talloc_to_aiv {id = 0, split = 0} | ||
| -> !pto.tensor_view<16x16xf32> | ||
| %entry_partition = pto.partition_view %entry, | ||
| offsets = [%c0, %c0], sizes = [%c16, %c16] | ||
| : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32> | ||
| pto.tstore ins(%src : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>) | ||
| outs(%entry_partition : !pto.partition_tensor_view<16x16xf32>) | ||
| pto.tpush_to_aiv(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0} | ||
| func.return | ||
| } | ||
|
|
||
| func.func @vector_kernel( | ||
| %gm_slot_buffer : !pto.ptr<f32>, | ||
| %dst : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>) | ||
| attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { | ||
| %c0 = arith.constant 0 : index | ||
| %c1 = arith.constant 1 : index | ||
| %c16 = arith.constant 16 : index | ||
| %gm_slots = pto.make_tensor_view %gm_slot_buffer, | ||
| shape = [%c16, %c16], strides = [%c16, %c1] | ||
| : !pto.tensor_view<16x16xf32> | ||
| pto.aiv_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2} | ||
| (gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>) | ||
|
|
||
| %entry = pto.tpop_from_aic {id = 0, split = 0} | ||
| -> !pto.tensor_view<16x16xf32> | ||
| %entry_partition = pto.partition_view %entry, | ||
| offsets = [%c0, %c0], sizes = [%c16, %c16] | ||
| : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32> | ||
| pto.tload ins(%entry_partition : !pto.partition_tensor_view<16x16xf32>) | ||
| outs(%dst : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>) | ||
| pto.tfree_from_aic(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0} | ||
| func.return | ||
| } | ||
| } | ||
|
|
||
| // CHECK-LABEL: AICORE void cube_kernel | ||
| // CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 2, | ||
| // CHECK-LABEL: AICORE void vector_kernel | ||
| // CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 2, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The validation logic for
`local_slot_num` (including the bounds check and the calculation of `loweredSlotNum`) is duplicated between the `hasGlobalSlotTensor` branch and the local pipe branch (lines 10709-10719). This logic should be refactored into a common block at the beginning of the function to improve maintainability and ensure consistency.