Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion lib/PTO/Transforms/InsertSync/SyncEventIdAllocation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,10 @@ void SyncEventIdAllocation::UpdateBackwardMatchSync(
syncFront->reallocatedLoopHeadTailSync = true;
syncEnd->reallocatedLoopHeadTailSync = true;
syncIR_[ptr->beginId]->pipeBefore.push_back(syncFront.get());
syncIR_[ptr->endId]->pipeAfter.push_back(syncEnd.get());
// Insert the synthetic tail wait ahead of existing loop-end sets so the
// loop tail anchor does not emit a new set before consuming the carried
// event of the previous iteration.
syncIR_[ptr->endId]->pipeAfter.push_front(syncEnd.get());
} else {
syncFront->SetSyncIRIndex(0);
syncEnd->SetSyncIRIndex(syncIR_.size() - 1);
Expand Down
21 changes: 15 additions & 6 deletions test/lit/pto/issue428_cube_sync_regression.pto
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,30 @@
// after preheat sets are emitted.
//
// CHECK-LABEL: tri_inv_block2x2_fp16(
// CHECK: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
// CHECK: set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
// CHECK-NEXT: wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
// CHECK-NEXT: for (size_t
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
// CHECK: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
// CHECK: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID6);
// CHECK-NEXT: set_flag(PIPE_M, PIPE_FIX, EVENT_ID4);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_FIX, EVENT_ID4);
// CHECK: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
// CHECK: set_flag(PIPE_FIX, PIPE_M, EVENT_ID5);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID7);
// CHECK-NEXT: wait_flag(PIPE_FIX, PIPE_M, EVENT_ID5);
// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
// CHECK-NEXT: for (size_t
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
// CHECK: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
// CHECK: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
// CHECK-NEXT: set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
// CHECK: ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);

Expand Down
20 changes: 11 additions & 9 deletions test/lit/pto/issue564_k_loop_mte1_mte2_wait_regression.pto
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,22 @@
// CHECK-NEXT: set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD2:[0-9]+]]);
// CHECK-NEXT: set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD3:[0-9]+]]);
// CHECK-NEXT: for (size_t
// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD2]]);
// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD0]]);
// CHECK-NEXT: TLOAD(
// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD3]]);
// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD2]]);
// CHECK-NEXT: TLOAD(
// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD0]]);
// CHECK: pipe_barrier(PIPE_ALL);
// CHECK-NEXT: TLOAD(
// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD1]]);
// CHECK: pipe_barrier(PIPE_ALL);
// CHECK-NEXT: TLOAD(
// CHECK: set_flag(PIPE_M, PIPE_FIX, EVENT_ID[[PUSH:[0-9]+]]);
// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID[[POST:[0-9]+]]);
// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD0]]);
// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD1]]);
// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD3]]);
// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD2]]);
// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD3]]);
// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD1]]);
// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD0]]);
// CHECK-NEXT: set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[EXIT0:[0-9]+]]);
// CHECK-NEXT: set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[EXIT1:[0-9]+]]);
// CHECK-NEXT: set_flag(PIPE_M, PIPE_FIX, EVENT_ID[[PUSH:[0-9]+]]);
// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID[[POST:[0-9]+]]);
// CHECK-NEXT: wait_flag(PIPE_M, PIPE_FIX, EVENT_ID[[PUSH]]);
// CHECK-NEXT: TPUSH
// CHECK: ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
Expand Down
192 changes: 192 additions & 0 deletions test/lit/pto/issue622_v_mte2_eventid_overlap_reproducer.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync %s | FileCheck %s
//
// Reproducer for issue #622:
// a mixed 256-wide / 128-wide DIR_C2V pipeline with local_slot_num = 2 can
// currently reuse the same PIPE_V -> PIPE_MTE2 event id across multiple
// outstanding sync pairs. The loop-tail synthetic wait must be emitted before
// the post-loop local set so the same-key order stays serialized:
//
// set(E0)
// wait(E0)
// ...
// wait(E0)
// set(E0)
// wait(E0)
//
// This file is intentionally kept small and readable so we can debug event-id
// lifetime reuse in isolation.
//
// CHECK-LABEL: AICORE void vector_kernel(
// CHECK: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3);
// CHECK-NEXT: wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID5);
// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4);
// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7);
// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6);
// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID5);
// CHECK-NEXT: for (size_t
// CHECK: };
// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID5);
// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6);
// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7);
// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
// CHECK-NEXT: set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
// CHECK-NEXT: TPOP<TPipe<2, Direction::DIR_C2V, 16384, 8, 2, false>,
// CHECK-NEXT: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
// CHECK-NEXT: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);

module {
// Cube-side kernel (pto.kernel_kind<cube>) of the reproducer.
// It imports the two FIFO buffers that @vector_kernel reserves ("fifo0" and
// "fifo1"), builds one l2g2l pipe on each, sets up the id = 30 pipe via
// pto.aic_initialize_pipe, and pushes a single accumulator tile into each
// l2g2l pipe.
func.func @cube_kernel(%gm0: !pto.ptr<f32>, %gm1: !pto.ptr<f32>, %gm2: !pto.ptr<f32>)
attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
%c0_i32 = arith.constant 0 : i32
// Pipe 0: slot_size = 32768, backed by %gm0 and the imported "fifo0" buffer.
%buf0 = pto.import_reserved_buffer {name = "fifo0", peer_func = @vector_kernel} -> i32
%pipe0 = pto.initialize_l2g2l_pipe {
dir_mask = 1,
slot_size = 32768,
slot_num = 8,
local_slot_num = 2
}(%gm0 : !pto.ptr<f32>, %buf0 : i32) -> !pto.pipe
// Pipe 1: slot_size = 16384, backed by %gm1 and the imported "fifo1" buffer.
%buf1 = pto.import_reserved_buffer {name = "fifo1", peer_func = @vector_kernel} -> i32
%pipe1 = pto.initialize_l2g2l_pipe {
dir_mask = 1,
slot_size = 16384,
slot_num = 8,
local_slot_num = 2
}(%gm1 : !pto.ptr<f32>, %buf1 : i32) -> !pto.pipe

// "v2c_fifo" is reserved here at a fixed base (auto = false, base = 49152) in
// the mat address space; @vector_kernel imports it by name.
%v2c_local = pto.reserve_buffer {
name = "v2c_fifo",
size = 16384,
location = #pto.address_space<mat>,
auto = false,
base = 49152
} -> i32
// Cube-side setup of pipe id = 30; the vector kernel initializes the same id
// with pto.aiv_initialize_pipe. The c2v consumer buffer operand is constant 0.
pto.aic_initialize_pipe {id = 30, dir_mask = 2, slot_size = 16384, local_slot_num = 1, nosplit = false}
(gm_slot_buffer = %gm2 : !pto.ptr<f32>,
c2v_consumer_buf = %c0_i32 : i32,
v2c_consumer_buf = %v2c_local : i32)

// Two accumulator tiles at fixed addresses 0 and 32768: 32x256 and 32x128 f32.
%c0_i64 = arith.constant 0 : i64
%c32768_i64 = arith.constant 32768 : i64
%acc256 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf<loc=acc, dtype=f32, rows=32, cols=256, v_row=32, v_col=256, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
%acc128 = pto.alloc_tile addr = %c32768_i64 : !pto.tile_buf<loc=acc, dtype=f32, rows=32, cols=128, v_row=32, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>

// One producer token for each consumer pipe is enough for this reproducer.
pto.tpush(%acc256, %pipe0 : !pto.tile_buf<loc=acc, dtype=f32, rows=32, cols=256, v_row=32, v_col=256, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.pipe) {split = 1}
pto.tpush(%acc128, %pipe1 : !pto.tile_buf<loc=acc, dtype=f32, rows=32, cols=128, v_row=32, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.pipe) {split = 1}
return
}

// Vector-side kernel (pto.kernel_kind<vector>); the CHECK lines at the top of
// this file match against the code generated for this function.
// It reserves the two FIFO buffers that @cube_kernel imports, builds the same
// two l2g2l pipes, then runs the A/B op sequence described further down inside
// an scf.for loop that contains one nested scf.for.
func.func @vector_kernel(%gm0: !pto.ptr<f32>, %gm1: !pto.ptr<f32>, %gm2: !pto.ptr<f32>)
attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
%c0_i32 = arith.constant 0 : i32
// "fifo0": size = 32768 at base = 0 in the vec address space (pipe 0).
%buf0 = pto.reserve_buffer {
name = "fifo0",
size = 32768,
location = #pto.address_space<vec>,
auto = false,
base = 0
} -> i32
%pipe0 = pto.initialize_l2g2l_pipe {
dir_mask = 1,
slot_size = 32768,
slot_num = 8,
local_slot_num = 2
}(%gm0 : !pto.ptr<f32>, %buf0 : i32) -> !pto.pipe
// "fifo1": size = 16384 at base = 32768, directly after "fifo0" (pipe 1).
%buf1 = pto.reserve_buffer {
name = "fifo1",
size = 16384,
location = #pto.address_space<vec>,
auto = false,
base = 32768
} -> i32
%pipe1 = pto.initialize_l2g2l_pipe {
dir_mask = 1,
slot_size = 16384,
slot_num = 8,
local_slot_num = 2
}(%gm1 : !pto.ptr<f32>, %buf1 : i32) -> !pto.pipe

// Vector-side setup of pipe id = 30, using the "v2c_fifo" buffer reserved by
// @cube_kernel. This is the pipe targeted by pto.tpush_to_aic below.
%v2c_import = pto.import_reserved_buffer {name = "v2c_fifo", peer_func = @cube_kernel} -> i32
pto.aiv_initialize_pipe {id = 30, dir_mask = 2, slot_size = 16384, local_slot_num = 1, nosplit = false}
(gm_slot_buffer = %gm2 : !pto.ptr<f32>,
c2v_consumer_buf = %c0_i32 : i32,
v2c_consumer_buf = %v2c_import : i32)

// Working tiles at fixed addresses:
//   %v256 / %n256 : 16x256 f32, pop destination and tneg result on the 256 path
//   %v128 / %n128 : 16x128 f32, pop destination and tneg result on the 128 path
//   %h256         : 16x256 f16, tcvt result that is pushed to the cube side
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c49152_i64 = arith.constant 49152 : i64
%c65536_i64 = arith.constant 65536 : i64
%c81920_i64 = arith.constant 81920 : i64
%c90112_i64 = arith.constant 90112 : i64
%c98304_i64 = arith.constant 98304 : i64
%v256 = pto.alloc_tile addr = %c49152_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%n256 = pto.alloc_tile addr = %c65536_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%v128 = pto.alloc_tile addr = %c81920_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%n128 = pto.alloc_tile addr = %c90112_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%h256 = pto.alloc_tile addr = %c98304_i64 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>

// Per outer iteration, keep the smallest known bad shape:
// A A A A B
// for j in [0, 1): A B
// B
// A = pop/neg/cvt/push on the 256-wide pipe.
// B = pop/neg/free on the 128-wide pipe.
// Both loops run from %c0 to %c1 with step %c1, i.e. a single trip each.
scf.for %i = %c0 to %c1 step %c1 {
// A (1 of 4): %pipe0 -> %v256 -> tneg -> %n256 -> tcvt -> %h256 -> push, then tfree.
pto.tpop(%v256, %pipe0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.pipe) {split = 1}
pto.tneg ins(%v256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%n256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tcvt ins(%n256 {rmode = #pto<round_mode CAST_RINT>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%h256 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tpush_to_aic(%h256 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {id = 30, split = 1}
pto.tfree(%pipe0 : !pto.pipe) {split = 1}

// A (2 of 4).
pto.tpop(%v256, %pipe0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.pipe) {split = 1}
pto.tneg ins(%v256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%n256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tcvt ins(%n256 {rmode = #pto<round_mode CAST_RINT>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%h256 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tpush_to_aic(%h256 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {id = 30, split = 1}
pto.tfree(%pipe0 : !pto.pipe) {split = 1}

// A (3 of 4).
pto.tpop(%v256, %pipe0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.pipe) {split = 1}
pto.tneg ins(%v256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%n256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tcvt ins(%n256 {rmode = #pto<round_mode CAST_RINT>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%h256 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tpush_to_aic(%h256 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {id = 30, split = 1}
pto.tfree(%pipe0 : !pto.pipe) {split = 1}

// A (4 of 4).
pto.tpop(%v256, %pipe0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.pipe) {split = 1}
pto.tneg ins(%v256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%n256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tcvt ins(%n256 {rmode = #pto<round_mode CAST_RINT>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%h256 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tpush_to_aic(%h256 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {id = 30, split = 1}
pto.tfree(%pipe0 : !pto.pipe) {split = 1}

// B before the inner loop: %pipe1 -> %v128 -> tneg -> %n128, then tfree.
pto.tpop(%v128, %pipe1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.pipe) {split = 1}
pto.tneg ins(%v128 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%n128 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tfree(%pipe1 : !pto.pipe) {split = 1}

// Inner loop: one A followed by one B.
scf.for %j = %c0 to %c1 step %c1 {
pto.tpop(%v256, %pipe0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.pipe) {split = 1}
pto.tneg ins(%v256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%n256 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tcvt ins(%n256 {rmode = #pto<round_mode CAST_RINT>} : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%h256 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tpush_to_aic(%h256 : !pto.tile_buf<loc=vec, dtype=f16, rows=16, cols=256, v_row=16, v_col=256, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {id = 30, split = 1}
pto.tfree(%pipe0 : !pto.pipe) {split = 1}

pto.tpop(%v128, %pipe1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.pipe) {split = 1}
pto.tneg ins(%v128 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%n128 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tfree(%pipe1 : !pto.pipe) {split = 1}
}

// Trailing B after the inner loop.
pto.tpop(%v128, %pipe1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.pipe) {split = 1}
pto.tneg ins(%v128 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%n128 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tfree(%pipe1 : !pto.pipe) {split = 1}
}
return
}

// Entry function (carries the pto.entry attribute). It issues pto.set_ffts on
// %ffts, then calls @cube_kernel followed by @vector_kernel, forwarding the
// same three GM pointers to both.
func.func @call_both(%ffts: memref<256xi64>, %gm0: !pto.ptr<f32>, %gm1: !pto.ptr<f32>, %gm2: !pto.ptr<f32>)
attributes {pto.entry} {
pto.set_ffts %ffts : memref<256xi64>
call @cube_kernel(%gm0, %gm1, %gm2) : (!pto.ptr<f32>, !pto.ptr<f32>, !pto.ptr<f32>) -> ()
call @vector_kernel(%gm0, %gm1, %gm2) : (!pto.ptr<f32>, !pto.ptr<f32>, !pto.ptr<f32>) -> ()
return
}
}
Loading