From db22e9df0b4785771a04f0cd5355d85bc7170bca Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 8 May 2026 15:24:35 +0800 Subject: [PATCH 1/2] Fix loop-tail synthetic wait ordering --- .../InsertSync/SyncEventIdAllocation.cpp | 5 +- ...e622_v_mte2_eventid_overlap_reproducer.pto | 180 ++++++++++++++++++ 2 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 test/lit/pto/issue622_v_mte2_eventid_overlap_reproducer.pto diff --git a/lib/PTO/Transforms/InsertSync/SyncEventIdAllocation.cpp b/lib/PTO/Transforms/InsertSync/SyncEventIdAllocation.cpp index d937b468b..ceb457ca8 100644 --- a/lib/PTO/Transforms/InsertSync/SyncEventIdAllocation.cpp +++ b/lib/PTO/Transforms/InsertSync/SyncEventIdAllocation.cpp @@ -396,7 +396,10 @@ void SyncEventIdAllocation::UpdateBackwardMatchSync( syncFront->reallocatedLoopHeadTailSync = true; syncEnd->reallocatedLoopHeadTailSync = true; syncIR_[ptr->beginId]->pipeBefore.push_back(syncFront.get()); - syncIR_[ptr->endId]->pipeAfter.push_back(syncEnd.get()); + // Insert the synthetic tail wait ahead of existing loop-end sets so the + // loop tail anchor does not emit a new set before consuming the carried + // event of the previous iteration. + syncIR_[ptr->endId]->pipeAfter.push_front(syncEnd.get()); } else { syncFront->SetSyncIRIndex(0); syncEnd->SetSyncIRIndex(syncIR_.size() - 1); diff --git a/test/lit/pto/issue622_v_mte2_eventid_overlap_reproducer.pto b/test/lit/pto/issue622_v_mte2_eventid_overlap_reproducer.pto new file mode 100644 index 000000000..a33dd7324 --- /dev/null +++ b/test/lit/pto/issue622_v_mte2_eventid_overlap_reproducer.pto @@ -0,0 +1,180 @@ +// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync %s | FileCheck %s +// +// Reproducer for issue #622: +// a mixed 256-wide / 128-wide DIR_C2V pipeline with local_slot_num = 2 can +// currently reuse the same PIPE_V -> PIPE_MTE2 event id across multiple +// outstanding sync pairs. 
The loop-tail synthetic wait must be emitted before +// the post-loop local set so the same-key order stays serialized: +// +// set(E0) +// wait(E0) +// ... +// wait(E0) +// set(E0) +// wait(E0) +// +// This file is intentionally kept small and readable so we can debug event-id +// lifetime reuse in isolation. +// +// CHECK-LABEL: AICORE void vector_kernel( +// CHECK: TNEG( +// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[E0:[0-9]+]]); +// CHECK: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[E0]]); +// CHECK: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[PRE:[0-9]+]]); +// CHECK: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[E0]]); +// CHECK: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[E0]]); +// CHECK: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[E0]]); + +module { + func.func @cube_kernel(%gm0: !pto.ptr, %gm1: !pto.ptr, %gm2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i32 = arith.constant 0 : i32 + %buf0 = pto.import_reserved_buffer {name = "fifo0", peer_func = @vector_kernel} -> i32 + %pipe0 = pto.initialize_l2g2l_pipe { + dir_mask = 1, + slot_size = 32768, + slot_num = 8, + local_slot_num = 2 + }(%gm0 : !pto.ptr, %buf0 : i32) -> !pto.pipe + %buf1 = pto.import_reserved_buffer {name = "fifo1", peer_func = @vector_kernel} -> i32 + %pipe1 = pto.initialize_l2g2l_pipe { + dir_mask = 1, + slot_size = 16384, + slot_num = 8, + local_slot_num = 2 + }(%gm1 : !pto.ptr, %buf1 : i32) -> !pto.pipe + + %v2c_local = pto.reserve_buffer { + name = "v2c_fifo", + size = 16384, + location = #pto.address_space, + auto = false, + base = 49152 + } -> i32 + pto.aic_initialize_pipe {id = 30, dir_mask = 2, slot_size = 16384, local_slot_num = 1, nosplit = false} + (gm_slot_buffer = %gm2 : !pto.ptr, + c2v_consumer_buf = %c0_i32 : i32, + v2c_consumer_buf = %v2c_local : i32) + + %c0_i64 = arith.constant 0 : i64 + %c32768_i64 = arith.constant 32768 : i64 + %acc256 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %acc128 = pto.alloc_tile addr = %c32768_i64 : !pto.tile_buf + + // One producer 
token for each consumer pipe is enough for this reproducer. + pto.tpush(%acc256, %pipe0 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tpush(%acc128, %pipe1 : !pto.tile_buf, !pto.pipe) {split = 1} + return + } + + func.func @vector_kernel(%gm0: !pto.ptr, %gm1: !pto.ptr, %gm2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i32 = arith.constant 0 : i32 + %buf0 = pto.reserve_buffer { + name = "fifo0", + size = 32768, + location = #pto.address_space, + auto = false, + base = 0 + } -> i32 + %pipe0 = pto.initialize_l2g2l_pipe { + dir_mask = 1, + slot_size = 32768, + slot_num = 8, + local_slot_num = 2 + }(%gm0 : !pto.ptr, %buf0 : i32) -> !pto.pipe + %buf1 = pto.reserve_buffer { + name = "fifo1", + size = 16384, + location = #pto.address_space, + auto = false, + base = 32768 + } -> i32 + %pipe1 = pto.initialize_l2g2l_pipe { + dir_mask = 1, + slot_size = 16384, + slot_num = 8, + local_slot_num = 2 + }(%gm1 : !pto.ptr, %buf1 : i32) -> !pto.pipe + + %v2c_import = pto.import_reserved_buffer {name = "v2c_fifo", peer_func = @cube_kernel} -> i32 + pto.aiv_initialize_pipe {id = 30, dir_mask = 2, slot_size = 16384, local_slot_num = 1, nosplit = false} + (gm_slot_buffer = %gm2 : !pto.ptr, + c2v_consumer_buf = %c0_i32 : i32, + v2c_consumer_buf = %v2c_import : i32) + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c49152_i64 = arith.constant 49152 : i64 + %c65536_i64 = arith.constant 65536 : i64 + %c81920_i64 = arith.constant 81920 : i64 + %c90112_i64 = arith.constant 90112 : i64 + %c98304_i64 = arith.constant 98304 : i64 + %v256 = pto.alloc_tile addr = %c49152_i64 : !pto.tile_buf + %n256 = pto.alloc_tile addr = %c65536_i64 : !pto.tile_buf + %v128 = pto.alloc_tile addr = %c81920_i64 : !pto.tile_buf + %n128 = pto.alloc_tile addr = %c90112_i64 : !pto.tile_buf + %h256 = pto.alloc_tile addr = %c98304_i64 : !pto.tile_buf + + // Per outer iteration, keep the smallest known bad shape: + // A A A A B + // for j in [0, 1): A B + // B + // A = 
pop/neg/cvt/push on the 256-wide pipe. + // B = pop/neg/free on the 128-wide pipe. + scf.for %i = %c0 to %c1 step %c1 { + pto.tpop(%v256, %pipe0 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tneg ins(%v256 : !pto.tile_buf) outs(%n256 : !pto.tile_buf) + pto.tcvt ins(%n256 {rmode = #pto} : !pto.tile_buf) outs(%h256 : !pto.tile_buf) + pto.tpush_to_aic(%h256 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%pipe0 : !pto.pipe) {split = 1} + + pto.tpop(%v256, %pipe0 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tneg ins(%v256 : !pto.tile_buf) outs(%n256 : !pto.tile_buf) + pto.tcvt ins(%n256 {rmode = #pto} : !pto.tile_buf) outs(%h256 : !pto.tile_buf) + pto.tpush_to_aic(%h256 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%pipe0 : !pto.pipe) {split = 1} + + pto.tpop(%v256, %pipe0 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tneg ins(%v256 : !pto.tile_buf) outs(%n256 : !pto.tile_buf) + pto.tcvt ins(%n256 {rmode = #pto} : !pto.tile_buf) outs(%h256 : !pto.tile_buf) + pto.tpush_to_aic(%h256 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%pipe0 : !pto.pipe) {split = 1} + + pto.tpop(%v256, %pipe0 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tneg ins(%v256 : !pto.tile_buf) outs(%n256 : !pto.tile_buf) + pto.tcvt ins(%n256 {rmode = #pto} : !pto.tile_buf) outs(%h256 : !pto.tile_buf) + pto.tpush_to_aic(%h256 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%pipe0 : !pto.pipe) {split = 1} + + pto.tpop(%v128, %pipe1 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tneg ins(%v128 : !pto.tile_buf) outs(%n128 : !pto.tile_buf) + pto.tfree(%pipe1 : !pto.pipe) {split = 1} + + scf.for %j = %c0 to %c1 step %c1 { + pto.tpop(%v256, %pipe0 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tneg ins(%v256 : !pto.tile_buf) outs(%n256 : !pto.tile_buf) + pto.tcvt ins(%n256 {rmode = #pto} : !pto.tile_buf) outs(%h256 : !pto.tile_buf) + pto.tpush_to_aic(%h256 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%pipe0 : !pto.pipe) {split = 1} + + pto.tpop(%v128, %pipe1 : !pto.tile_buf, !pto.pipe) {split 
= 1} + pto.tneg ins(%v128 : !pto.tile_buf) outs(%n128 : !pto.tile_buf) + pto.tfree(%pipe1 : !pto.pipe) {split = 1} + } + + pto.tpop(%v128, %pipe1 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tneg ins(%v128 : !pto.tile_buf) outs(%n128 : !pto.tile_buf) + pto.tfree(%pipe1 : !pto.pipe) {split = 1} + } + return + } + + func.func @call_both(%ffts: memref<256xi64>, %gm0: !pto.ptr, %gm1: !pto.ptr, %gm2: !pto.ptr) + attributes {pto.entry} { + pto.set_ffts %ffts : memref<256xi64> + call @cube_kernel(%gm0, %gm1, %gm2) : (!pto.ptr, !pto.ptr, !pto.ptr) -> () + call @vector_kernel(%gm0, %gm1, %gm2) : (!pto.ptr, !pto.ptr, !pto.ptr) -> () + return + } +} From d0ee3a47b8d7332a400e34c9d25bf23385db305e Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 8 May 2026 15:52:28 +0800 Subject: [PATCH 2/2] Update lit checks for sync ordering fix --- .../lit/pto/issue428_cube_sync_regression.pto | 21 ++++++++++----- ...ue564_k_loop_mte1_mte2_wait_regression.pto | 20 +++++++------- ...e622_v_mte2_eventid_overlap_reproducer.pto | 26 ++++++++++++++----- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/test/lit/pto/issue428_cube_sync_regression.pto b/test/lit/pto/issue428_cube_sync_regression.pto index aca062f3b..a081e9a60 100644 --- a/test/lit/pto/issue428_cube_sync_regression.pto +++ b/test/lit/pto/issue428_cube_sync_regression.pto @@ -10,21 +10,30 @@ // after preheat sets are emitted. 
// // CHECK-LABEL: tri_inv_block2x2_fp16( -// CHECK: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); +// CHECK: set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); +// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); +// CHECK-NEXT: wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); +// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); // CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); // CHECK-NEXT: for (size_t // CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); // CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); -// CHECK: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); -// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); +// CHECK: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); +// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); +// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID6); +// CHECK-NEXT: set_flag(PIPE_M, PIPE_FIX, EVENT_ID4); // CHECK-NEXT: wait_flag(PIPE_M, PIPE_FIX, EVENT_ID4); -// CHECK: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); +// CHECK: set_flag(PIPE_FIX, PIPE_M, EVENT_ID5); +// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID7); +// CHECK-NEXT: wait_flag(PIPE_FIX, PIPE_M, EVENT_ID5); +// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); // CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); // CHECK-NEXT: for (size_t // CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); // CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); -// CHECK: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); -// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); +// CHECK: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); +// CHECK-NEXT: wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); +// CHECK-NEXT: set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); // CHECK-NEXT: wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); // CHECK: ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); diff --git a/test/lit/pto/issue564_k_loop_mte1_mte2_wait_regression.pto b/test/lit/pto/issue564_k_loop_mte1_mte2_wait_regression.pto index f7367d179..bd3b9600a 100644 --- a/test/lit/pto/issue564_k_loop_mte1_mte2_wait_regression.pto +++ 
b/test/lit/pto/issue564_k_loop_mte1_mte2_wait_regression.pto @@ -16,20 +16,22 @@ // CHECK-NEXT: set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD2:[0-9]+]]); // CHECK-NEXT: set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD3:[0-9]+]]); // CHECK-NEXT: for (size_t -// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD2]]); +// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD0]]); // CHECK-NEXT: TLOAD( -// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD3]]); +// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD2]]); // CHECK-NEXT: TLOAD( -// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD0]]); +// CHECK: pipe_barrier(PIPE_ALL); // CHECK-NEXT: TLOAD( -// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD1]]); +// CHECK: pipe_barrier(PIPE_ALL); // CHECK-NEXT: TLOAD( -// CHECK: set_flag(PIPE_M, PIPE_FIX, EVENT_ID[[PUSH:[0-9]+]]); -// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID[[POST:[0-9]+]]); -// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD0]]); -// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD1]]); +// CHECK: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD3]]); // CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD2]]); -// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD3]]); +// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD1]]); +// CHECK-NEXT: wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[LOAD0]]); +// CHECK-NEXT: set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[EXIT0:[0-9]+]]); +// CHECK-NEXT: set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID[[EXIT1:[0-9]+]]); +// CHECK-NEXT: set_flag(PIPE_M, PIPE_FIX, EVENT_ID[[PUSH:[0-9]+]]); +// CHECK-NEXT: set_flag(PIPE_M, PIPE_MTE1, EVENT_ID[[POST:[0-9]+]]); // CHECK-NEXT: wait_flag(PIPE_M, PIPE_FIX, EVENT_ID[[PUSH]]); // CHECK-NEXT: TPUSH // CHECK: ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); diff --git a/test/lit/pto/issue622_v_mte2_eventid_overlap_reproducer.pto b/test/lit/pto/issue622_v_mte2_eventid_overlap_reproducer.pto index a33dd7324..4f4ffe3c9 100644 
--- a/test/lit/pto/issue622_v_mte2_eventid_overlap_reproducer.pto +++ b/test/lit/pto/issue622_v_mte2_eventid_overlap_reproducer.pto @@ -17,13 +17,25 @@ // lifetime reuse in isolation. // // CHECK-LABEL: AICORE void vector_kernel( -// CHECK: TNEG( -// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[E0:[0-9]+]]); -// CHECK: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[E0]]); -// CHECK: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[PRE:[0-9]+]]); -// CHECK: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[E0]]); -// CHECK: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[E0]]); -// CHECK: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID[[E0]]); +// CHECK: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); +// CHECK-NEXT: wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID5); +// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); +// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); +// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); +// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); +// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); +// CHECK-NEXT: for (size_t +// CHECK: }; +// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); +// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); +// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); +// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); +// CHECK-NEXT: set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); +// CHECK-NEXT: set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); +// CHECK-NEXT: wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); +// CHECK-NEXT: TPOP, +// CHECK-NEXT: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); +// CHECK-NEXT: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); module { func.func @cube_kernel(%gm0: !pto.ptr, %gm1: !pto.ptr, %gm2: !pto.ptr)