From 6e729da58705cbb64fa1eb58692bc0fce6f5a601 Mon Sep 17 00:00:00 2001 From: zhangstevenunity <128771452+zhangstevenunity@users.noreply.github.com> Date: Thu, 30 Apr 2026 15:17:29 +0800 Subject: [PATCH 1/5] Add FA PTO lit regression cases --- test/lit/pto/fa.pto | 501 ++++++++++++++++++++++++++ test/lit/pto/fa_perf.pto | 743 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 1244 insertions(+) create mode 100644 test/lit/pto/fa.pto create mode 100644 test/lit/pto/fa_perf.pto diff --git a/test/lit/pto/fa.pto b/test/lit/pto/fa.pto new file mode 100644 index 000000000..a4190459c --- /dev/null +++ b/test/lit/pto/fa.pto @@ -0,0 +1,501 @@ +// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync %s >/dev/null + +module { + func.func @cube_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c128_0 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c4096 = arith.constant 4096 : index + %c128_1 = arith.constant 128 : index + %c16 = arith.constant 16 : index + %c16_2 = arith.constant 16 : index + %0 = pto.get_block_num + %1 = arith.index_cast %0 : i64 to index + %2 = pto.get_block_idx + %3 = arith.index_cast %2 : i64 to index + %4 = arith.divsi %c16_2, %1 : index + %5 = arith.remsi %c16_2, %1 : index + %6 = arith.addi %4, %c1 : index + %7 = arith.muli %3, %6 : index + %8 = arith.addi %4, %c1 : index + %9 = arith.muli %5, %8 : index + %10 = arith.subi %3, %5 : index + %11 = arith.muli %10, %4 : index + %12 = arith.addi %9, %11 : index + %13 = arith.cmpi slt, %3, %5 : index + %14 = arith.select %13, %7, %12 : index + %15 = arith.cmpi slt, %3, %5 : index + %16 = arith.addi %4, %c1 : index + %17 = arith.select %15, %16, %4 : index + %18 = arith.addi %14, %17 : index + %c524288 = arith.constant 524288 : index + %19 = arith.muli %3, %c524288 : index + %20 = pto.addptr 
%arg0, %19 : -> + %c0_3 = arith.constant 0 : index + %21 = pto.addptr %20, %c0_3 : -> + %c262144 = arith.constant 262144 : index + %22 = pto.addptr %20, %c262144 : -> + %c393216 = arith.constant 393216 : index + %23 = pto.addptr %20, %c393216 : -> + %qk_slot_desc = pto.make_tensor_view %21, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf32> + pto.aic_initialize_pipe{id = 25, dir_mask = 1, slot_size = 131072} (gm_slot_tensor = %qk_slot_desc : !pto.tensor_view<128x256xf32>) + %pv_slot_desc = pto.make_tensor_view %22, shape = [%c128, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view<128x128xf32> + pto.aic_initialize_pipe{id = 27, dir_mask = 1, slot_size = 65536} (gm_slot_tensor = %pv_slot_desc : !pto.tensor_view<128x128xf32>) + %28 = pto.reserve_buffer{name = "fa_p_v2c_fifo", size = 524288, location = , auto = false, base = 393216} -> i32 + %c0_i32 = arith.constant 0 : i32 + pto.aic_initialize_pipe{id = 30, dir_mask = 2, slot_size = 65536, nosplit = false} (gm_slot_buffer = %23 : !pto.ptr, c2v_consumer_buf = %c0_i32 : i32, v2c_consumer_buf = %28 : i32) + %c0_i64 = arith.constant 0 : i64 + %c0_i64_4 = arith.constant 0 : i64 + %29 = pto.alloc_tile addr = %c0_i64_4 : !pto.tile_buf + %c0_i64_5 = arith.constant 0 : i64 + %30 = pto.alloc_tile addr = %c0_i64_5 : !pto.tile_buf + %c32768_i64 = arith.constant 32768 : i64 + %31 = pto.alloc_tile addr = %c32768_i64 : !pto.tile_buf + %c65536_i64 = arith.constant 65536 : i64 + %32 = pto.alloc_tile addr = %c65536_i64 : !pto.tile_buf + %33 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %c0_i64_6 = arith.constant 0 : i64 + %34 = pto.alloc_tile addr = %c0_i64_6 : !pto.tile_buf + %c98304_i64 = arith.constant 98304 : i64 + %35 = pto.alloc_tile addr = %c98304_i64 : !pto.tile_buf + %c32768_i64_7 = arith.constant 32768 : i64 + %36 = pto.alloc_tile addr = %c32768_i64_7 : !pto.tile_buf + %c163840_i64 = arith.constant 163840 : i64 + %37 = pto.alloc_tile addr = %c163840_i64 : !pto.tile_buf + %38 = 
pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %c131072_i64 = arith.constant 131072 : i64 + %39 = pto.alloc_tile addr = %c131072_i64 : !pto.tile_buf + %c2048 = arith.constant 2048 : index + %40 = pto.make_tensor_view %arg1, shape = [%c2048, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view + %41 = pto.make_tensor_view %arg2, shape = [%c128_0, %c4096], strides = [%c1, %c128_0] : !pto.tensor_view + %42 = pto.make_tensor_view %arg3, shape = [%c4096, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view + scf.for %arg4 = %14 to %18 step %c1 { + %43 = arith.muli %arg4, %c128 : index + %44 = pto.partition_view %40, offsets = [%43, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%44 : !pto.partition_tensor_view<128x128xf16>) outs(%29 : !pto.tile_buf) + pto.tmov ins(%29 : !pto.tile_buf) outs(%30 : !pto.tile_buf) + %c0_8 = arith.constant 0 : index + %c0_9 = arith.constant 0 : index + %45 = arith.addi %c0_8, %c0_9 : index + %46 = pto.partition_view %41, offsets = [%c0, %45], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%46 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_10 = arith.constant 0 : index + %47 = pto.subview %34[%c0, %c0_10] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%47 : !pto.tile_buf) + %c128_11 = arith.constant 128 : index + %48 = arith.addi %c0_8, %c128_11 : index + %49 = pto.partition_view %41, offsets = [%c0, %48], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%49 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_12 = arith.constant 128 : index + %50 = pto.subview %34[%c0, %c128_12] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf 
+ pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%50 : !pto.tile_buf) + %qk_push_0 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_0 = pto.partition_view %qk_push_0, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_0 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_0 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %c256_13 = arith.constant 256 : index + %c0_14 = arith.constant 0 : index + %51 = arith.addi %c256_13, %c0_14 : index + %52 = pto.partition_view %41, offsets = [%c0, %51], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%52 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_15 = arith.constant 0 : index + %53 = pto.subview %34[%c0, %c0_15] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%53 : !pto.tile_buf) + %c128_16 = arith.constant 128 : index + %54 = arith.addi %c256_13, %c128_16 : index + %55 = pto.partition_view %41, offsets = [%c0, %54], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%55 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_17 = arith.constant 128 : index + %56 = pto.subview %34[%c0, %c128_17] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%56 : !pto.tile_buf) + %qk_push_1 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_1 = pto.partition_view %qk_push_1, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> 
!pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_1 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_1 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %57 = pto.partition_view %42, offsets = [%c0, %c0], sizes = [%c256, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<256x128xf16> + pto.tload ins(%57 : !pto.partition_tensor_view<256x128xf16>) outs(%37 : !pto.tile_buf) + %c2 = arith.constant 2 : index + %c7 = arith.constant 7 : index + scf.for %arg5 = %c0 to %c7 step %c1 { + %61 = arith.muli %arg5, %c2 : index + %c2_18 = arith.constant 2 : index + %62 = arith.addi %61, %c2_18 : index + %63 = arith.muli %62, %c256 : index + %64 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%64 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv{id = 30, split = 1} + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + %65 = arith.addi %61, %c1 : index + %66 = arith.muli %65, %c256 : index + %67 = pto.partition_view %42, offsets = [%66, %c0], sizes = [%c256, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<256x128xf16> + pto.tload ins(%67 : !pto.partition_tensor_view<256x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_0 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_part_0 = pto.partition_view %pv_push_0, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_0 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_0 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %c0_19 = arith.constant 0 : index + %68 = arith.addi %63, %c0_19 : index + %69 = pto.partition_view %41, offsets = [%c0, %68], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + 
pto.tload ins(%69 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_20 = arith.constant 0 : index + %70 = pto.subview %34[%c0, %c0_20] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%70 : !pto.tile_buf) + %c128_21 = arith.constant 128 : index + %71 = arith.addi %63, %c128_21 : index + %72 = pto.partition_view %41, offsets = [%c0, %71], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%72 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_22 = arith.constant 128 : index + %73 = pto.subview %34[%c0, %c128_22] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%73 : !pto.tile_buf) + %qk_push_2 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_2 = pto.partition_view %qk_push_2, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_2 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_2 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %74 = arith.muli %arg5, %c2 : index + %75 = arith.addi %74, %c1 : index + %c2_23 = arith.constant 2 : index + %76 = arith.addi %75, %c2_23 : index + %77 = arith.muli %76, %c256 : index + %78 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%78 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv{id = 30, split = 1} + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + %79 = arith.addi %75, %c1 : index + %80 = arith.muli %79, %c256 : index + %81 = pto.partition_view %42, offsets = [%80, %c0], sizes = [%c256, %c128_0] : !pto.tensor_view -> 
!pto.partition_tensor_view<256x128xf16> + pto.tload ins(%81 : !pto.partition_tensor_view<256x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_1 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_part_1 = pto.partition_view %pv_push_1, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_1 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_1 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %c0_24 = arith.constant 0 : index + %82 = arith.addi %77, %c0_24 : index + %83 = pto.partition_view %41, offsets = [%c0, %82], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%83 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_25 = arith.constant 0 : index + %84 = pto.subview %34[%c0, %c0_25] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%84 : !pto.tile_buf) + %c128_26 = arith.constant 128 : index + %85 = arith.addi %77, %c128_26 : index + %86 = pto.partition_view %41, offsets = [%c0, %85], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%86 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_27 = arith.constant 128 : index + %87 = pto.subview %34[%c0, %c128_27] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%87 : !pto.tile_buf) + %qk_push_3 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_3 = pto.partition_view %qk_push_3, offsets = [%c0, %c0], sizes = 
[%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_3 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_3 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + } + %58 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%58 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv{id = 30, split = 1} + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + %c3840 = arith.constant 3840 : index + %59 = pto.partition_view %42, offsets = [%c3840, %c0], sizes = [%c256, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<256x128xf16> + pto.tload ins(%59 : !pto.partition_tensor_view<256x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_2 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_part_2 = pto.partition_view %pv_push_2, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_2 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_2 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %60 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%60 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv{id = 30, split = 1} + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_3 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_part_3 = pto.partition_view %pv_push_3, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_3 : !pto.partition_tensor_view<128x128xf32>) + 
pto.tpush_to_aiv(%pv_push_3 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + } + return + } + func.func @vector_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c256 = arith.constant 256 : index + %c128_0 = arith.constant 128 : index + %c16 = arith.constant 16 : index + %c16_1 = arith.constant 16 : index + %0 = pto.get_block_num + %1 = arith.index_cast %0 : i64 to index + %2 = pto.get_block_idx + %3 = arith.index_cast %2 : i64 to index + %4 = arith.divsi %c16_1, %1 : index + %5 = arith.remsi %c16_1, %1 : index + %6 = arith.addi %4, %c1 : index + %7 = arith.muli %3, %6 : index + %8 = arith.addi %4, %c1 : index + %9 = arith.muli %5, %8 : index + %10 = arith.subi %3, %5 : index + %11 = arith.muli %10, %4 : index + %12 = arith.addi %9, %11 : index + %13 = arith.cmpi slt, %3, %5 : index + %14 = arith.select %13, %7, %12 : index + %15 = arith.cmpi slt, %3, %5 : index + %16 = arith.addi %4, %c1 : index + %17 = arith.select %15, %16, %4 : index + %18 = arith.addi %14, %17 : index + %c524288 = arith.constant 524288 : index + %19 = arith.muli %3, %c524288 : index + %20 = pto.addptr %arg0, %19 : -> + %c0_2 = arith.constant 0 : index + %21 = pto.addptr %20, %c0_2 : -> + %c262144 = arith.constant 262144 : index + %22 = pto.addptr %20, %c262144 : -> + %c393216 = arith.constant 393216 : index + %23 = pto.addptr %20, %c393216 : -> + %qk_slot_desc = pto.make_tensor_view %21, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf32> + pto.aiv_initialize_pipe{id = 25, dir_mask = 1, slot_size = 131072} (gm_slot_tensor = %qk_slot_desc : !pto.tensor_view<128x256xf32>) + %pv_slot_desc = pto.make_tensor_view %22, shape = [%c128, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view<128x128xf32> + pto.aiv_initialize_pipe{id = 27, dir_mask = 1, slot_size = 65536} (gm_slot_tensor = 
%pv_slot_desc : !pto.tensor_view<128x128xf32>) + %28 = pto.import_reserved_buffer{name = "fa_p_v2c_fifo", peer_func = @cube_kernel} -> i32 + %c0_i32 = arith.constant 0 : i32 + pto.aiv_initialize_pipe{id = 30, dir_mask = 2, slot_size = 65536, nosplit = false} (gm_slot_buffer = %23 : !pto.ptr, c2v_consumer_buf = %c0_i32 : i32, v2c_consumer_buf = %28 : i32) + %29 = pto.get_subblock_idx + %30 = arith.index_cast %29 : i64 to index + %31 = arith.muli %30, %c64 : index + %c196608_i64 = arith.constant 196608 : i64 + %32 = pto.alloc_tile addr = %c196608_i64 : !pto.tile_buf + %c262144_i64 = arith.constant 262144 : i64 + %33 = pto.alloc_tile addr = %c262144_i64 : !pto.tile_buf + %c327680_i64 = arith.constant 327680 : i64 + %34 = pto.alloc_tile addr = %c327680_i64 : !pto.tile_buf + %c360448_i64 = arith.constant 360448 : i64 + %35 = pto.alloc_tile addr = %c360448_i64 : !pto.tile_buf + %c393216_i64 = arith.constant 393216 : i64 + %36 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %c393472_i64 = arith.constant 393472 : i64 + %37 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %c393728_i64 = arith.constant 393728 : i64 + %38 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %c393984_i64 = arith.constant 393984 : i64 + %39 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %c394240_i64 = arith.constant 394240 : i64 + %40 = pto.alloc_tile addr = %c394240_i64 : !pto.tile_buf + %c394496_i64 = arith.constant 394496 : i64 + %41 = pto.alloc_tile addr = %c394496_i64 : !pto.tile_buf + %cst = arith.constant 0.0883883461 : f32 + %cst_3 = arith.constant 1.000000e+00 : f32 + %c2048 = arith.constant 2048 : index + %42 = pto.make_tensor_view %arg1, shape = [%c2048, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view + scf.for %arg2 = %14 to %18 step %c1 { + %43 = arith.muli %arg2, %c128 : index + %c394752_i64 = arith.constant 394752 : i64 + %44 = pto.alloc_tile addr = %c394752_i64 : !pto.tile_buf + %qk_pop_0 = pto.tpop_from_aic {id = 25, split = 0} -> 
!pto.tensor_view<128x256xf32> + %qk_pop_part_0 = pto.partition_view %qk_pop_0, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> + pto.tload ins(%qk_pop_part_0 : !pto.partition_tensor_view<64x256xf32>) outs(%44 : !pto.tile_buf) + pto.tmuls ins(%44, %cst : !pto.tile_buf, f32) outs(%44 : !pto.tile_buf) + pto.trowmax ins(%44, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %45 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %46 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %47 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf + %48 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %49 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.trowexpandsub ins(%44, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmuls ins(%45, %cst_3 : !pto.tile_buf, f32) outs(%46 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree_from_aic(%qk_pop_0 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %c394752_i64_4 = arith.constant 394752 : i64 + %50 = pto.alloc_tile addr = %c394752_i64_4 : !pto.tile_buf + %qk_pop_1 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_1 = pto.partition_view %qk_pop_1, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> + pto.tload ins(%qk_pop_part_1 : !pto.partition_tensor_view<64x256xf32>) outs(%50 : !pto.tile_buf) + pto.tmuls ins(%50, %cst : !pto.tile_buf, f32) outs(%50 : !pto.tile_buf) + pto.trowmax ins(%50, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %51 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %52 = pto.treshape %36 : !pto.tile_buf 
-> !pto.tile_buf + %53 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf + %54 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %55 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%51, %52 : !pto.tile_buf, !pto.tile_buf) outs(%51 : !pto.tile_buf) + pto.tsub ins(%52, %51 : !pto.tile_buf, !pto.tile_buf) outs(%53 : !pto.tile_buf) + pto.tmuls ins(%51, %cst_3 : !pto.tile_buf, f32) outs(%52 : !pto.tile_buf) + pto.trowexpandsub ins(%50, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%53 : !pto.tile_buf) outs(%53 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%54, %53 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%54, %55 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree_from_aic(%qk_pop_1 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %c394752_i64_5 = arith.constant 394752 : i64 + %56 = pto.alloc_tile addr = %c394752_i64_5 : !pto.tile_buf + %pv_pop_0 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_pop_part_0 = pto.partition_view %pv_pop_0, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_0 : !pto.partition_tensor_view<64x128xf32>) outs(%56 : !pto.tile_buf) + pto.tmov ins(%56 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_0 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %c394752_i64_6 = arith.constant 394752 : i64 + %57 = pto.alloc_tile addr = %c394752_i64_6 : !pto.tile_buf + %qk_pop_2 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_2 = pto.partition_view %qk_pop_2, offsets = [%31, %c0], sizes = 
[%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> + pto.tload ins(%qk_pop_part_2 : !pto.partition_tensor_view<64x256xf32>) outs(%57 : !pto.tile_buf) + pto.tmuls ins(%57, %cst : !pto.tile_buf, f32) outs(%57 : !pto.tile_buf) + pto.trowmax ins(%57, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %58 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %59 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %60 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf + %61 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %62 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%58, %59 : !pto.tile_buf, !pto.tile_buf) outs(%58 : !pto.tile_buf) + pto.tsub ins(%59, %58 : !pto.tile_buf, !pto.tile_buf) outs(%60 : !pto.tile_buf) + pto.tmuls ins(%58, %cst_3 : !pto.tile_buf, f32) outs(%59 : !pto.tile_buf) + pto.trowexpandsub ins(%57, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%60 : !pto.tile_buf) outs(%60 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%61, %60 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%61, %62 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree_from_aic(%qk_pop_2 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %c394752_i64_7 = arith.constant 394752 : i64 + %63 = pto.alloc_tile addr = %c394752_i64_7 : !pto.tile_buf + %pv_pop_1 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_pop_part_1 = pto.partition_view %pv_pop_1, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_1 : !pto.partition_tensor_view<64x128xf32>) 
outs(%63 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %63 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_1 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %c394752_i64_8 = arith.constant 394752 : i64 + %64 = pto.alloc_tile addr = %c394752_i64_8 : !pto.tile_buf + %qk_pop_3 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_3 = pto.partition_view %qk_pop_3, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> + pto.tload ins(%qk_pop_part_3 : !pto.partition_tensor_view<64x256xf32>) outs(%64 : !pto.tile_buf) + pto.tmuls ins(%64, %cst : !pto.tile_buf, f32) outs(%64 : !pto.tile_buf) + pto.trowmax ins(%64, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %65 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %66 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %67 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf + %68 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %69 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%65, %66 : !pto.tile_buf, !pto.tile_buf) outs(%65 : !pto.tile_buf) + pto.tsub ins(%66, %65 : !pto.tile_buf, !pto.tile_buf) outs(%67 : !pto.tile_buf) + pto.tmuls ins(%65, %cst_3 : !pto.tile_buf, f32) outs(%66 : !pto.tile_buf) + pto.trowexpandsub ins(%64, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%67 : !pto.tile_buf) outs(%67 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%68, %67 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%68, %69 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : 
!pto.tile_buf) {id = 30, split = 1} + pto.tfree_from_aic(%qk_pop_3 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %c7 = arith.constant 7 : index + scf.for %arg3 = %c1 to %c7 step %c1 { + %c394752_i64_11 = arith.constant 394752 : i64 + %74 = pto.alloc_tile addr = %c394752_i64_11 : !pto.tile_buf + %pv_pop_2 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_pop_part_2 = pto.partition_view %pv_pop_2, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_2 : !pto.partition_tensor_view<64x128xf32>) outs(%74 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %40 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %74 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_2 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %c394752_i64_12 = arith.constant 394752 : i64 + %75 = pto.alloc_tile addr = %c394752_i64_12 : !pto.tile_buf + %qk_pop_4 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_4 = pto.partition_view %qk_pop_4, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> + pto.tload ins(%qk_pop_part_4 : !pto.partition_tensor_view<64x256xf32>) outs(%75 : !pto.tile_buf) + pto.tmuls ins(%75, %cst : !pto.tile_buf, f32) outs(%75 : !pto.tile_buf) + pto.trowmax ins(%75, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %76 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %77 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %78 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf + %79 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %80 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%76, %77 : !pto.tile_buf, !pto.tile_buf) outs(%76 : !pto.tile_buf) + pto.tsub ins(%77, %76 : !pto.tile_buf, !pto.tile_buf) outs(%78 : !pto.tile_buf) + 
pto.tmuls ins(%76, %cst_3 : !pto.tile_buf, f32) outs(%77 : !pto.tile_buf) + pto.trowexpandsub ins(%75, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%78 : !pto.tile_buf) outs(%78 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%79, %78 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%79, %80 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree_from_aic(%qk_pop_4 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %c394752_i64_13 = arith.constant 394752 : i64 + %81 = pto.alloc_tile addr = %c394752_i64_13 : !pto.tile_buf + %pv_pop_3 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_pop_part_3 = pto.partition_view %pv_pop_3, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_3 : !pto.partition_tensor_view<64x128xf32>) outs(%81 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %81 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_3 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %c394752_i64_14 = arith.constant 394752 : i64 + %82 = pto.alloc_tile addr = %c394752_i64_14 : !pto.tile_buf + %qk_pop_5 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_5 = pto.partition_view %qk_pop_5, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> + pto.tload ins(%qk_pop_part_5 : !pto.partition_tensor_view<64x256xf32>) outs(%82 : !pto.tile_buf) + pto.tmuls ins(%82, %cst : !pto.tile_buf, f32) 
outs(%82 : !pto.tile_buf) + pto.trowmax ins(%82, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %83 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %84 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %85 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf + %86 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %87 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%83, %84 : !pto.tile_buf, !pto.tile_buf) outs(%83 : !pto.tile_buf) + pto.tsub ins(%84, %83 : !pto.tile_buf, !pto.tile_buf) outs(%85 : !pto.tile_buf) + pto.tmuls ins(%83, %cst_3 : !pto.tile_buf, f32) outs(%84 : !pto.tile_buf) + pto.trowexpandsub ins(%82, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%85 : !pto.tile_buf) outs(%85 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%86, %85 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%86, %87 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree_from_aic(%qk_pop_5 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + } + %c394752_i64_9 = arith.constant 394752 : i64 + %70 = pto.alloc_tile addr = %c394752_i64_9 : !pto.tile_buf + %pv_pop_4 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_pop_part_4 = pto.partition_view %pv_pop_4, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_4 : !pto.partition_tensor_view<64x128xf32>) outs(%70 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %40 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %70 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_4 : 
!pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %c394752_i64_10 = arith.constant 394752 : i64 + %71 = pto.alloc_tile addr = %c394752_i64_10 : !pto.tile_buf + %pv_pop_5 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_pop_part_5 = pto.partition_view %pv_pop_5, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_5 : !pto.partition_tensor_view<64x128xf32>) outs(%71 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %71 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_5 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + pto.trowexpanddiv ins(%35, %38 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + %72 = arith.addi %43, %31 : index + %73 = pto.partition_view %42, offsets = [%72, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xf32> + pto.tstore ins(%35 : !pto.tile_buf) outs(%73 : !pto.partition_tensor_view<64x128xf32>) + } + return + } + func.func @call_both(%arg0: memref<256xi64>, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.entry} { + pto.set_ffts %arg0 : memref<256xi64> + call @cube_kernel(%arg1, %arg2, %arg3, %arg4) : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr) -> () + call @vector_kernel(%arg1, %arg5) : (!pto.ptr, !pto.ptr) -> () + return + } +} diff --git a/test/lit/pto/fa_perf.pto b/test/lit/pto/fa_perf.pto new file mode 100644 index 000000000..f1ad9c590 --- /dev/null +++ b/test/lit/pto/fa_perf.pto @@ -0,0 +1,743 @@ +// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync %s >/dev/null + +module { + func.func @cube_kernel(%qk_fifo: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %p_fifo: !pto.ptr, %pv_fifo: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 
0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c128_0 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c4096 = arith.constant 4096 : index + %c128_1 = arith.constant 128 : index + %c16 = arith.constant 16 : index + %c16_2 = arith.constant 16 : index + %0 = pto.get_block_num + %1 = arith.index_cast %0 : i64 to index + %2 = pto.get_block_idx + %3 = arith.index_cast %2 : i64 to index + %4 = arith.divsi %c16_2, %1 : index + %5 = arith.remsi %c16_2, %1 : index + %6 = arith.addi %4, %c1 : index + %7 = arith.muli %3, %6 : index + %8 = arith.addi %4, %c1 : index + %9 = arith.muli %5, %8 : index + %10 = arith.subi %3, %5 : index + %11 = arith.muli %10, %4 : index + %12 = arith.addi %9, %11 : index + %13 = arith.cmpi slt, %3, %5 : index + %14 = arith.select %13, %7, %12 : index + %15 = arith.cmpi slt, %3, %5 : index + %16 = arith.addi %4, %c1 : index + %17 = arith.select %15, %16, %4 : index + %18 = arith.addi %14, %17 : index + %c262144 = arith.constant 262144 : index + %19 = arith.muli %3, %c262144 : index + %21 = pto.addptr %qk_fifo, %19 : -> + %p_block = pto.addptr %p_fifo, %19 : -> + %c131072 = arith.constant 131072 : index + %pv_block_off = arith.muli %3, %c131072 : index + %22 = pto.addptr %pv_fifo, %pv_block_off : -> + %qk_slot_desc = pto.make_tensor_view %21, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf32> + pto.aic_initialize_pipe{id = 25, dir_mask = 1, slot_size = 131072} (gm_slot_tensor = %qk_slot_desc : !pto.tensor_view<128x256xf32>) + %p_slot_desc = pto.make_tensor_view %p_block, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf16> + pto.aic_initialize_pipe{id = 30, dir_mask = 2, slot_size = 65536} (gm_slot_tensor = %p_slot_desc : !pto.tensor_view<128x256xf16>) + %pv_slot_desc = pto.make_tensor_view %22, shape = [%c128, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view<128x128xf32> + pto.aic_initialize_pipe{id = 27, dir_mask = 1, slot_size = 
65536} (gm_slot_tensor = %pv_slot_desc : !pto.tensor_view<128x128xf32>) + %c0_i64 = arith.constant 0 : i64 + %c0_i64_4 = arith.constant 0 : i64 + %29 = pto.alloc_tile addr = %c0_i64_4 : !pto.tile_buf + %c0_i64_5 = arith.constant 0 : i64 + %30 = pto.alloc_tile addr = %c0_i64_5 : !pto.tile_buf + %c32768_i64 = arith.constant 32768 : i64 + %31 = pto.alloc_tile addr = %c32768_i64 : !pto.tile_buf + %c65536_i64 = arith.constant 65536 : i64 + %32 = pto.alloc_tile addr = %c65536_i64 : !pto.tile_buf + %33 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %c0_i64_6 = arith.constant 0 : i64 + %34 = pto.alloc_tile addr = %c0_i64_6 : !pto.tile_buf + %c98304_i64 = arith.constant 98304 : i64 + %35 = pto.alloc_tile addr = %c98304_i64 : !pto.tile_buf + %c32768_i64_7 = arith.constant 32768 : i64 + %36 = pto.alloc_tile addr = %c32768_i64_7 : !pto.tile_buf + %c229376_i64 = arith.constant 229376 : i64 + %37 = pto.alloc_tile addr = %c229376_i64 : !pto.tile_buf + %38 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %c131072_i64 = arith.constant 131072 : i64 + %39 = pto.alloc_tile addr = %c65536_i64 : !pto.tile_buf + %c2048 = arith.constant 2048 : index + %40 = pto.make_tensor_view %arg1, shape = [%c2048, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view + %41 = pto.make_tensor_view %arg2, shape = [%c128_0, %c4096], strides = [%c1, %c128_0] : !pto.tensor_view + %42 = pto.make_tensor_view %arg3, shape = [%c4096, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view + scf.for %arg4 = %14 to %18 step %c1 { + %43 = arith.muli %arg4, %c128 : index + %44 = pto.partition_view %40, offsets = [%43, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%44 : !pto.partition_tensor_view<128x128xf16>) outs(%29 : !pto.tile_buf) + pto.tmov ins(%29 : !pto.tile_buf) outs(%30 : !pto.tile_buf) + %c0_8 = arith.constant 0 : index + %c0_9 = arith.constant 0 : index + %45 = arith.addi %c0_8, %c0_9 : index + %46 = pto.partition_view %41, offsets = [%c0, 
%45], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%46 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_10 = arith.constant 0 : index + %47 = pto.subview %34[%c0, %c0_10] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%47 : !pto.tile_buf) + %c128_11 = arith.constant 128 : index + %48 = arith.addi %c0_8, %c128_11 : index + %49 = pto.partition_view %41, offsets = [%c0, %48], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%49 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_12 = arith.constant 128 : index + %50 = pto.subview %34[%c0, %c128_12] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%50 : !pto.tile_buf) + %qk_push_0 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_0 = pto.partition_view %qk_push_0, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_0 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_0 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %c256_13 = arith.constant 256 : index + %c0_14 = arith.constant 0 : index + %51 = arith.addi %c256_13, %c0_14 : index + %52 = pto.partition_view %41, offsets = [%c0, %51], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%52 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_15 = arith.constant 0 : index + %53 = pto.subview %34[%c0, %c0_15] sizes [128, 128] : 
!pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%53 : !pto.tile_buf) + %c128_16 = arith.constant 128 : index + %54 = arith.addi %c256_13, %c128_16 : index + %55 = pto.partition_view %41, offsets = [%c0, %54], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%55 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_17 = arith.constant 128 : index + %56 = pto.subview %34[%c0, %c128_17] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%56 : !pto.tile_buf) + %qk_push_1 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_1 = pto.partition_view %qk_push_1, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_1 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_1 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %57 = pto.partition_view %42, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%57 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + %c2 = arith.constant 2 : index + %c7 = arith.constant 7 : index + scf.for %arg5 = %c0 to %c7 step %c1 { + %61 = arith.muli %arg5, %c2 : index + %c2_18 = arith.constant 2 : index + %62 = arith.addi %61, %c2_18 : index + %63 = arith.muli %62, %c256 : index + %p_pop_0 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_pop_part_0 = pto.partition_view %p_pop_0, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + 
pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + %pv0_v_base = arith.muli %61, %c256 : index + %pv0_v_part_0 = pto.partition_view %42, offsets = [%pv0_v_base, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv0_v_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %p_pop_part_0_hi = pto.partition_view %p_pop_0, offsets = [%c0, %c128], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_0_hi : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv(%p_pop_0 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %pv0_v_hi = arith.addi %pv0_v_base, %c128 : index + %pv0_v_part_1 = pto.partition_view %42, offsets = [%pv0_v_hi, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv0_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_0 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_part_0 = pto.partition_view %pv_push_0, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_0 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_0 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %c0_19 = arith.constant 0 : index + %68 = arith.addi %63, %c0_19 : index + %69 = pto.partition_view %41, offsets = [%c0, %68], sizes = 
[%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%69 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_20 = arith.constant 0 : index + %70 = pto.subview %34[%c0, %c0_20] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%70 : !pto.tile_buf) + %c128_21 = arith.constant 128 : index + %71 = arith.addi %63, %c128_21 : index + %72 = pto.partition_view %41, offsets = [%c0, %71], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%72 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_22 = arith.constant 128 : index + %73 = pto.subview %34[%c0, %c128_22] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%73 : !pto.tile_buf) + %qk_push_2 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_2 = pto.partition_view %qk_push_2, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_2 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_2 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %74 = arith.muli %arg5, %c2 : index + %75 = arith.addi %74, %c1 : index + %c2_23 = arith.constant 2 : index + %76 = arith.addi %75, %c2_23 : index + %77 = arith.muli %76, %c256 : index + %p_pop_1 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_pop_part_1 = pto.partition_view %p_pop_1, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : 
!pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + %pv1_v_base = arith.muli %75, %c256 : index + %pv1_v_part_0 = pto.partition_view %42, offsets = [%pv1_v_base, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv1_v_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %p_pop_part_1_hi = pto.partition_view %p_pop_1, offsets = [%c0, %c128], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_1_hi : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv(%p_pop_1 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %pv1_v_hi = arith.addi %pv1_v_base, %c128 : index + %pv1_v_part_1 = pto.partition_view %42, offsets = [%pv1_v_hi, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv1_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_1 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_part_1 = pto.partition_view %pv_push_1, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_1 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_1 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %c0_24 = arith.constant 0 : index + %82 = arith.addi %77, %c0_24 : index + %83 = pto.partition_view %41, offsets = [%c0, 
%82], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%83 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_25 = arith.constant 0 : index + %84 = pto.subview %34[%c0, %c0_25] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%84 : !pto.tile_buf) + %c128_26 = arith.constant 128 : index + %85 = arith.addi %77, %c128_26 : index + %86 = pto.partition_view %41, offsets = [%c0, %85], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%86 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_27 = arith.constant 128 : index + %87 = pto.subview %34[%c0, %c128_27] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%87 : !pto.tile_buf) + %qk_push_3 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_3 = pto.partition_view %qk_push_3, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_3 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_3 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + } + %p_pop_2 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_pop_part_2 = pto.partition_view %p_pop_2, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_2 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + %c3584 = arith.constant 3584 : index + %pv2_v_part_0 = pto.partition_view %42, offsets = 
[%c3584, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv2_v_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %p_pop_part_2_hi = pto.partition_view %p_pop_2, offsets = [%c0, %c128], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_2_hi : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv(%p_pop_2 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c3712 = arith.constant 3712 : index + %pv2_v_part_1 = pto.partition_view %42, offsets = [%c3712, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv2_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_2 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_part_2 = pto.partition_view %pv_push_2, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_2 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_2 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + %p_pop_3 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_pop_part_3 = pto.partition_view %p_pop_3, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_3 : 
!pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + %c3840 = arith.constant 3840 : index + %pv3_v_part_0 = pto.partition_view %42, offsets = [%c3840, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv3_v_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %p_pop_part_3_hi = pto.partition_view %p_pop_3, offsets = [%c0, %c128], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_3_hi : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv(%p_pop_3 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c3968 = arith.constant 3968 : index + %pv3_v_part_1 = pto.partition_view %42, offsets = [%c3968, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv3_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_3 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_part_3 = pto.partition_view %pv_push_3, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_3 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_3 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + } + return + } + func.func @vector_kernel(%qk_fifo: !pto.ptr, %arg1: !pto.ptr, %p_fifo: !pto.ptr, 
%pv_fifo: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %c128_0 = arith.constant 128 : index + %c16 = arith.constant 16 : index + %c16_1 = arith.constant 16 : index + %0 = pto.get_block_num + %1 = arith.index_cast %0 : i64 to index + %2 = pto.get_block_idx + %3 = arith.index_cast %2 : i64 to index + %4 = arith.divsi %c16_1, %1 : index + %5 = arith.remsi %c16_1, %1 : index + %6 = arith.addi %4, %c1 : index + %7 = arith.muli %3, %6 : index + %8 = arith.addi %4, %c1 : index + %9 = arith.muli %5, %8 : index + %10 = arith.subi %3, %5 : index + %11 = arith.muli %10, %4 : index + %12 = arith.addi %9, %11 : index + %13 = arith.cmpi slt, %3, %5 : index + %14 = arith.select %13, %7, %12 : index + %15 = arith.cmpi slt, %3, %5 : index + %16 = arith.addi %4, %c1 : index + %17 = arith.select %15, %16, %4 : index + %18 = arith.addi %14, %17 : index + %c262144 = arith.constant 262144 : index + %19 = arith.muli %3, %c262144 : index + %21 = pto.addptr %qk_fifo, %19 : -> + %p_block = pto.addptr %p_fifo, %19 : -> + %c131072 = arith.constant 131072 : index + %pv_block_off = arith.muli %3, %c131072 : index + %22 = pto.addptr %pv_fifo, %pv_block_off : -> + %qk_slot_desc = pto.make_tensor_view %21, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf32> + pto.aiv_initialize_pipe{id = 25, dir_mask = 1, slot_size = 131072} (gm_slot_tensor = %qk_slot_desc : !pto.tensor_view<128x256xf32>) + %p_slot_desc = pto.make_tensor_view %p_block, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf16> + pto.aiv_initialize_pipe{id = 30, dir_mask = 2, slot_size = 65536} (gm_slot_tensor = %p_slot_desc : !pto.tensor_view<128x256xf16>) + %pv_slot_desc = pto.make_tensor_view %22, shape = [%c64, %c128_0], strides = [%c128_0, %c1] : 
!pto.tensor_view<64x128xf32> + pto.aiv_initialize_pipe{id = 27, dir_mask = 1, slot_size = 65536} (gm_slot_tensor = %pv_slot_desc : !pto.tensor_view<64x128xf32>) + %29 = pto.get_subblock_idx + %30 = arith.index_cast %29 : i64 to index + %31 = arith.muli %30, %c64 : index + %row_slice_1 = arith.addi %31, %c32 : index + %c196608_i64 = arith.constant 196608 : i64 + %32 = pto.alloc_tile addr = %c196608_i64 : !pto.tile_buf + %c262144_i64 = arith.constant 262144 : i64 + %33 = pto.alloc_tile addr = %c262144_i64 : !pto.tile_buf + %c327680_i64 = arith.constant 327680 : i64 + %34 = pto.alloc_tile addr = %c327680_i64 : !pto.tile_buf + %c360448_i64 = arith.constant 360448 : i64 + %35 = pto.alloc_tile addr = %c360448_i64 : !pto.tile_buf + %c393216_i64 = arith.constant 393216 : i64 + %c393344_i64 = arith.constant 393344 : i64 + %36 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %c393472_i64 = arith.constant 393472 : i64 + %c393600_i64 = arith.constant 393600 : i64 + %37 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %c393728_i64 = arith.constant 393728 : i64 + %c393856_i64 = arith.constant 393856 : i64 + %38 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %c393984_i64 = arith.constant 393984 : i64 + %c394112_i64 = arith.constant 394112 : i64 + %39 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %c394240_i64 = arith.constant 394240 : i64 + %c394368_i64 = arith.constant 394368 : i64 + %40 = pto.alloc_tile addr = %c394240_i64 : !pto.tile_buf + %c394496_i64 = arith.constant 394496 : i64 + %c394624_i64 = arith.constant 394624 : i64 + %41 = pto.alloc_tile addr = %c394496_i64 : !pto.tile_buf + %cst = arith.constant 0.0883883461 : f32 + %cst_3 = arith.constant 1.000000e+00 : f32 + %c2048 = arith.constant 2048 : index + %42 = pto.make_tensor_view %arg1, shape = [%c2048, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view + scf.for %arg2 = %14 to %18 step %c1 { + %43 = arith.muli %arg2, %c128 : index + %c394752_i64 = arith.constant 394752 : i64 + %44 = 
pto.alloc_tile addr = %c394752_i64 : !pto.tile_buf + %qk_pop_0 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_0 = pto.partition_view %qk_pop_0, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_0 : !pto.partition_tensor_view<32x256xf32>) outs(%44 : !pto.tile_buf) + pto.tmuls ins(%44, %cst : !pto.tile_buf, f32) outs(%44 : !pto.tile_buf) + %r36_0 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %r37_0 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_0 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_0 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r40_0 = pto.alloc_tile addr = %c394240_i64 : !pto.tile_buf + pto.trowmax ins(%44, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_0 : !pto.tile_buf) + %45 = pto.treshape %r37_0 : !pto.tile_buf -> !pto.tile_buf + %46 = pto.treshape %r36_0 : !pto.tile_buf -> !pto.tile_buf + %47 = pto.treshape %r40_0 : !pto.tile_buf -> !pto.tile_buf + %48 = pto.treshape %r38_0 : !pto.tile_buf -> !pto.tile_buf + %49 = pto.treshape %r39_0 : !pto.tile_buf -> !pto.tile_buf + pto.trowexpandsub ins(%44, %r37_0 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmuls ins(%45, %cst_3 : !pto.tile_buf, f32) outs(%46 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r38_0 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_0 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_0 = pto.partition_view %p_push_0, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_0 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_0_r1 = pto.partition_view %qk_pop_0, offsets = 
[%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_0_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%44 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_0 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%44, %cst : !pto.tile_buf, f32) outs(%44 : !pto.tile_buf) + %r36_0_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_0_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_0_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_0_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r40_0_r1 = pto.alloc_tile addr = %c394368_i64 : !pto.tile_buf + pto.trowmax ins(%44, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_0_r1 : !pto.tile_buf) + %qk0_r1_max = pto.treshape %r37_0_r1 : !pto.tile_buf -> !pto.tile_buf + %qk0_r1_gmax = pto.treshape %r36_0_r1 : !pto.tile_buf -> !pto.tile_buf + %qk0_r1_tmp = pto.treshape %r40_0_r1 : !pto.tile_buf -> !pto.tile_buf + %qk0_r1_lsum = pto.treshape %r38_0_r1 : !pto.tile_buf -> !pto.tile_buf + %qk0_r1_gsum = pto.treshape %r39_0_r1 : !pto.tile_buf -> !pto.tile_buf + pto.trowexpandsub ins(%44, %r37_0_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmuls ins(%qk0_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk0_r1_gmax : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r38_0_r1 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_0_r1 = pto.partition_view %p_push_0, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_0_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_0 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c394752_i64_4 = arith.constant 394752 : i64 + %50 = 
pto.alloc_tile addr = %c394752_i64_4 : !pto.tile_buf + %qk_pop_1 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_1 = pto.partition_view %qk_pop_1, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_1 : !pto.partition_tensor_view<32x256xf32>) outs(%50 : !pto.tile_buf) + pto.tmuls ins(%50, %cst : !pto.tile_buf, f32) outs(%50 : !pto.tile_buf) + %r36_1 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %r37_1 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_1 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_1 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r41_1 = pto.alloc_tile addr = %c394496_i64 : !pto.tile_buf + pto.trowmax ins(%50, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_1 : !pto.tile_buf) + %51 = pto.treshape %r37_1 : !pto.tile_buf -> !pto.tile_buf + %52 = pto.treshape %r36_1 : !pto.tile_buf -> !pto.tile_buf + %53 = pto.treshape %r41_1 : !pto.tile_buf -> !pto.tile_buf + %54 = pto.treshape %r38_1 : !pto.tile_buf -> !pto.tile_buf + %55 = pto.treshape %r39_1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%51, %52 : !pto.tile_buf, !pto.tile_buf) outs(%51 : !pto.tile_buf) + pto.tsub ins(%52, %51 : !pto.tile_buf, !pto.tile_buf) outs(%53 : !pto.tile_buf) + pto.tmuls ins(%51, %cst_3 : !pto.tile_buf, f32) outs(%52 : !pto.tile_buf) + pto.trowexpandsub ins(%50, %r37_1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%53 : !pto.tile_buf) outs(%53 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%54, %53 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_1 : !pto.tile_buf) + pto.tadd ins(%54, %55 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_1 = 
pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_1 = pto.partition_view %p_push_1, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_1 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_1_r1 = pto.partition_view %qk_pop_1, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_1_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%50 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_1 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%50, %cst : !pto.tile_buf, f32) outs(%50 : !pto.tile_buf) + %r36_1_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_1_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_1_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_1_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r41_1_r1 = pto.alloc_tile addr = %c394624_i64 : !pto.tile_buf + pto.trowmax ins(%50, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_1_r1 : !pto.tile_buf) + %qk1_r1_max = pto.treshape %r37_1_r1 : !pto.tile_buf -> !pto.tile_buf + %qk1_r1_gmax = pto.treshape %r36_1_r1 : !pto.tile_buf -> !pto.tile_buf + %qk1_r1_diff = pto.treshape %r41_1_r1 : !pto.tile_buf -> !pto.tile_buf + %qk1_r1_lsum = pto.treshape %r38_1_r1 : !pto.tile_buf -> !pto.tile_buf + %qk1_r1_gsum = pto.treshape %r39_1_r1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%qk1_r1_max, %qk1_r1_gmax : !pto.tile_buf, !pto.tile_buf) outs(%qk1_r1_max : !pto.tile_buf) + pto.tsub ins(%qk1_r1_gmax, %qk1_r1_max : !pto.tile_buf, !pto.tile_buf) outs(%qk1_r1_diff : !pto.tile_buf) + pto.tmuls ins(%qk1_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk1_r1_gmax : !pto.tile_buf) + pto.trowexpandsub ins(%50, %r37_1_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%qk1_r1_diff : 
!pto.tile_buf) outs(%qk1_r1_diff : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%qk1_r1_lsum, %qk1_r1_diff : !pto.tile_buf, !pto.tile_buf) outs(%qk1_r1_lsum : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_1_r1 : !pto.tile_buf) + pto.tadd ins(%qk1_r1_lsum, %qk1_r1_gsum : !pto.tile_buf, !pto.tile_buf) outs(%qk1_r1_lsum : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_1_r1 = pto.partition_view %p_push_1, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_1_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_1 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c394752_i64_5 = arith.constant 394752 : i64 + %56 = pto.alloc_tile addr = %c394752_i64_5 : !pto.tile_buf + %pv_pop_0 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_0 = pto.partition_view %pv_pop_0, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_0 : !pto.partition_tensor_view<64x128xf32>) outs(%56 : !pto.tile_buf) + pto.tmov ins(%56 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_0 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + %c394752_i64_6 = arith.constant 394752 : i64 + %57 = pto.alloc_tile addr = %c394752_i64_6 : !pto.tile_buf + %qk_pop_2 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_2 = pto.partition_view %qk_pop_2, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_2 : !pto.partition_tensor_view<32x256xf32>) outs(%57 : !pto.tile_buf) + pto.tmuls ins(%57, %cst : !pto.tile_buf, f32) outs(%57 : 
!pto.tile_buf) + %r36_2 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %r37_2 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_2 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_2 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r40_2 = pto.alloc_tile addr = %c394240_i64 : !pto.tile_buf + pto.trowmax ins(%57, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_2 : !pto.tile_buf) + %58 = pto.treshape %r37_2 : !pto.tile_buf -> !pto.tile_buf + %59 = pto.treshape %r36_2 : !pto.tile_buf -> !pto.tile_buf + %60 = pto.treshape %r40_2 : !pto.tile_buf -> !pto.tile_buf + %61 = pto.treshape %r38_2 : !pto.tile_buf -> !pto.tile_buf + %62 = pto.treshape %r39_2 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%58, %59 : !pto.tile_buf, !pto.tile_buf) outs(%58 : !pto.tile_buf) + pto.tsub ins(%59, %58 : !pto.tile_buf, !pto.tile_buf) outs(%60 : !pto.tile_buf) + pto.tmuls ins(%58, %cst_3 : !pto.tile_buf, f32) outs(%59 : !pto.tile_buf) + pto.trowexpandsub ins(%57, %r37_2 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%60 : !pto.tile_buf) outs(%60 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%61, %60 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_2 : !pto.tile_buf) + pto.tadd ins(%61, %62 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_2 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_2 = pto.partition_view %p_push_2, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_2 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_2_r1 = pto.partition_view %qk_pop_2, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> 
!pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_2_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%57 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_2 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%57, %cst : !pto.tile_buf, f32) outs(%57 : !pto.tile_buf) + %r36_2_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_2_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_2_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_2_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r40_2_r1 = pto.alloc_tile addr = %c394368_i64 : !pto.tile_buf + pto.trowmax ins(%57, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_2_r1 : !pto.tile_buf) + %qk2_r1_max = pto.treshape %r37_2_r1 : !pto.tile_buf -> !pto.tile_buf + %qk2_r1_gmax = pto.treshape %r36_2_r1 : !pto.tile_buf -> !pto.tile_buf + %qk2_r1_diff = pto.treshape %r40_2_r1 : !pto.tile_buf -> !pto.tile_buf + %qk2_r1_lsum = pto.treshape %r38_2_r1 : !pto.tile_buf -> !pto.tile_buf + %qk2_r1_gsum = pto.treshape %r39_2_r1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%qk2_r1_max, %qk2_r1_gmax : !pto.tile_buf, !pto.tile_buf) outs(%qk2_r1_max : !pto.tile_buf) + pto.tsub ins(%qk2_r1_gmax, %qk2_r1_max : !pto.tile_buf, !pto.tile_buf) outs(%qk2_r1_diff : !pto.tile_buf) + pto.tmuls ins(%qk2_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk2_r1_gmax : !pto.tile_buf) + pto.trowexpandsub ins(%57, %r37_2_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%qk2_r1_diff : !pto.tile_buf) outs(%qk2_r1_diff : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%qk2_r1_lsum, %qk2_r1_diff : !pto.tile_buf, !pto.tile_buf) outs(%qk2_r1_lsum : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_2_r1 : !pto.tile_buf) + pto.tadd ins(%qk2_r1_lsum, %qk2_r1_gsum : !pto.tile_buf, !pto.tile_buf) outs(%qk2_r1_lsum : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : 
!pto.tile_buf) + %p_push_part_2_r1 = pto.partition_view %p_push_2, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_2_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_2 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c394752_i64_7 = arith.constant 394752 : i64 + %63 = pto.alloc_tile addr = %c394752_i64_7 : !pto.tile_buf + %pv_pop_1 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_1 = pto.partition_view %pv_pop_1, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_1 : !pto.partition_tensor_view<64x128xf32>) outs(%63 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %63 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_1 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + %c394752_i64_8 = arith.constant 394752 : i64 + %64 = pto.alloc_tile addr = %c394752_i64_8 : !pto.tile_buf + %qk_pop_3 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_3 = pto.partition_view %qk_pop_3, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_3 : !pto.partition_tensor_view<32x256xf32>) outs(%64 : !pto.tile_buf) + pto.tmuls ins(%64, %cst : !pto.tile_buf, f32) outs(%64 : !pto.tile_buf) + %r36_3 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %r37_3 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_3 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_3 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r41_3 = pto.alloc_tile addr = %c394496_i64 : !pto.tile_buf + pto.trowmax ins(%64, %33 : !pto.tile_buf, !pto.tile_buf) 
outs(%r37_3 : !pto.tile_buf) + %65 = pto.treshape %r37_3 : !pto.tile_buf -> !pto.tile_buf + %66 = pto.treshape %r36_3 : !pto.tile_buf -> !pto.tile_buf + %67 = pto.treshape %r41_3 : !pto.tile_buf -> !pto.tile_buf + %68 = pto.treshape %r38_3 : !pto.tile_buf -> !pto.tile_buf + %69 = pto.treshape %r39_3 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%65, %66 : !pto.tile_buf, !pto.tile_buf) outs(%65 : !pto.tile_buf) + pto.tsub ins(%66, %65 : !pto.tile_buf, !pto.tile_buf) outs(%67 : !pto.tile_buf) + pto.tmuls ins(%65, %cst_3 : !pto.tile_buf, f32) outs(%66 : !pto.tile_buf) + pto.trowexpandsub ins(%64, %r37_3 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%67 : !pto.tile_buf) outs(%67 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%68, %67 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_3 : !pto.tile_buf) + pto.tadd ins(%68, %69 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_3 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_3 = pto.partition_view %p_push_3, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_3 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_3_r1 = pto.partition_view %qk_pop_3, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_3_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%64 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_3 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%64, %cst : !pto.tile_buf, f32) outs(%64 : !pto.tile_buf) + %r36_3_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_3_r1 = 
pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_3_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_3_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r41_3_r1 = pto.alloc_tile addr = %c394624_i64 : !pto.tile_buf + pto.trowmax ins(%64, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_3_r1 : !pto.tile_buf) + %qk3_r1_max = pto.treshape %r37_3_r1 : !pto.tile_buf -> !pto.tile_buf + %qk3_r1_gmax = pto.treshape %r36_3_r1 : !pto.tile_buf -> !pto.tile_buf + %qk3_r1_diff = pto.treshape %r41_3_r1 : !pto.tile_buf -> !pto.tile_buf + %qk3_r1_lsum = pto.treshape %r38_3_r1 : !pto.tile_buf -> !pto.tile_buf + %qk3_r1_gsum = pto.treshape %r39_3_r1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%qk3_r1_max, %qk3_r1_gmax : !pto.tile_buf, !pto.tile_buf) outs(%qk3_r1_max : !pto.tile_buf) + pto.tsub ins(%qk3_r1_gmax, %qk3_r1_max : !pto.tile_buf, !pto.tile_buf) outs(%qk3_r1_diff : !pto.tile_buf) + pto.tmuls ins(%qk3_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk3_r1_gmax : !pto.tile_buf) + pto.trowexpandsub ins(%64, %r37_3_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%qk3_r1_diff : !pto.tile_buf) outs(%qk3_r1_diff : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%qk3_r1_lsum, %qk3_r1_diff : !pto.tile_buf, !pto.tile_buf) outs(%qk3_r1_lsum : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_3_r1 : !pto.tile_buf) + pto.tadd ins(%qk3_r1_lsum, %qk3_r1_gsum : !pto.tile_buf, !pto.tile_buf) outs(%qk3_r1_lsum : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_3_r1 = pto.partition_view %p_push_3, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_3_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_3 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} 
+ %c7 = arith.constant 7 : index + scf.for %arg3 = %c1 to %c7 step %c1 { + %c394752_i64_11 = arith.constant 394752 : i64 + %74 = pto.alloc_tile addr = %c394752_i64_11 : !pto.tile_buf + %pv_pop_2 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_2 = pto.partition_view %pv_pop_2, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_2 : !pto.partition_tensor_view<64x128xf32>) outs(%74 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %40 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %74 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_2 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + %c394752_i64_12 = arith.constant 394752 : i64 + %75 = pto.alloc_tile addr = %c394752_i64_12 : !pto.tile_buf + %qk_pop_4 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_4 = pto.partition_view %qk_pop_4, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_4 : !pto.partition_tensor_view<32x256xf32>) outs(%75 : !pto.tile_buf) + pto.tmuls ins(%75, %cst : !pto.tile_buf, f32) outs(%75 : !pto.tile_buf) + %r36_4 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %r37_4 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_4 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_4 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r40_4 = pto.alloc_tile addr = %c394240_i64 : !pto.tile_buf + pto.trowmax ins(%75, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_4 : !pto.tile_buf) + %76 = pto.treshape %r37_4 : !pto.tile_buf -> !pto.tile_buf + %77 = pto.treshape %r36_4 : !pto.tile_buf -> !pto.tile_buf + %78 = pto.treshape %r40_4 : !pto.tile_buf -> !pto.tile_buf + %79 = pto.treshape %r38_4 : !pto.tile_buf -> !pto.tile_buf + %80 = pto.treshape %r39_4 : 
!pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%76, %77 : !pto.tile_buf, !pto.tile_buf) outs(%76 : !pto.tile_buf) + pto.tsub ins(%77, %76 : !pto.tile_buf, !pto.tile_buf) outs(%78 : !pto.tile_buf) + pto.tmuls ins(%76, %cst_3 : !pto.tile_buf, f32) outs(%77 : !pto.tile_buf) + pto.trowexpandsub ins(%75, %r37_4 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%78 : !pto.tile_buf) outs(%78 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%79, %78 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_4 : !pto.tile_buf) + pto.tadd ins(%79, %80 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_4 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_4 = pto.partition_view %p_push_4, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_4 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_4_r1 = pto.partition_view %qk_pop_4, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_4_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%75 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_4 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%75, %cst : !pto.tile_buf, f32) outs(%75 : !pto.tile_buf) + %r36_4_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_4_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_4_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_4_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r40_4_r1 = pto.alloc_tile addr = %c394368_i64 : !pto.tile_buf + pto.trowmax ins(%75, %33 : !pto.tile_buf, !pto.tile_buf) 
outs(%r37_4_r1 : !pto.tile_buf) + %qk4_r1_max = pto.treshape %r37_4_r1 : !pto.tile_buf -> !pto.tile_buf + %qk4_r1_gmax = pto.treshape %r36_4_r1 : !pto.tile_buf -> !pto.tile_buf + %qk4_r1_diff = pto.treshape %r40_4_r1 : !pto.tile_buf -> !pto.tile_buf + %qk4_r1_lsum = pto.treshape %r38_4_r1 : !pto.tile_buf -> !pto.tile_buf + %qk4_r1_gsum = pto.treshape %r39_4_r1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%qk4_r1_max, %qk4_r1_gmax : !pto.tile_buf, !pto.tile_buf) outs(%qk4_r1_max : !pto.tile_buf) + pto.tsub ins(%qk4_r1_gmax, %qk4_r1_max : !pto.tile_buf, !pto.tile_buf) outs(%qk4_r1_diff : !pto.tile_buf) + pto.tmuls ins(%qk4_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk4_r1_gmax : !pto.tile_buf) + pto.trowexpandsub ins(%75, %r37_4_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%qk4_r1_diff : !pto.tile_buf) outs(%qk4_r1_diff : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%qk4_r1_lsum, %qk4_r1_diff : !pto.tile_buf, !pto.tile_buf) outs(%qk4_r1_lsum : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_4_r1 : !pto.tile_buf) + pto.tadd ins(%qk4_r1_lsum, %qk4_r1_gsum : !pto.tile_buf, !pto.tile_buf) outs(%qk4_r1_lsum : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_4_r1 = pto.partition_view %p_push_4, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_4_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_4 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c394752_i64_13 = arith.constant 394752 : i64 + %81 = pto.alloc_tile addr = %c394752_i64_13 : !pto.tile_buf + %pv_pop_3 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_3 = pto.partition_view %pv_pop_3, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : 
!pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_3 : !pto.partition_tensor_view<64x128xf32>) outs(%81 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %81 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_3 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + %c394752_i64_14 = arith.constant 394752 : i64 + %82 = pto.alloc_tile addr = %c394752_i64_14 : !pto.tile_buf + %qk_pop_5 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_5 = pto.partition_view %qk_pop_5, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_5 : !pto.partition_tensor_view<32x256xf32>) outs(%82 : !pto.tile_buf) + pto.tmuls ins(%82, %cst : !pto.tile_buf, f32) outs(%82 : !pto.tile_buf) + %r36_5 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %r37_5 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_5 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_5 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r41_5 = pto.alloc_tile addr = %c394496_i64 : !pto.tile_buf + pto.trowmax ins(%82, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_5 : !pto.tile_buf) + %83 = pto.treshape %r37_5 : !pto.tile_buf -> !pto.tile_buf + %84 = pto.treshape %r36_5 : !pto.tile_buf -> !pto.tile_buf + %85 = pto.treshape %r41_5 : !pto.tile_buf -> !pto.tile_buf + %86 = pto.treshape %r38_5 : !pto.tile_buf -> !pto.tile_buf + %87 = pto.treshape %r39_5 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%83, %84 : !pto.tile_buf, !pto.tile_buf) outs(%83 : !pto.tile_buf) + pto.tsub ins(%84, %83 : !pto.tile_buf, !pto.tile_buf) outs(%85 : !pto.tile_buf) + pto.tmuls ins(%83, %cst_3 : !pto.tile_buf, f32) outs(%84 : !pto.tile_buf) + pto.trowexpandsub ins(%82, %r37_5 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) 
+ pto.texp ins(%85 : !pto.tile_buf) outs(%85 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%86, %85 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_5 : !pto.tile_buf) + pto.tadd ins(%86, %87 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_5 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_5 = pto.partition_view %p_push_5, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_5 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_5_r1 = pto.partition_view %qk_pop_5, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_5_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%82 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_5 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%82, %cst : !pto.tile_buf, f32) outs(%82 : !pto.tile_buf) + %r36_5_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_5_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_5_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_5_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r41_5_r1 = pto.alloc_tile addr = %c394624_i64 : !pto.tile_buf + pto.trowmax ins(%82, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_5_r1 : !pto.tile_buf) + %qk5_r1_max = pto.treshape %r37_5_r1 : !pto.tile_buf -> !pto.tile_buf + %qk5_r1_gmax = pto.treshape %r36_5_r1 : !pto.tile_buf -> !pto.tile_buf + %qk5_r1_diff = pto.treshape %r41_5_r1 : !pto.tile_buf -> !pto.tile_buf + %qk5_r1_lsum = pto.treshape %r38_5_r1 : !pto.tile_buf -> !pto.tile_buf + %qk5_r1_gsum = pto.treshape %r39_5_r1 : 
!pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%qk5_r1_max, %qk5_r1_gmax : !pto.tile_buf, !pto.tile_buf) outs(%qk5_r1_max : !pto.tile_buf) + pto.tsub ins(%qk5_r1_gmax, %qk5_r1_max : !pto.tile_buf, !pto.tile_buf) outs(%qk5_r1_diff : !pto.tile_buf) + pto.tmuls ins(%qk5_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk5_r1_gmax : !pto.tile_buf) + pto.trowexpandsub ins(%82, %r37_5_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%qk5_r1_diff : !pto.tile_buf) outs(%qk5_r1_diff : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%qk5_r1_lsum, %qk5_r1_diff : !pto.tile_buf, !pto.tile_buf) outs(%qk5_r1_lsum : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_5_r1 : !pto.tile_buf) + pto.tadd ins(%qk5_r1_lsum, %qk5_r1_gsum : !pto.tile_buf, !pto.tile_buf) outs(%qk5_r1_lsum : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_5_r1 = pto.partition_view %p_push_5, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_5_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_5 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + } + %c394752_i64_9 = arith.constant 394752 : i64 + %70 = pto.alloc_tile addr = %c394752_i64_9 : !pto.tile_buf + %pv_pop_4 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_4 = pto.partition_view %pv_pop_4, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_4 : !pto.partition_tensor_view<64x128xf32>) outs(%70 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %40 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %70 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_4 : 
!pto.tensor_view<64x128xf32>) {id = 27, split = 1} + %c394752_i64_10 = arith.constant 394752 : i64 + %71 = pto.alloc_tile addr = %c394752_i64_10 : !pto.tile_buf + %pv_pop_5 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_5 = pto.partition_view %pv_pop_5, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_5 : !pto.partition_tensor_view<64x128xf32>) outs(%71 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %71 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_5 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + pto.trowexpanddiv ins(%35, %38 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + %72 = arith.addi %43, %31 : index + %73 = pto.partition_view %42, offsets = [%72, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xf32> + pto.tstore ins(%35 : !pto.tile_buf) outs(%73 : !pto.partition_tensor_view<64x128xf32>) + } + return + }
+ func.func @call_both(%arg0: memref<256xi64>, %q: !pto.ptr, %k: !pto.ptr, %v: !pto.ptr, %p_fifo: !pto.ptr, %o_out: !pto.ptr, %qk_fifo: !pto.ptr, %pv_fifo: !pto.ptr) attributes {pto.entry} { // Function tagged pto.entry: hands %arg0 to pto.set_ffts, then calls @cube_kernel followed by @vector_kernel; returns nothing.
+ pto.set_ffts %arg0 : memref<256xi64> // NOTE(review): presumably installs %arg0 as the FFTS base used for cross-core signalling before either kernel runs - confirm against the PTO dialect docs.
+ call @cube_kernel(%qk_fifo, %q, %k, %v, %p_fifo, %pv_fifo) : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr) -> () // Cube side gets all three FIFO pointers plus %q/%k/%v; %o_out is not passed here.
+ call @vector_kernel(%qk_fifo, %o_out, %p_fifo, %pv_fifo) : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr) -> () // Vector side gets the same three FIFO pointers plus %o_out; %q/%k/%v are not passed here.
+ return + } +} From 45f82f6242c24cd0e12a8c9756a8e81347671923 Mon Sep 17 00:00:00 2001 From: zhangstevenunity <128771452+zhangstevenunity@users.noreply.github.com> Date: Thu, 30 Apr 2026 16:44:09 +0800 Subject: [PATCH 2/5] Fix FA perf PV pipe split mode --- test/lit/pto/fa_perf.pto | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git
a/test/lit/pto/fa_perf.pto b/test/lit/pto/fa_perf.pto index f1ad9c590..9c4be080a 100644 --- a/test/lit/pto/fa_perf.pto +++ b/test/lit/pto/fa_perf.pto @@ -142,10 +142,10 @@ module { pto.tload ins(%pv0_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - %pv_push_0 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_0 = pto.talloc_to_aiv {id = 27, split = 1} -> !pto.tensor_view<128x128xf32> %pv_push_part_0 = pto.partition_view %pv_push_0, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_0 : !pto.partition_tensor_view<128x128xf32>) - pto.tpush_to_aiv(%pv_push_0 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + pto.tpush_to_aiv(%pv_push_0 : !pto.tensor_view<128x128xf32>) {id = 27, split = 1} %c0_19 = arith.constant 0 : index %68 = arith.addi %63, %c0_19 : index %69 = pto.partition_view %41, offsets = [%c0, %68], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> @@ -189,10 +189,10 @@ module { pto.tload ins(%pv1_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - %pv_push_1 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_1 = pto.talloc_to_aiv {id = 27, split = 1} -> !pto.tensor_view<128x128xf32> %pv_push_part_1 = pto.partition_view %pv_push_1, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_1 : 
!pto.partition_tensor_view<128x128xf32>) - pto.tpush_to_aiv(%pv_push_1 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + pto.tpush_to_aiv(%pv_push_1 : !pto.tensor_view<128x128xf32>) {id = 27, split = 1} %c0_24 = arith.constant 0 : index %82 = arith.addi %77, %c0_24 : index %83 = pto.partition_view %41, offsets = [%c0, %82], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> @@ -232,10 +232,10 @@ module { pto.tload ins(%pv2_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - %pv_push_2 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_2 = pto.talloc_to_aiv {id = 27, split = 1} -> !pto.tensor_view<128x128xf32> %pv_push_part_2 = pto.partition_view %pv_push_2, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_2 : !pto.partition_tensor_view<128x128xf32>) - pto.tpush_to_aiv(%pv_push_2 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + pto.tpush_to_aiv(%pv_push_2 : !pto.tensor_view<128x128xf32>) {id = 27, split = 1} %p_pop_3 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> %p_pop_part_3 = pto.partition_view %p_pop_3, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> pto.tload ins(%p_pop_part_3 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) @@ -254,10 +254,10 @@ module { pto.tload ins(%pv3_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - %pv_push_3 = 
pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> + %pv_push_3 = pto.talloc_to_aiv {id = 27, split = 1} -> !pto.tensor_view<128x128xf32> %pv_push_part_3 = pto.partition_view %pv_push_3, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_3 : !pto.partition_tensor_view<128x128xf32>) - pto.tpush_to_aiv(%pv_push_3 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} + pto.tpush_to_aiv(%pv_push_3 : !pto.tensor_view<128x128xf32>) {id = 27, split = 1} } return } From 87a07ef861376e77529b6b724a590e1661c8dcd3 Mon Sep 17 00:00:00 2001 From: zhangstevenunity <128771452+zhangstevenunity@users.noreply.github.com> Date: Thu, 30 Apr 2026 20:37:01 +0800 Subject: [PATCH 3/5] Add FA perf smoke PTO lit case --- test/lit/pto/fa_perf_smoke.pto | 743 +++++++++++++++++++++++++++++++++ 1 file changed, 743 insertions(+) create mode 100644 test/lit/pto/fa_perf_smoke.pto diff --git a/test/lit/pto/fa_perf_smoke.pto b/test/lit/pto/fa_perf_smoke.pto new file mode 100644 index 000000000..a17d2fd6e --- /dev/null +++ b/test/lit/pto/fa_perf_smoke.pto @@ -0,0 +1,743 @@ +// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync %s >/dev/null + +module { + func.func @cube_kernel(%qk_fifo: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %p_fifo: !pto.ptr, %pv_fifo: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c128_0 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c4096 = arith.constant 4096 : index + %c128_1 = arith.constant 128 : index + %c16 = arith.constant 1 : index + %c16_2 = arith.constant 1 : index + %0 = pto.get_block_num + %1 = arith.index_cast %0 : i64 to index + %2 = pto.get_block_idx + %3 = arith.index_cast %2 : i64 to index + %4 = arith.divsi %c16_2, %1 : index + %5 = arith.remsi 
%c16_2, %1 : index + %6 = arith.addi %4, %c1 : index + %7 = arith.muli %3, %6 : index + %8 = arith.addi %4, %c1 : index + %9 = arith.muli %5, %8 : index + %10 = arith.subi %3, %5 : index + %11 = arith.muli %10, %4 : index + %12 = arith.addi %9, %11 : index + %13 = arith.cmpi slt, %3, %5 : index + %14 = arith.select %13, %7, %12 : index + %15 = arith.cmpi slt, %3, %5 : index + %16 = arith.addi %4, %c1 : index + %17 = arith.select %15, %16, %4 : index + %18 = arith.addi %14, %17 : index + %c262144 = arith.constant 262144 : index + %19 = arith.muli %3, %c262144 : index + %21 = pto.addptr %qk_fifo, %19 : -> + %p_block = pto.addptr %p_fifo, %19 : -> + %c131072 = arith.constant 131072 : index + %pv_block_off = arith.muli %3, %c131072 : index + %22 = pto.addptr %pv_fifo, %pv_block_off : -> + %qk_slot_desc = pto.make_tensor_view %21, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf32> + pto.aic_initialize_pipe{id = 25, dir_mask = 1, slot_size = 131072} (gm_slot_tensor = %qk_slot_desc : !pto.tensor_view<128x256xf32>) + %p_slot_desc = pto.make_tensor_view %p_block, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf16> + pto.aic_initialize_pipe{id = 30, dir_mask = 2, slot_size = 65536} (gm_slot_tensor = %p_slot_desc : !pto.tensor_view<128x256xf16>) + %pv_slot_desc = pto.make_tensor_view %22, shape = [%c128, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view<128x128xf32> + pto.aic_initialize_pipe{id = 27, dir_mask = 1, slot_size = 65536} (gm_slot_tensor = %pv_slot_desc : !pto.tensor_view<128x128xf32>) + %c0_i64 = arith.constant 0 : i64 + %c0_i64_4 = arith.constant 0 : i64 + %29 = pto.alloc_tile addr = %c0_i64_4 : !pto.tile_buf + %c0_i64_5 = arith.constant 0 : i64 + %30 = pto.alloc_tile addr = %c0_i64_5 : !pto.tile_buf + %c32768_i64 = arith.constant 32768 : i64 + %31 = pto.alloc_tile addr = %c32768_i64 : !pto.tile_buf + %c65536_i64 = arith.constant 65536 : i64 + %32 = pto.alloc_tile addr = %c65536_i64 : !pto.tile_buf + %33 
= pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %c0_i64_6 = arith.constant 0 : i64 + %34 = pto.alloc_tile addr = %c0_i64_6 : !pto.tile_buf + %c98304_i64 = arith.constant 98304 : i64 + %35 = pto.alloc_tile addr = %c98304_i64 : !pto.tile_buf + %c32768_i64_7 = arith.constant 32768 : i64 + %36 = pto.alloc_tile addr = %c32768_i64_7 : !pto.tile_buf + %c229376_i64 = arith.constant 229376 : i64 + %37 = pto.alloc_tile addr = %c229376_i64 : !pto.tile_buf + %38 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %c131072_i64 = arith.constant 131072 : i64 + %39 = pto.alloc_tile addr = %c65536_i64 : !pto.tile_buf + %c2048 = arith.constant 128 : index + %40 = pto.make_tensor_view %arg1, shape = [%c2048, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view + %41 = pto.make_tensor_view %arg2, shape = [%c128_0, %c4096], strides = [%c1, %c128_0] : !pto.tensor_view + %42 = pto.make_tensor_view %arg3, shape = [%c4096, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view + scf.for %arg4 = %14 to %18 step %c1 { + %43 = arith.muli %arg4, %c128 : index + %44 = pto.partition_view %40, offsets = [%43, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%44 : !pto.partition_tensor_view<128x128xf16>) outs(%29 : !pto.tile_buf) + pto.tmov ins(%29 : !pto.tile_buf) outs(%30 : !pto.tile_buf) + %c0_8 = arith.constant 0 : index + %c0_9 = arith.constant 0 : index + %45 = arith.addi %c0_8, %c0_9 : index + %46 = pto.partition_view %41, offsets = [%c0, %45], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%46 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_10 = arith.constant 0 : index + %47 = pto.subview %34[%c0, %c0_10] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%47 : !pto.tile_buf) + %c128_11 = arith.constant 128 : index + %48 = 
arith.addi %c0_8, %c128_11 : index + %49 = pto.partition_view %41, offsets = [%c0, %48], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%49 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_12 = arith.constant 128 : index + %50 = pto.subview %34[%c0, %c128_12] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%50 : !pto.tile_buf) + %qk_push_0 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_0 = pto.partition_view %qk_push_0, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_0 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_0 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %c256_13 = arith.constant 256 : index + %c0_14 = arith.constant 0 : index + %51 = arith.addi %c256_13, %c0_14 : index + %52 = pto.partition_view %41, offsets = [%c0, %51], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%52 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_15 = arith.constant 0 : index + %53 = pto.subview %34[%c0, %c0_15] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%53 : !pto.tile_buf) + %c128_16 = arith.constant 128 : index + %54 = arith.addi %c256_13, %c128_16 : index + %55 = pto.partition_view %41, offsets = [%c0, %54], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%55 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_17 
= arith.constant 128 : index + %56 = pto.subview %34[%c0, %c128_17] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%56 : !pto.tile_buf) + %qk_push_1 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_1 = pto.partition_view %qk_push_1, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_1 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_1 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %57 = pto.partition_view %42, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%57 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + %c2 = arith.constant 2 : index + %c7 = arith.constant 7 : index + scf.for %arg5 = %c0 to %c7 step %c1 { + %61 = arith.muli %arg5, %c2 : index + %c2_18 = arith.constant 2 : index + %62 = arith.addi %61, %c2_18 : index + %63 = arith.muli %62, %c256 : index + %p_pop_0 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_pop_part_0 = pto.partition_view %p_pop_0, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + %pv0_v_base = arith.muli %61, %c256 : index + %pv0_v_part_0 = pto.partition_view %42, offsets = [%pv0_v_base, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv0_v_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) 
+ %p_pop_part_0_hi = pto.partition_view %p_pop_0, offsets = [%c0, %c128], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_0_hi : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv(%p_pop_0 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %pv0_v_hi = arith.addi %pv0_v_base, %c128 : index + %pv0_v_part_1 = pto.partition_view %42, offsets = [%pv0_v_hi, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv0_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_0 = pto.talloc_to_aiv {id = 27, split = 1} -> !pto.tensor_view<128x128xf32> + %pv_push_part_0 = pto.partition_view %pv_push_0, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_0 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_0 : !pto.tensor_view<128x128xf32>) {id = 27, split = 1} + %c0_19 = arith.constant 0 : index + %68 = arith.addi %63, %c0_19 : index + %69 = pto.partition_view %41, offsets = [%c0, %68], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%69 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_20 = arith.constant 0 : index + %70 = pto.subview %34[%c0, %c0_20] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%70 : !pto.tile_buf) + %c128_21 = arith.constant 128 : index + %71 = arith.addi %63, %c128_21 : 
index + %72 = pto.partition_view %41, offsets = [%c0, %71], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%72 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_22 = arith.constant 128 : index + %73 = pto.subview %34[%c0, %c128_22] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%73 : !pto.tile_buf) + %qk_push_2 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_2 = pto.partition_view %qk_push_2, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_2 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_2 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %74 = arith.muli %arg5, %c2 : index + %75 = arith.addi %74, %c1 : index + %c2_23 = arith.constant 2 : index + %76 = arith.addi %75, %c2_23 : index + %77 = arith.muli %76, %c256 : index + %p_pop_1 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_pop_part_1 = pto.partition_view %p_pop_1, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + %pv1_v_base = arith.muli %75, %c256 : index + %pv1_v_part_0 = pto.partition_view %42, offsets = [%pv1_v_base, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv1_v_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : 
!pto.tile_buf) + %p_pop_part_1_hi = pto.partition_view %p_pop_1, offsets = [%c0, %c128], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_1_hi : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv(%p_pop_1 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %pv1_v_hi = arith.addi %pv1_v_base, %c128 : index + %pv1_v_part_1 = pto.partition_view %42, offsets = [%pv1_v_hi, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv1_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_1 = pto.talloc_to_aiv {id = 27, split = 1} -> !pto.tensor_view<128x128xf32> + %pv_push_part_1 = pto.partition_view %pv_push_1, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_1 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_1 : !pto.tensor_view<128x128xf32>) {id = 27, split = 1} + %c0_24 = arith.constant 0 : index + %82 = arith.addi %77, %c0_24 : index + %83 = pto.partition_view %41, offsets = [%c0, %82], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%83 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c0_25 = arith.constant 0 : index + %84 = pto.subview %34[%c0, %c0_25] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%84 : !pto.tile_buf) + %c128_26 = arith.constant 128 : index + %85 = arith.addi 
%77, %c128_26 : index + %86 = pto.partition_view %41, offsets = [%c0, %85], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%86 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) + pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %c128_27 = arith.constant 128 : index + %87 = pto.subview %34[%c0, %c128_27] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%87 : !pto.tile_buf) + %qk_push_3 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_3 = pto.partition_view %qk_push_3, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_3 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_3 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + } + %p_pop_2 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_pop_part_2 = pto.partition_view %p_pop_2, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_2 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + %c3584 = arith.constant 3584 : index + %pv2_v_part_0 = pto.partition_view %42, offsets = [%c3584, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv2_v_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %p_pop_part_2_hi = pto.partition_view %p_pop_2, offsets = [%c0, %c128], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> 
!pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_2_hi : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv(%p_pop_2 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c3712 = arith.constant 3712 : index + %pv2_v_part_1 = pto.partition_view %42, offsets = [%c3712, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv2_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_2 = pto.talloc_to_aiv {id = 27, split = 1} -> !pto.tensor_view<128x128xf32> + %pv_push_part_2 = pto.partition_view %pv_push_2, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_2 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_2 : !pto.tensor_view<128x128xf32>) {id = 27, split = 1} + %p_pop_3 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_pop_part_3 = pto.partition_view %p_pop_3, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_3 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + %c3840 = arith.constant 3840 : index + %pv3_v_part_0 = pto.partition_view %42, offsets = [%c3840, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv3_v_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : 
!pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %p_pop_part_3_hi = pto.partition_view %p_pop_3, offsets = [%c0, %c128], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_part_3_hi : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv(%p_pop_3 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c3968 = arith.constant 3968 : index + %pv3_v_part_1 = pto.partition_view %42, offsets = [%c3968, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv3_v_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_3 = pto.talloc_to_aiv {id = 27, split = 1} -> !pto.tensor_view<128x128xf32> + %pv_push_part_3 = pto.partition_view %pv_push_3, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_3 : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_3 : !pto.tensor_view<128x128xf32>) {id = 27, split = 1} + } + return + } + func.func @vector_kernel(%qk_fifo: !pto.ptr, %arg1: !pto.ptr, %p_fifo: !pto.ptr, %pv_fifo: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %c128_0 = arith.constant 128 : index + %c16 = arith.constant 1 : index + %c16_1 = arith.constant 1 : index + %0 = pto.get_block_num + %1 = arith.index_cast %0 : i64 to index + %2 = pto.get_block_idx + %3 = arith.index_cast %2 : 
i64 to index + %4 = arith.divsi %c16_1, %1 : index + %5 = arith.remsi %c16_1, %1 : index + %6 = arith.addi %4, %c1 : index + %7 = arith.muli %3, %6 : index + %8 = arith.addi %4, %c1 : index + %9 = arith.muli %5, %8 : index + %10 = arith.subi %3, %5 : index + %11 = arith.muli %10, %4 : index + %12 = arith.addi %9, %11 : index + %13 = arith.cmpi slt, %3, %5 : index + %14 = arith.select %13, %7, %12 : index + %15 = arith.cmpi slt, %3, %5 : index + %16 = arith.addi %4, %c1 : index + %17 = arith.select %15, %16, %4 : index + %18 = arith.addi %14, %17 : index + %c262144 = arith.constant 262144 : index + %19 = arith.muli %3, %c262144 : index + %21 = pto.addptr %qk_fifo, %19 : -> + %p_block = pto.addptr %p_fifo, %19 : -> + %c131072 = arith.constant 131072 : index + %pv_block_off = arith.muli %3, %c131072 : index + %22 = pto.addptr %pv_fifo, %pv_block_off : -> + %qk_slot_desc = pto.make_tensor_view %21, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf32> + pto.aiv_initialize_pipe{id = 25, dir_mask = 1, slot_size = 131072} (gm_slot_tensor = %qk_slot_desc : !pto.tensor_view<128x256xf32>) + %p_slot_desc = pto.make_tensor_view %p_block, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf16> + pto.aiv_initialize_pipe{id = 30, dir_mask = 2, slot_size = 65536} (gm_slot_tensor = %p_slot_desc : !pto.tensor_view<128x256xf16>) + %pv_slot_desc = pto.make_tensor_view %22, shape = [%c64, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view<64x128xf32> + pto.aiv_initialize_pipe{id = 27, dir_mask = 1, slot_size = 65536} (gm_slot_tensor = %pv_slot_desc : !pto.tensor_view<64x128xf32>) + %29 = pto.get_subblock_idx + %30 = arith.index_cast %29 : i64 to index + %31 = arith.muli %30, %c64 : index + %row_slice_1 = arith.addi %31, %c32 : index + %c196608_i64 = arith.constant 196608 : i64 + %32 = pto.alloc_tile addr = %c196608_i64 : !pto.tile_buf + %c262144_i64 = arith.constant 262144 : i64 + %33 = pto.alloc_tile addr = %c262144_i64 : 
!pto.tile_buf + %c327680_i64 = arith.constant 327680 : i64 + %34 = pto.alloc_tile addr = %c327680_i64 : !pto.tile_buf + %c360448_i64 = arith.constant 360448 : i64 + %35 = pto.alloc_tile addr = %c360448_i64 : !pto.tile_buf + %c393216_i64 = arith.constant 393216 : i64 + %c393344_i64 = arith.constant 393344 : i64 + %36 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %c393472_i64 = arith.constant 393472 : i64 + %c393600_i64 = arith.constant 393600 : i64 + %37 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %c393728_i64 = arith.constant 393728 : i64 + %c393856_i64 = arith.constant 393856 : i64 + %38 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %c393984_i64 = arith.constant 393984 : i64 + %c394112_i64 = arith.constant 394112 : i64 + %39 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %c394240_i64 = arith.constant 394240 : i64 + %c394368_i64 = arith.constant 394368 : i64 + %40 = pto.alloc_tile addr = %c394240_i64 : !pto.tile_buf + %c394496_i64 = arith.constant 394496 : i64 + %c394624_i64 = arith.constant 394624 : i64 + %41 = pto.alloc_tile addr = %c394496_i64 : !pto.tile_buf + %cst = arith.constant 0.0883883461 : f32 + %cst_3 = arith.constant 1.000000e+00 : f32 + %c2048 = arith.constant 128 : index + %42 = pto.make_tensor_view %arg1, shape = [%c2048, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view + scf.for %arg2 = %14 to %18 step %c1 { + %43 = arith.muli %arg2, %c128 : index + %c394752_i64 = arith.constant 394752 : i64 + %44 = pto.alloc_tile addr = %c394752_i64 : !pto.tile_buf + %qk_pop_0 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_0 = pto.partition_view %qk_pop_0, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_0 : !pto.partition_tensor_view<32x256xf32>) outs(%44 : !pto.tile_buf) + pto.tmuls ins(%44, %cst : !pto.tile_buf, f32) outs(%44 : !pto.tile_buf) + %r36_0 = pto.alloc_tile addr = 
%c393216_i64 : !pto.tile_buf + %r37_0 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_0 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_0 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r40_0 = pto.alloc_tile addr = %c394240_i64 : !pto.tile_buf + pto.trowmax ins(%44, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_0 : !pto.tile_buf) + %45 = pto.treshape %r37_0 : !pto.tile_buf -> !pto.tile_buf + %46 = pto.treshape %r36_0 : !pto.tile_buf -> !pto.tile_buf + %47 = pto.treshape %r40_0 : !pto.tile_buf -> !pto.tile_buf + %48 = pto.treshape %r38_0 : !pto.tile_buf -> !pto.tile_buf + %49 = pto.treshape %r39_0 : !pto.tile_buf -> !pto.tile_buf + pto.trowexpandsub ins(%44, %r37_0 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmuls ins(%45, %cst_3 : !pto.tile_buf, f32) outs(%46 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r38_0 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_0 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_0 = pto.partition_view %p_push_0, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_0 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_0_r1 = pto.partition_view %qk_pop_0, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_0_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%44 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_0 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%44, %cst : !pto.tile_buf, f32) outs(%44 : !pto.tile_buf) + %r36_0_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_0_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + 
%r38_0_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_0_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r40_0_r1 = pto.alloc_tile addr = %c394368_i64 : !pto.tile_buf + pto.trowmax ins(%44, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_0_r1 : !pto.tile_buf) + %qk0_r1_max = pto.treshape %r37_0_r1 : !pto.tile_buf -> !pto.tile_buf + %qk0_r1_gmax = pto.treshape %r36_0_r1 : !pto.tile_buf -> !pto.tile_buf + %qk0_r1_tmp = pto.treshape %r40_0_r1 : !pto.tile_buf -> !pto.tile_buf + %qk0_r1_lsum = pto.treshape %r38_0_r1 : !pto.tile_buf -> !pto.tile_buf + %qk0_r1_gsum = pto.treshape %r39_0_r1 : !pto.tile_buf -> !pto.tile_buf + pto.trowexpandsub ins(%44, %r37_0_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmuls ins(%qk0_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk0_r1_gmax : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r38_0_r1 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_0_r1 = pto.partition_view %p_push_0, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_0_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_0 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c394752_i64_4 = arith.constant 394752 : i64 + %50 = pto.alloc_tile addr = %c394752_i64_4 : !pto.tile_buf + %qk_pop_1 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_1 = pto.partition_view %qk_pop_1, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_1 : !pto.partition_tensor_view<32x256xf32>) outs(%50 : !pto.tile_buf) + pto.tmuls ins(%50, %cst : !pto.tile_buf, f32) outs(%50 : !pto.tile_buf) + %r36_1 = pto.alloc_tile addr = 
%c393216_i64 : !pto.tile_buf + %r37_1 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_1 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_1 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r41_1 = pto.alloc_tile addr = %c394496_i64 : !pto.tile_buf + pto.trowmax ins(%50, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_1 : !pto.tile_buf) + %51 = pto.treshape %r37_1 : !pto.tile_buf -> !pto.tile_buf + %52 = pto.treshape %r36_1 : !pto.tile_buf -> !pto.tile_buf + %53 = pto.treshape %r41_1 : !pto.tile_buf -> !pto.tile_buf + %54 = pto.treshape %r38_1 : !pto.tile_buf -> !pto.tile_buf + %55 = pto.treshape %r39_1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%51, %52 : !pto.tile_buf, !pto.tile_buf) outs(%51 : !pto.tile_buf) + pto.tsub ins(%52, %51 : !pto.tile_buf, !pto.tile_buf) outs(%53 : !pto.tile_buf) + pto.tmuls ins(%51, %cst_3 : !pto.tile_buf, f32) outs(%52 : !pto.tile_buf) + pto.trowexpandsub ins(%50, %r37_1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%53 : !pto.tile_buf) outs(%53 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%54, %53 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_1 : !pto.tile_buf) + pto.tadd ins(%54, %55 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_1 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_1 = pto.partition_view %p_push_1, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_1 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_1_r1 = pto.partition_view %qk_pop_1, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + 
pto.tload ins(%qk_pop_part_1_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%50 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_1 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%50, %cst : !pto.tile_buf, f32) outs(%50 : !pto.tile_buf) + %r36_1_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_1_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_1_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_1_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r41_1_r1 = pto.alloc_tile addr = %c394624_i64 : !pto.tile_buf + pto.trowmax ins(%50, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_1_r1 : !pto.tile_buf) + %qk1_r1_max = pto.treshape %r37_1_r1 : !pto.tile_buf -> !pto.tile_buf + %qk1_r1_gmax = pto.treshape %r36_1_r1 : !pto.tile_buf -> !pto.tile_buf + %qk1_r1_diff = pto.treshape %r41_1_r1 : !pto.tile_buf -> !pto.tile_buf + %qk1_r1_lsum = pto.treshape %r38_1_r1 : !pto.tile_buf -> !pto.tile_buf + %qk1_r1_gsum = pto.treshape %r39_1_r1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%qk1_r1_max, %qk1_r1_gmax : !pto.tile_buf, !pto.tile_buf) outs(%qk1_r1_max : !pto.tile_buf) + pto.tsub ins(%qk1_r1_gmax, %qk1_r1_max : !pto.tile_buf, !pto.tile_buf) outs(%qk1_r1_diff : !pto.tile_buf) + pto.tmuls ins(%qk1_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk1_r1_gmax : !pto.tile_buf) + pto.trowexpandsub ins(%50, %r37_1_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%qk1_r1_diff : !pto.tile_buf) outs(%qk1_r1_diff : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%qk1_r1_lsum, %qk1_r1_diff : !pto.tile_buf, !pto.tile_buf) outs(%qk1_r1_lsum : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_1_r1 : !pto.tile_buf) + pto.tadd ins(%qk1_r1_lsum, %qk1_r1_gsum : !pto.tile_buf, !pto.tile_buf) outs(%qk1_r1_lsum : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_1_r1 = 
pto.partition_view %p_push_1, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_1_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_1 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c394752_i64_5 = arith.constant 394752 : i64 + %56 = pto.alloc_tile addr = %c394752_i64_5 : !pto.tile_buf + %pv_pop_0 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_0 = pto.partition_view %pv_pop_0, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_0 : !pto.partition_tensor_view<64x128xf32>) outs(%56 : !pto.tile_buf) + pto.tmov ins(%56 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_0 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + %c394752_i64_6 = arith.constant 394752 : i64 + %57 = pto.alloc_tile addr = %c394752_i64_6 : !pto.tile_buf + %qk_pop_2 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_2 = pto.partition_view %qk_pop_2, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_2 : !pto.partition_tensor_view<32x256xf32>) outs(%57 : !pto.tile_buf) + pto.tmuls ins(%57, %cst : !pto.tile_buf, f32) outs(%57 : !pto.tile_buf) + %r36_2 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %r37_2 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_2 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_2 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r40_2 = pto.alloc_tile addr = %c394240_i64 : !pto.tile_buf + pto.trowmax ins(%57, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_2 : !pto.tile_buf) + %58 = pto.treshape %r37_2 : !pto.tile_buf -> !pto.tile_buf + %59 = pto.treshape %r36_2 : !pto.tile_buf -> 
!pto.tile_buf + %60 = pto.treshape %r40_2 : !pto.tile_buf -> !pto.tile_buf + %61 = pto.treshape %r38_2 : !pto.tile_buf -> !pto.tile_buf + %62 = pto.treshape %r39_2 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%58, %59 : !pto.tile_buf, !pto.tile_buf) outs(%58 : !pto.tile_buf) + pto.tsub ins(%59, %58 : !pto.tile_buf, !pto.tile_buf) outs(%60 : !pto.tile_buf) + pto.tmuls ins(%58, %cst_3 : !pto.tile_buf, f32) outs(%59 : !pto.tile_buf) + pto.trowexpandsub ins(%57, %r37_2 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%60 : !pto.tile_buf) outs(%60 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%61, %60 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_2 : !pto.tile_buf) + pto.tadd ins(%61, %62 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_2 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_2 = pto.partition_view %p_push_2, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_2 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_2_r1 = pto.partition_view %qk_pop_2, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_2_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%57 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_2 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%57, %cst : !pto.tile_buf, f32) outs(%57 : !pto.tile_buf) + %r36_2_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_2_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_2_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_2_r1 = 
pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r40_2_r1 = pto.alloc_tile addr = %c394368_i64 : !pto.tile_buf + pto.trowmax ins(%57, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_2_r1 : !pto.tile_buf) + %qk2_r1_max = pto.treshape %r37_2_r1 : !pto.tile_buf -> !pto.tile_buf + %qk2_r1_gmax = pto.treshape %r36_2_r1 : !pto.tile_buf -> !pto.tile_buf + %qk2_r1_diff = pto.treshape %r40_2_r1 : !pto.tile_buf -> !pto.tile_buf + %qk2_r1_lsum = pto.treshape %r38_2_r1 : !pto.tile_buf -> !pto.tile_buf + %qk2_r1_gsum = pto.treshape %r39_2_r1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%qk2_r1_max, %qk2_r1_gmax : !pto.tile_buf, !pto.tile_buf) outs(%qk2_r1_max : !pto.tile_buf) + pto.tsub ins(%qk2_r1_gmax, %qk2_r1_max : !pto.tile_buf, !pto.tile_buf) outs(%qk2_r1_diff : !pto.tile_buf) + pto.tmuls ins(%qk2_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk2_r1_gmax : !pto.tile_buf) + pto.trowexpandsub ins(%57, %r37_2_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%qk2_r1_diff : !pto.tile_buf) outs(%qk2_r1_diff : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%qk2_r1_lsum, %qk2_r1_diff : !pto.tile_buf, !pto.tile_buf) outs(%qk2_r1_lsum : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_2_r1 : !pto.tile_buf) + pto.tadd ins(%qk2_r1_lsum, %qk2_r1_gsum : !pto.tile_buf, !pto.tile_buf) outs(%qk2_r1_lsum : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_2_r1 = pto.partition_view %p_push_2, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_2_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_2 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c394752_i64_7 = arith.constant 394752 : i64 + %63 = pto.alloc_tile addr = %c394752_i64_7 : !pto.tile_buf + %pv_pop_1 = 
pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_1 = pto.partition_view %pv_pop_1, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_1 : !pto.partition_tensor_view<64x128xf32>) outs(%63 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %63 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_1 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + %c394752_i64_8 = arith.constant 394752 : i64 + %64 = pto.alloc_tile addr = %c394752_i64_8 : !pto.tile_buf + %qk_pop_3 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_3 = pto.partition_view %qk_pop_3, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_3 : !pto.partition_tensor_view<32x256xf32>) outs(%64 : !pto.tile_buf) + pto.tmuls ins(%64, %cst : !pto.tile_buf, f32) outs(%64 : !pto.tile_buf) + %r36_3 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %r37_3 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_3 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_3 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r41_3 = pto.alloc_tile addr = %c394496_i64 : !pto.tile_buf + pto.trowmax ins(%64, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_3 : !pto.tile_buf) + %65 = pto.treshape %r37_3 : !pto.tile_buf -> !pto.tile_buf + %66 = pto.treshape %r36_3 : !pto.tile_buf -> !pto.tile_buf + %67 = pto.treshape %r41_3 : !pto.tile_buf -> !pto.tile_buf + %68 = pto.treshape %r38_3 : !pto.tile_buf -> !pto.tile_buf + %69 = pto.treshape %r39_3 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%65, %66 : !pto.tile_buf, !pto.tile_buf) outs(%65 : !pto.tile_buf) + pto.tsub ins(%66, %65 : !pto.tile_buf, !pto.tile_buf) outs(%67 : !pto.tile_buf) + 
pto.tmuls ins(%65, %cst_3 : !pto.tile_buf, f32) outs(%66 : !pto.tile_buf) + pto.trowexpandsub ins(%64, %r37_3 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%67 : !pto.tile_buf) outs(%67 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%68, %67 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_3 : !pto.tile_buf) + pto.tadd ins(%68, %69 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_3 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_3 = pto.partition_view %p_push_3, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_3 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_3_r1 = pto.partition_view %qk_pop_3, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_3_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%64 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_3 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%64, %cst : !pto.tile_buf, f32) outs(%64 : !pto.tile_buf) + %r36_3_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_3_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_3_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_3_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r41_3_r1 = pto.alloc_tile addr = %c394624_i64 : !pto.tile_buf + pto.trowmax ins(%64, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_3_r1 : !pto.tile_buf) + %qk3_r1_max = pto.treshape %r37_3_r1 : !pto.tile_buf -> !pto.tile_buf + %qk3_r1_gmax = pto.treshape %r36_3_r1 : !pto.tile_buf -> !pto.tile_buf + %qk3_r1_diff = 
pto.treshape %r41_3_r1 : !pto.tile_buf -> !pto.tile_buf + %qk3_r1_lsum = pto.treshape %r38_3_r1 : !pto.tile_buf -> !pto.tile_buf + %qk3_r1_gsum = pto.treshape %r39_3_r1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%qk3_r1_max, %qk3_r1_gmax : !pto.tile_buf, !pto.tile_buf) outs(%qk3_r1_max : !pto.tile_buf) + pto.tsub ins(%qk3_r1_gmax, %qk3_r1_max : !pto.tile_buf, !pto.tile_buf) outs(%qk3_r1_diff : !pto.tile_buf) + pto.tmuls ins(%qk3_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk3_r1_gmax : !pto.tile_buf) + pto.trowexpandsub ins(%64, %r37_3_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%qk3_r1_diff : !pto.tile_buf) outs(%qk3_r1_diff : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%qk3_r1_lsum, %qk3_r1_diff : !pto.tile_buf, !pto.tile_buf) outs(%qk3_r1_lsum : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_3_r1 : !pto.tile_buf) + pto.tadd ins(%qk3_r1_lsum, %qk3_r1_gsum : !pto.tile_buf, !pto.tile_buf) outs(%qk3_r1_lsum : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_3_r1 = pto.partition_view %p_push_3, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_3_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_3 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c7 = arith.constant 7 : index + scf.for %arg3 = %c1 to %c7 step %c1 { + %c394752_i64_11 = arith.constant 394752 : i64 + %74 = pto.alloc_tile addr = %c394752_i64_11 : !pto.tile_buf + %pv_pop_2 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_2 = pto.partition_view %pv_pop_2, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_2 : 
!pto.partition_tensor_view<64x128xf32>) outs(%74 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %40 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %74 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_2 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + %c394752_i64_12 = arith.constant 394752 : i64 + %75 = pto.alloc_tile addr = %c394752_i64_12 : !pto.tile_buf + %qk_pop_4 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_4 = pto.partition_view %qk_pop_4, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_4 : !pto.partition_tensor_view<32x256xf32>) outs(%75 : !pto.tile_buf) + pto.tmuls ins(%75, %cst : !pto.tile_buf, f32) outs(%75 : !pto.tile_buf) + %r36_4 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %r37_4 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_4 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_4 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r40_4 = pto.alloc_tile addr = %c394240_i64 : !pto.tile_buf + pto.trowmax ins(%75, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_4 : !pto.tile_buf) + %76 = pto.treshape %r37_4 : !pto.tile_buf -> !pto.tile_buf + %77 = pto.treshape %r36_4 : !pto.tile_buf -> !pto.tile_buf + %78 = pto.treshape %r40_4 : !pto.tile_buf -> !pto.tile_buf + %79 = pto.treshape %r38_4 : !pto.tile_buf -> !pto.tile_buf + %80 = pto.treshape %r39_4 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%76, %77 : !pto.tile_buf, !pto.tile_buf) outs(%76 : !pto.tile_buf) + pto.tsub ins(%77, %76 : !pto.tile_buf, !pto.tile_buf) outs(%78 : !pto.tile_buf) + pto.tmuls ins(%76, %cst_3 : !pto.tile_buf, f32) outs(%77 : !pto.tile_buf) + pto.trowexpandsub ins(%75, %r37_4 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%78 : !pto.tile_buf) outs(%78 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) 
outs(%33 : !pto.tile_buf) + pto.tmul ins(%79, %78 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_4 : !pto.tile_buf) + pto.tadd ins(%79, %80 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_4 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_4 = pto.partition_view %p_push_4, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_4 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_4_r1 = pto.partition_view %qk_pop_4, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_4_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%75 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_4 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%75, %cst : !pto.tile_buf, f32) outs(%75 : !pto.tile_buf) + %r36_4_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_4_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_4_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_4_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r40_4_r1 = pto.alloc_tile addr = %c394368_i64 : !pto.tile_buf + pto.trowmax ins(%75, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_4_r1 : !pto.tile_buf) + %qk4_r1_max = pto.treshape %r37_4_r1 : !pto.tile_buf -> !pto.tile_buf + %qk4_r1_gmax = pto.treshape %r36_4_r1 : !pto.tile_buf -> !pto.tile_buf + %qk4_r1_diff = pto.treshape %r40_4_r1 : !pto.tile_buf -> !pto.tile_buf + %qk4_r1_lsum = pto.treshape %r38_4_r1 : !pto.tile_buf -> !pto.tile_buf + %qk4_r1_gsum = pto.treshape %r39_4_r1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%qk4_r1_max, %qk4_r1_gmax : !pto.tile_buf, 
!pto.tile_buf) outs(%qk4_r1_max : !pto.tile_buf) + pto.tsub ins(%qk4_r1_gmax, %qk4_r1_max : !pto.tile_buf, !pto.tile_buf) outs(%qk4_r1_diff : !pto.tile_buf) + pto.tmuls ins(%qk4_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk4_r1_gmax : !pto.tile_buf) + pto.trowexpandsub ins(%75, %r37_4_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%qk4_r1_diff : !pto.tile_buf) outs(%qk4_r1_diff : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%qk4_r1_lsum, %qk4_r1_diff : !pto.tile_buf, !pto.tile_buf) outs(%qk4_r1_lsum : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_4_r1 : !pto.tile_buf) + pto.tadd ins(%qk4_r1_lsum, %qk4_r1_gsum : !pto.tile_buf, !pto.tile_buf) outs(%qk4_r1_lsum : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_4_r1 = pto.partition_view %p_push_4, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_4_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_4 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c394752_i64_13 = arith.constant 394752 : i64 + %81 = pto.alloc_tile addr = %c394752_i64_13 : !pto.tile_buf + %pv_pop_3 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_3 = pto.partition_view %pv_pop_3, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_3 : !pto.partition_tensor_view<64x128xf32>) outs(%81 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %81 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_3 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + %c394752_i64_14 = arith.constant 
394752 : i64 + %82 = pto.alloc_tile addr = %c394752_i64_14 : !pto.tile_buf + %qk_pop_5 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_pop_part_5 = pto.partition_view %qk_pop_5, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_5 : !pto.partition_tensor_view<32x256xf32>) outs(%82 : !pto.tile_buf) + pto.tmuls ins(%82, %cst : !pto.tile_buf, f32) outs(%82 : !pto.tile_buf) + %r36_5 = pto.alloc_tile addr = %c393216_i64 : !pto.tile_buf + %r37_5 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf + %r38_5 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf + %r39_5 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf + %r41_5 = pto.alloc_tile addr = %c394496_i64 : !pto.tile_buf + pto.trowmax ins(%82, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_5 : !pto.tile_buf) + %83 = pto.treshape %r37_5 : !pto.tile_buf -> !pto.tile_buf + %84 = pto.treshape %r36_5 : !pto.tile_buf -> !pto.tile_buf + %85 = pto.treshape %r41_5 : !pto.tile_buf -> !pto.tile_buf + %86 = pto.treshape %r38_5 : !pto.tile_buf -> !pto.tile_buf + %87 = pto.treshape %r39_5 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%83, %84 : !pto.tile_buf, !pto.tile_buf) outs(%83 : !pto.tile_buf) + pto.tsub ins(%84, %83 : !pto.tile_buf, !pto.tile_buf) outs(%85 : !pto.tile_buf) + pto.tmuls ins(%83, %cst_3 : !pto.tile_buf, f32) outs(%84 : !pto.tile_buf) + pto.trowexpandsub ins(%82, %r37_5 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%85 : !pto.tile_buf) outs(%85 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%86, %85 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_5 : !pto.tile_buf) + pto.tadd ins(%86, %87 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + 
%p_push_5 = pto.talloc_to_aic {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_push_part_5 = pto.partition_view %p_push_5, offsets = [%31, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_5 : !pto.partition_tensor_view<32x256xf16>) + %qk_pop_part_5_r1 = pto.partition_view %qk_pop_5, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<32x256xf32> + pto.tload ins(%qk_pop_part_5_r1 : !pto.partition_tensor_view<32x256xf32>) outs(%82 : !pto.tile_buf) + pto.tfree_from_aic(%qk_pop_5 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + pto.tmuls ins(%82, %cst : !pto.tile_buf, f32) outs(%82 : !pto.tile_buf) + %r36_5_r1 = pto.alloc_tile addr = %c393344_i64 : !pto.tile_buf + %r37_5_r1 = pto.alloc_tile addr = %c393600_i64 : !pto.tile_buf + %r38_5_r1 = pto.alloc_tile addr = %c393856_i64 : !pto.tile_buf + %r39_5_r1 = pto.alloc_tile addr = %c394112_i64 : !pto.tile_buf + %r41_5_r1 = pto.alloc_tile addr = %c394624_i64 : !pto.tile_buf + pto.trowmax ins(%82, %33 : !pto.tile_buf, !pto.tile_buf) outs(%r37_5_r1 : !pto.tile_buf) + %qk5_r1_max = pto.treshape %r37_5_r1 : !pto.tile_buf -> !pto.tile_buf + %qk5_r1_gmax = pto.treshape %r36_5_r1 : !pto.tile_buf -> !pto.tile_buf + %qk5_r1_diff = pto.treshape %r41_5_r1 : !pto.tile_buf -> !pto.tile_buf + %qk5_r1_lsum = pto.treshape %r38_5_r1 : !pto.tile_buf -> !pto.tile_buf + %qk5_r1_gsum = pto.treshape %r39_5_r1 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%qk5_r1_max, %qk5_r1_gmax : !pto.tile_buf, !pto.tile_buf) outs(%qk5_r1_max : !pto.tile_buf) + pto.tsub ins(%qk5_r1_gmax, %qk5_r1_max : !pto.tile_buf, !pto.tile_buf) outs(%qk5_r1_diff : !pto.tile_buf) + pto.tmuls ins(%qk5_r1_max, %cst_3 : !pto.tile_buf, f32) outs(%qk5_r1_gmax : !pto.tile_buf) + pto.trowexpandsub ins(%82, %r37_5_r1 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp 
ins(%qk5_r1_diff : !pto.tile_buf) outs(%qk5_r1_diff : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%qk5_r1_lsum, %qk5_r1_diff : !pto.tile_buf, !pto.tile_buf) outs(%qk5_r1_lsum : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%r39_5_r1 : !pto.tile_buf) + pto.tadd ins(%qk5_r1_lsum, %qk5_r1_gsum : !pto.tile_buf, !pto.tile_buf) outs(%qk5_r1_lsum : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + %p_push_part_5_r1 = pto.partition_view %p_push_5, offsets = [%row_slice_1, %c0], sizes = [%c32, %c256] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<32x256xf16> + pto.tstore ins(%34 : !pto.tile_buf) outs(%p_push_part_5_r1 : !pto.partition_tensor_view<32x256xf16>) + pto.tpush_to_aic(%p_push_5 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + } + %c394752_i64_9 = arith.constant 394752 : i64 + %70 = pto.alloc_tile addr = %c394752_i64_9 : !pto.tile_buf + %pv_pop_4 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_4 = pto.partition_view %pv_pop_4, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_4 : !pto.partition_tensor_view<64x128xf32>) outs(%70 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %40 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %70 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_4 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + %c394752_i64_10 = arith.constant 394752 : i64 + %71 = pto.alloc_tile addr = %c394752_i64_10 : !pto.tile_buf + %pv_pop_5 = pto.tpop_from_aic {id = 27, split = 1} -> !pto.tensor_view<64x128xf32> + %pv_pop_part_5 = pto.partition_view %pv_pop_5, offsets = [%c0, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<64x128xf32> -> !pto.partition_tensor_view<64x128xf32> + pto.tload ins(%pv_pop_part_5 : 
!pto.partition_tensor_view<64x128xf32>) outs(%71 : !pto.tile_buf) + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %71 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aic(%pv_pop_5 : !pto.tensor_view<64x128xf32>) {id = 27, split = 1} + pto.trowexpanddiv ins(%35, %38 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + %72 = arith.addi %43, %31 : index + %73 = pto.partition_view %42, offsets = [%72, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<64x128xf32> + pto.tstore ins(%35 : !pto.tile_buf) outs(%73 : !pto.partition_tensor_view<64x128xf32>) + } + return + } + func.func @call_both(%arg0: memref<256xi64>, %q: !pto.ptr, %k: !pto.ptr, %v: !pto.ptr, %p_fifo: !pto.ptr, %o_out: !pto.ptr, %qk_fifo: !pto.ptr, %pv_fifo: !pto.ptr) attributes {pto.entry} { + pto.set_ffts %arg0 : memref<256xi64> + call @cube_kernel(%qk_fifo, %q, %k, %v, %p_fifo, %pv_fifo) : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr) -> () + call @vector_kernel(%qk_fifo, %o_out, %p_fifo, %pv_fifo) : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr) -> () + return + } +} From 5100115c10e7a26922c78c5032352765afc4c30e Mon Sep 17 00:00:00 2001 From: zhangstevenunity <128771452+zhangstevenunity@users.noreply.github.com> Date: Thu, 30 Apr 2026 20:45:45 +0800 Subject: [PATCH 4/5] Delete test/lit/pto/fa.pto --- test/lit/pto/fa.pto | 501 -------------------------------------------- 1 file changed, 501 deletions(-) delete mode 100644 test/lit/pto/fa.pto diff --git a/test/lit/pto/fa.pto b/test/lit/pto/fa.pto deleted file mode 100644 index a4190459c..000000000 --- a/test/lit/pto/fa.pto +++ /dev/null @@ -1,501 +0,0 @@ -// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync %s >/dev/null - -module { - func.func @cube_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0 = arith.constant 0 : index - 
%c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c128_0 = arith.constant 128 : index - %c256 = arith.constant 256 : index - %c4096 = arith.constant 4096 : index - %c128_1 = arith.constant 128 : index - %c16 = arith.constant 16 : index - %c16_2 = arith.constant 16 : index - %0 = pto.get_block_num - %1 = arith.index_cast %0 : i64 to index - %2 = pto.get_block_idx - %3 = arith.index_cast %2 : i64 to index - %4 = arith.divsi %c16_2, %1 : index - %5 = arith.remsi %c16_2, %1 : index - %6 = arith.addi %4, %c1 : index - %7 = arith.muli %3, %6 : index - %8 = arith.addi %4, %c1 : index - %9 = arith.muli %5, %8 : index - %10 = arith.subi %3, %5 : index - %11 = arith.muli %10, %4 : index - %12 = arith.addi %9, %11 : index - %13 = arith.cmpi slt, %3, %5 : index - %14 = arith.select %13, %7, %12 : index - %15 = arith.cmpi slt, %3, %5 : index - %16 = arith.addi %4, %c1 : index - %17 = arith.select %15, %16, %4 : index - %18 = arith.addi %14, %17 : index - %c524288 = arith.constant 524288 : index - %19 = arith.muli %3, %c524288 : index - %20 = pto.addptr %arg0, %19 : -> - %c0_3 = arith.constant 0 : index - %21 = pto.addptr %20, %c0_3 : -> - %c262144 = arith.constant 262144 : index - %22 = pto.addptr %20, %c262144 : -> - %c393216 = arith.constant 393216 : index - %23 = pto.addptr %20, %c393216 : -> - %qk_slot_desc = pto.make_tensor_view %21, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf32> - pto.aic_initialize_pipe{id = 25, dir_mask = 1, slot_size = 131072} (gm_slot_tensor = %qk_slot_desc : !pto.tensor_view<128x256xf32>) - %pv_slot_desc = pto.make_tensor_view %22, shape = [%c128, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view<128x128xf32> - pto.aic_initialize_pipe{id = 27, dir_mask = 1, slot_size = 65536} (gm_slot_tensor = %pv_slot_desc : !pto.tensor_view<128x128xf32>) - %28 = pto.reserve_buffer{name = "fa_p_v2c_fifo", size = 524288, location = , auto = false, base = 393216} -> i32 - %c0_i32 = arith.constant 0 : i32 - 
pto.aic_initialize_pipe{id = 30, dir_mask = 2, slot_size = 65536, nosplit = false} (gm_slot_buffer = %23 : !pto.ptr, c2v_consumer_buf = %c0_i32 : i32, v2c_consumer_buf = %28 : i32) - %c0_i64 = arith.constant 0 : i64 - %c0_i64_4 = arith.constant 0 : i64 - %29 = pto.alloc_tile addr = %c0_i64_4 : !pto.tile_buf - %c0_i64_5 = arith.constant 0 : i64 - %30 = pto.alloc_tile addr = %c0_i64_5 : !pto.tile_buf - %c32768_i64 = arith.constant 32768 : i64 - %31 = pto.alloc_tile addr = %c32768_i64 : !pto.tile_buf - %c65536_i64 = arith.constant 65536 : i64 - %32 = pto.alloc_tile addr = %c65536_i64 : !pto.tile_buf - %33 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf - %c0_i64_6 = arith.constant 0 : i64 - %34 = pto.alloc_tile addr = %c0_i64_6 : !pto.tile_buf - %c98304_i64 = arith.constant 98304 : i64 - %35 = pto.alloc_tile addr = %c98304_i64 : !pto.tile_buf - %c32768_i64_7 = arith.constant 32768 : i64 - %36 = pto.alloc_tile addr = %c32768_i64_7 : !pto.tile_buf - %c163840_i64 = arith.constant 163840 : i64 - %37 = pto.alloc_tile addr = %c163840_i64 : !pto.tile_buf - %38 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf - %c131072_i64 = arith.constant 131072 : i64 - %39 = pto.alloc_tile addr = %c131072_i64 : !pto.tile_buf - %c2048 = arith.constant 2048 : index - %40 = pto.make_tensor_view %arg1, shape = [%c2048, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view - %41 = pto.make_tensor_view %arg2, shape = [%c128_0, %c4096], strides = [%c1, %c128_0] : !pto.tensor_view - %42 = pto.make_tensor_view %arg3, shape = [%c4096, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view - scf.for %arg4 = %14 to %18 step %c1 { - %43 = arith.muli %arg4, %c128 : index - %44 = pto.partition_view %40, offsets = [%43, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> - pto.tload ins(%44 : !pto.partition_tensor_view<128x128xf16>) outs(%29 : !pto.tile_buf) - pto.tmov ins(%29 : !pto.tile_buf) outs(%30 : !pto.tile_buf) - %c0_8 = arith.constant 0 : index - %c0_9 = 
arith.constant 0 : index - %45 = arith.addi %c0_8, %c0_9 : index - %46 = pto.partition_view %41, offsets = [%c0, %45], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> - pto.tload ins(%46 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) - pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - %c0_10 = arith.constant 0 : index - %47 = pto.subview %34[%c0, %c0_10] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf - pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%47 : !pto.tile_buf) - %c128_11 = arith.constant 128 : index - %48 = arith.addi %c0_8, %c128_11 : index - %49 = pto.partition_view %41, offsets = [%c0, %48], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> - pto.tload ins(%49 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) - pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - %c128_12 = arith.constant 128 : index - %50 = pto.subview %34[%c0, %c128_12] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf - pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%50 : !pto.tile_buf) - %qk_push_0 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> - %qk_push_part_0 = pto.partition_view %qk_push_0, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> - pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_0 : !pto.partition_tensor_view<128x256xf32>) - pto.tpush_to_aiv(%qk_push_0 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} - %c256_13 = arith.constant 256 : index - %c0_14 = arith.constant 0 : index - %51 = arith.addi %c256_13, %c0_14 : index - %52 = pto.partition_view %41, offsets = [%c0, %51], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> - pto.tload ins(%52 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) - pto.tmov ins(%32 : !pto.tile_buf) outs(%33 
: !pto.tile_buf) - %c0_15 = arith.constant 0 : index - %53 = pto.subview %34[%c0, %c0_15] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf - pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%53 : !pto.tile_buf) - %c128_16 = arith.constant 128 : index - %54 = arith.addi %c256_13, %c128_16 : index - %55 = pto.partition_view %41, offsets = [%c0, %54], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> - pto.tload ins(%55 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) - pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - %c128_17 = arith.constant 128 : index - %56 = pto.subview %34[%c0, %c128_17] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf - pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%56 : !pto.tile_buf) - %qk_push_1 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> - %qk_push_part_1 = pto.partition_view %qk_push_1, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> - pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_1 : !pto.partition_tensor_view<128x256xf32>) - pto.tpush_to_aiv(%qk_push_1 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} - %57 = pto.partition_view %42, offsets = [%c0, %c0], sizes = [%c256, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<256x128xf16> - pto.tload ins(%57 : !pto.partition_tensor_view<256x128xf16>) outs(%37 : !pto.tile_buf) - %c2 = arith.constant 2 : index - %c7 = arith.constant 7 : index - scf.for %arg5 = %c0 to %c7 step %c1 { - %61 = arith.muli %arg5, %c2 : index - %c2_18 = arith.constant 2 : index - %62 = arith.addi %61, %c2_18 : index - %63 = arith.muli %62, %c256 : index - %64 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf - pto.tmov ins(%64 : !pto.tile_buf) outs(%36 : !pto.tile_buf) - pto.tfree_from_aiv{id = 30, split = 1} - pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) - %65 = arith.addi 
%61, %c1 : index - %66 = arith.muli %65, %c256 : index - %67 = pto.partition_view %42, offsets = [%66, %c0], sizes = [%c256, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<256x128xf16> - pto.tload ins(%67 : !pto.partition_tensor_view<256x128xf16>) outs(%37 : !pto.tile_buf) - pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - %pv_push_0 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> - %pv_push_part_0 = pto.partition_view %pv_push_0, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> - pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_0 : !pto.partition_tensor_view<128x128xf32>) - pto.tpush_to_aiv(%pv_push_0 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} - %c0_19 = arith.constant 0 : index - %68 = arith.addi %63, %c0_19 : index - %69 = pto.partition_view %41, offsets = [%c0, %68], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> - pto.tload ins(%69 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) - pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - %c0_20 = arith.constant 0 : index - %70 = pto.subview %34[%c0, %c0_20] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf - pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%70 : !pto.tile_buf) - %c128_21 = arith.constant 128 : index - %71 = arith.addi %63, %c128_21 : index - %72 = pto.partition_view %41, offsets = [%c0, %71], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> - pto.tload ins(%72 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) - pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - %c128_22 = arith.constant 128 : index - %73 = pto.subview %34[%c0, %c128_22] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf - pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%73 : !pto.tile_buf) - %qk_push_2 = 
pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> - %qk_push_part_2 = pto.partition_view %qk_push_2, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> - pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_2 : !pto.partition_tensor_view<128x256xf32>) - pto.tpush_to_aiv(%qk_push_2 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} - %74 = arith.muli %arg5, %c2 : index - %75 = arith.addi %74, %c1 : index - %c2_23 = arith.constant 2 : index - %76 = arith.addi %75, %c2_23 : index - %77 = arith.muli %76, %c256 : index - %78 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf - pto.tmov ins(%78 : !pto.tile_buf) outs(%36 : !pto.tile_buf) - pto.tfree_from_aiv{id = 30, split = 1} - pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) - %79 = arith.addi %75, %c1 : index - %80 = arith.muli %79, %c256 : index - %81 = pto.partition_view %42, offsets = [%80, %c0], sizes = [%c256, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<256x128xf16> - pto.tload ins(%81 : !pto.partition_tensor_view<256x128xf16>) outs(%37 : !pto.tile_buf) - pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - %pv_push_1 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> - %pv_push_part_1 = pto.partition_view %pv_push_1, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> - pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_1 : !pto.partition_tensor_view<128x128xf32>) - pto.tpush_to_aiv(%pv_push_1 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} - %c0_24 = arith.constant 0 : index - %82 = arith.addi %77, %c0_24 : index - %83 = pto.partition_view %41, offsets = [%c0, %82], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> - pto.tload ins(%83 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) - pto.tmov 
ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - %c0_25 = arith.constant 0 : index - %84 = pto.subview %34[%c0, %c0_25] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf - pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%84 : !pto.tile_buf) - %c128_26 = arith.constant 128 : index - %85 = arith.addi %77, %c128_26 : index - %86 = pto.partition_view %41, offsets = [%c0, %85], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> - pto.tload ins(%86 : !pto.partition_tensor_view<128x128xf16>) outs(%32 : !pto.tile_buf) - pto.tmov ins(%32 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - %c128_27 = arith.constant 128 : index - %87 = pto.subview %34[%c0, %c128_27] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf - pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%87 : !pto.tile_buf) - %qk_push_3 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> - %qk_push_part_3 = pto.partition_view %qk_push_3, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> - pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_3 : !pto.partition_tensor_view<128x256xf32>) - pto.tpush_to_aiv(%qk_push_3 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} - } - %58 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf - pto.tmov ins(%58 : !pto.tile_buf) outs(%36 : !pto.tile_buf) - pto.tfree_from_aiv{id = 30, split = 1} - pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) - %c3840 = arith.constant 3840 : index - %59 = pto.partition_view %42, offsets = [%c3840, %c0], sizes = [%c256, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<256x128xf16> - pto.tload ins(%59 : !pto.partition_tensor_view<256x128xf16>) outs(%37 : !pto.tile_buf) - pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - %pv_push_2 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> - %pv_push_part_2 = 
pto.partition_view %pv_push_2, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> - pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_2 : !pto.partition_tensor_view<128x128xf32>) - pto.tpush_to_aiv(%pv_push_2 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} - %60 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf - pto.tmov ins(%60 : !pto.tile_buf) outs(%36 : !pto.tile_buf) - pto.tfree_from_aiv{id = 30, split = 1} - pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) - pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - %pv_push_3 = pto.talloc_to_aiv {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> - %pv_push_part_3 = pto.partition_view %pv_push_3, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> - pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_part_3 : !pto.partition_tensor_view<128x128xf32>) - pto.tpush_to_aiv(%pv_push_3 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} - } - return - } - func.func @vector_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c64 = arith.constant 64 : index - %c256 = arith.constant 256 : index - %c128_0 = arith.constant 128 : index - %c16 = arith.constant 16 : index - %c16_1 = arith.constant 16 : index - %0 = pto.get_block_num - %1 = arith.index_cast %0 : i64 to index - %2 = pto.get_block_idx - %3 = arith.index_cast %2 : i64 to index - %4 = arith.divsi %c16_1, %1 : index - %5 = arith.remsi %c16_1, %1 : index - %6 = arith.addi %4, %c1 : index - %7 = arith.muli %3, %6 : index - %8 = arith.addi %4, %c1 : index - %9 = arith.muli %5, %8 : index - %10 = arith.subi %3, %5 : index - %11 = arith.muli %10, %4 : index - %12 = arith.addi %9, %11 : index - %13 = arith.cmpi slt, %3, %5 : index 
- %14 = arith.select %13, %7, %12 : index - %15 = arith.cmpi slt, %3, %5 : index - %16 = arith.addi %4, %c1 : index - %17 = arith.select %15, %16, %4 : index - %18 = arith.addi %14, %17 : index - %c524288 = arith.constant 524288 : index - %19 = arith.muli %3, %c524288 : index - %20 = pto.addptr %arg0, %19 : -> - %c0_2 = arith.constant 0 : index - %21 = pto.addptr %20, %c0_2 : -> - %c262144 = arith.constant 262144 : index - %22 = pto.addptr %20, %c262144 : -> - %c393216 = arith.constant 393216 : index - %23 = pto.addptr %20, %c393216 : -> - %qk_slot_desc = pto.make_tensor_view %21, shape = [%c128, %c256], strides = [%c256, %c1] : !pto.tensor_view<128x256xf32> - pto.aiv_initialize_pipe{id = 25, dir_mask = 1, slot_size = 131072} (gm_slot_tensor = %qk_slot_desc : !pto.tensor_view<128x256xf32>) - %pv_slot_desc = pto.make_tensor_view %22, shape = [%c128, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view<128x128xf32> - pto.aiv_initialize_pipe{id = 27, dir_mask = 1, slot_size = 65536} (gm_slot_tensor = %pv_slot_desc : !pto.tensor_view<128x128xf32>) - %28 = pto.import_reserved_buffer{name = "fa_p_v2c_fifo", peer_func = @cube_kernel} -> i32 - %c0_i32 = arith.constant 0 : i32 - pto.aiv_initialize_pipe{id = 30, dir_mask = 2, slot_size = 65536, nosplit = false} (gm_slot_buffer = %23 : !pto.ptr, c2v_consumer_buf = %c0_i32 : i32, v2c_consumer_buf = %28 : i32) - %29 = pto.get_subblock_idx - %30 = arith.index_cast %29 : i64 to index - %31 = arith.muli %30, %c64 : index - %c196608_i64 = arith.constant 196608 : i64 - %32 = pto.alloc_tile addr = %c196608_i64 : !pto.tile_buf - %c262144_i64 = arith.constant 262144 : i64 - %33 = pto.alloc_tile addr = %c262144_i64 : !pto.tile_buf - %c327680_i64 = arith.constant 327680 : i64 - %34 = pto.alloc_tile addr = %c327680_i64 : !pto.tile_buf - %c360448_i64 = arith.constant 360448 : i64 - %35 = pto.alloc_tile addr = %c360448_i64 : !pto.tile_buf - %c393216_i64 = arith.constant 393216 : i64 - %36 = pto.alloc_tile addr = %c393216_i64 : 
!pto.tile_buf - %c393472_i64 = arith.constant 393472 : i64 - %37 = pto.alloc_tile addr = %c393472_i64 : !pto.tile_buf - %c393728_i64 = arith.constant 393728 : i64 - %38 = pto.alloc_tile addr = %c393728_i64 : !pto.tile_buf - %c393984_i64 = arith.constant 393984 : i64 - %39 = pto.alloc_tile addr = %c393984_i64 : !pto.tile_buf - %c394240_i64 = arith.constant 394240 : i64 - %40 = pto.alloc_tile addr = %c394240_i64 : !pto.tile_buf - %c394496_i64 = arith.constant 394496 : i64 - %41 = pto.alloc_tile addr = %c394496_i64 : !pto.tile_buf - %cst = arith.constant 0.0883883461 : f32 - %cst_3 = arith.constant 1.000000e+00 : f32 - %c2048 = arith.constant 2048 : index - %42 = pto.make_tensor_view %arg1, shape = [%c2048, %c128_0], strides = [%c128_0, %c1] : !pto.tensor_view - scf.for %arg2 = %14 to %18 step %c1 { - %43 = arith.muli %arg2, %c128 : index - %c394752_i64 = arith.constant 394752 : i64 - %44 = pto.alloc_tile addr = %c394752_i64 : !pto.tile_buf - %qk_pop_0 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> - %qk_pop_part_0 = pto.partition_view %qk_pop_0, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> - pto.tload ins(%qk_pop_part_0 : !pto.partition_tensor_view<64x256xf32>) outs(%44 : !pto.tile_buf) - pto.tmuls ins(%44, %cst : !pto.tile_buf, f32) outs(%44 : !pto.tile_buf) - pto.trowmax ins(%44, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) - %45 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf - %46 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf - %47 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf - %48 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf - %49 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf - pto.trowexpandsub ins(%44, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.tmuls ins(%45, %cst_3 : !pto.tile_buf, f32) outs(%46 : !pto.tile_buf) - pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.trowsum 
ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) - pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) - pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} - pto.tfree_from_aic(%qk_pop_0 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} - %c394752_i64_4 = arith.constant 394752 : i64 - %50 = pto.alloc_tile addr = %c394752_i64_4 : !pto.tile_buf - %qk_pop_1 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> - %qk_pop_part_1 = pto.partition_view %qk_pop_1, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> - pto.tload ins(%qk_pop_part_1 : !pto.partition_tensor_view<64x256xf32>) outs(%50 : !pto.tile_buf) - pto.tmuls ins(%50, %cst : !pto.tile_buf, f32) outs(%50 : !pto.tile_buf) - pto.trowmax ins(%50, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) - %51 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf - %52 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf - %53 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf - %54 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf - %55 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf - pto.tmax ins(%51, %52 : !pto.tile_buf, !pto.tile_buf) outs(%51 : !pto.tile_buf) - pto.tsub ins(%52, %51 : !pto.tile_buf, !pto.tile_buf) outs(%53 : !pto.tile_buf) - pto.tmuls ins(%51, %cst_3 : !pto.tile_buf, f32) outs(%52 : !pto.tile_buf) - pto.trowexpandsub ins(%50, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.texp ins(%53 : !pto.tile_buf) outs(%53 : !pto.tile_buf) - pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.tmul ins(%54, %53 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) - pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - pto.tadd ins(%54, %55 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) - pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) - 
pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} - pto.tfree_from_aic(%qk_pop_1 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} - %c394752_i64_5 = arith.constant 394752 : i64 - %56 = pto.alloc_tile addr = %c394752_i64_5 : !pto.tile_buf - %pv_pop_0 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> - %pv_pop_part_0 = pto.partition_view %pv_pop_0, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> - pto.tload ins(%pv_pop_part_0 : !pto.partition_tensor_view<64x128xf32>) outs(%56 : !pto.tile_buf) - pto.tmov ins(%56 : !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tfree_from_aic(%pv_pop_0 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} - %c394752_i64_6 = arith.constant 394752 : i64 - %57 = pto.alloc_tile addr = %c394752_i64_6 : !pto.tile_buf - %qk_pop_2 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> - %qk_pop_part_2 = pto.partition_view %qk_pop_2, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> - pto.tload ins(%qk_pop_part_2 : !pto.partition_tensor_view<64x256xf32>) outs(%57 : !pto.tile_buf) - pto.tmuls ins(%57, %cst : !pto.tile_buf, f32) outs(%57 : !pto.tile_buf) - pto.trowmax ins(%57, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) - %58 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf - %59 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf - %60 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf - %61 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf - %62 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf - pto.tmax ins(%58, %59 : !pto.tile_buf, !pto.tile_buf) outs(%58 : !pto.tile_buf) - pto.tsub ins(%59, %58 : !pto.tile_buf, !pto.tile_buf) outs(%60 : !pto.tile_buf) - pto.tmuls ins(%58, %cst_3 : !pto.tile_buf, f32) outs(%59 : !pto.tile_buf) - pto.trowexpandsub ins(%57, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) - 
pto.texp ins(%60 : !pto.tile_buf) outs(%60 : !pto.tile_buf) - pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.tmul ins(%61, %60 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) - pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - pto.tadd ins(%61, %62 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) - pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) - pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} - pto.tfree_from_aic(%qk_pop_2 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} - %c394752_i64_7 = arith.constant 394752 : i64 - %63 = pto.alloc_tile addr = %c394752_i64_7 : !pto.tile_buf - %pv_pop_1 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> - %pv_pop_part_1 = pto.partition_view %pv_pop_1, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> - pto.tload ins(%pv_pop_part_1 : !pto.partition_tensor_view<64x128xf32>) outs(%63 : !pto.tile_buf) - pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tadd ins(%35, %63 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tfree_from_aic(%pv_pop_1 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} - %c394752_i64_8 = arith.constant 394752 : i64 - %64 = pto.alloc_tile addr = %c394752_i64_8 : !pto.tile_buf - %qk_pop_3 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> - %qk_pop_part_3 = pto.partition_view %qk_pop_3, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> - pto.tload ins(%qk_pop_part_3 : !pto.partition_tensor_view<64x256xf32>) outs(%64 : !pto.tile_buf) - pto.tmuls ins(%64, %cst : !pto.tile_buf, f32) outs(%64 : !pto.tile_buf) - pto.trowmax ins(%64, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) - %65 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf - %66 = 
pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf - %67 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf - %68 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf - %69 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf - pto.tmax ins(%65, %66 : !pto.tile_buf, !pto.tile_buf) outs(%65 : !pto.tile_buf) - pto.tsub ins(%66, %65 : !pto.tile_buf, !pto.tile_buf) outs(%67 : !pto.tile_buf) - pto.tmuls ins(%65, %cst_3 : !pto.tile_buf, f32) outs(%66 : !pto.tile_buf) - pto.trowexpandsub ins(%64, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.texp ins(%67 : !pto.tile_buf) outs(%67 : !pto.tile_buf) - pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.tmul ins(%68, %67 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) - pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - pto.tadd ins(%68, %69 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) - pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) - pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} - pto.tfree_from_aic(%qk_pop_3 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} - %c7 = arith.constant 7 : index - scf.for %arg3 = %c1 to %c7 step %c1 { - %c394752_i64_11 = arith.constant 394752 : i64 - %74 = pto.alloc_tile addr = %c394752_i64_11 : !pto.tile_buf - %pv_pop_2 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> - %pv_pop_part_2 = pto.partition_view %pv_pop_2, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> - pto.tload ins(%pv_pop_part_2 : !pto.partition_tensor_view<64x128xf32>) outs(%74 : !pto.tile_buf) - pto.trowexpandmul ins(%35, %40 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tadd ins(%35, %74 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tfree_from_aic(%pv_pop_2 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} - %c394752_i64_12 = arith.constant 394752 : i64 - %75 
= pto.alloc_tile addr = %c394752_i64_12 : !pto.tile_buf - %qk_pop_4 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> - %qk_pop_part_4 = pto.partition_view %qk_pop_4, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> - pto.tload ins(%qk_pop_part_4 : !pto.partition_tensor_view<64x256xf32>) outs(%75 : !pto.tile_buf) - pto.tmuls ins(%75, %cst : !pto.tile_buf, f32) outs(%75 : !pto.tile_buf) - pto.trowmax ins(%75, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) - %76 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf - %77 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf - %78 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf - %79 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf - %80 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf - pto.tmax ins(%76, %77 : !pto.tile_buf, !pto.tile_buf) outs(%76 : !pto.tile_buf) - pto.tsub ins(%77, %76 : !pto.tile_buf, !pto.tile_buf) outs(%78 : !pto.tile_buf) - pto.tmuls ins(%76, %cst_3 : !pto.tile_buf, f32) outs(%77 : !pto.tile_buf) - pto.trowexpandsub ins(%75, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.texp ins(%78 : !pto.tile_buf) outs(%78 : !pto.tile_buf) - pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.tmul ins(%79, %78 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) - pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) - pto.tadd ins(%79, %80 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) - pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) - pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} - pto.tfree_from_aic(%qk_pop_4 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} - %c394752_i64_13 = arith.constant 394752 : i64 - %81 = pto.alloc_tile addr = %c394752_i64_13 : !pto.tile_buf - %pv_pop_3 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> - %pv_pop_part_3 = 
pto.partition_view %pv_pop_3, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> - pto.tload ins(%pv_pop_part_3 : !pto.partition_tensor_view<64x128xf32>) outs(%81 : !pto.tile_buf) - pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tadd ins(%35, %81 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tfree_from_aic(%pv_pop_3 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} - %c394752_i64_14 = arith.constant 394752 : i64 - %82 = pto.alloc_tile addr = %c394752_i64_14 : !pto.tile_buf - %qk_pop_5 = pto.tpop_from_aic {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> - %qk_pop_part_5 = pto.partition_view %qk_pop_5, offsets = [%31, %c0], sizes = [%c64, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<64x256xf32> - pto.tload ins(%qk_pop_part_5 : !pto.partition_tensor_view<64x256xf32>) outs(%82 : !pto.tile_buf) - pto.tmuls ins(%82, %cst : !pto.tile_buf, f32) outs(%82 : !pto.tile_buf) - pto.trowmax ins(%82, %33 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) - %83 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf - %84 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf - %85 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf - %86 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf - %87 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf - pto.tmax ins(%83, %84 : !pto.tile_buf, !pto.tile_buf) outs(%83 : !pto.tile_buf) - pto.tsub ins(%84, %83 : !pto.tile_buf, !pto.tile_buf) outs(%85 : !pto.tile_buf) - pto.tmuls ins(%83, %cst_3 : !pto.tile_buf, f32) outs(%84 : !pto.tile_buf) - pto.trowexpandsub ins(%82, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.texp ins(%85 : !pto.tile_buf) outs(%85 : !pto.tile_buf) - pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) - pto.tmul ins(%86, %85 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) - pto.trowsum ins(%33, %32 : !pto.tile_buf, 
!pto.tile_buf) outs(%39 : !pto.tile_buf) - pto.tadd ins(%86, %87 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) - pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) - pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} - pto.tfree_from_aic(%qk_pop_5 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} - } - %c394752_i64_9 = arith.constant 394752 : i64 - %70 = pto.alloc_tile addr = %c394752_i64_9 : !pto.tile_buf - %pv_pop_4 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> - %pv_pop_part_4 = pto.partition_view %pv_pop_4, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> - pto.tload ins(%pv_pop_part_4 : !pto.partition_tensor_view<64x128xf32>) outs(%70 : !pto.tile_buf) - pto.trowexpandmul ins(%35, %40 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tadd ins(%35, %70 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tfree_from_aic(%pv_pop_4 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} - %c394752_i64_10 = arith.constant 394752 : i64 - %71 = pto.alloc_tile addr = %c394752_i64_10 : !pto.tile_buf - %pv_pop_5 = pto.tpop_from_aic {id = 27, split = 0} -> !pto.tensor_view<128x128xf32> - %pv_pop_part_5 = pto.partition_view %pv_pop_5, offsets = [%31, %c0], sizes = [%c64, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<64x128xf32> - pto.tload ins(%pv_pop_part_5 : !pto.partition_tensor_view<64x128xf32>) outs(%71 : !pto.tile_buf) - pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tadd ins(%35, %71 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - pto.tfree_from_aic(%pv_pop_5 : !pto.tensor_view<128x128xf32>) {id = 27, split = 0} - pto.trowexpanddiv ins(%35, %38 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) - %72 = arith.addi %43, %31 : index - %73 = pto.partition_view %42, offsets = [%72, %c0], sizes = [%c64, %c128_0] : 
!pto.tensor_view -> !pto.partition_tensor_view<64x128xf32> - pto.tstore ins(%35 : !pto.tile_buf) outs(%73 : !pto.partition_tensor_view<64x128xf32>) - } - return - } - func.func @call_both(%arg0: memref<256xi64>, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.entry} { - pto.set_ffts %arg0 : memref<256xi64> - call @cube_kernel(%arg1, %arg2, %arg3, %arg4) : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr) -> () - call @vector_kernel(%arg1, %arg5) : (!pto.ptr, !pto.ptr) -> () - return - } -} From 9f12bae0e2bacafbb6830204b5a9937cbd5ba338 Mon Sep 17 00:00:00 2001 From: zhangstevenunity <128771452+zhangstevenunity@users.noreply.github.com> Date: Thu, 7 May 2026 19:26:17 +0800 Subject: [PATCH 5/5] Align FA perf PTO preload schedule --- test/lit/pto/fa_perf.pto | 88 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/test/lit/pto/fa_perf.pto b/test/lit/pto/fa_perf.pto index 9c4be080a..3642d140e 100644 --- a/test/lit/pto/fa_perf.pto +++ b/test/lit/pto/fa_perf.pto @@ -115,14 +115,46 @@ module { %qk_push_part_1 = pto.partition_view %qk_push_1, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_1 : !pto.partition_tensor_view<128x256xf32>) pto.tpush_to_aiv(%qk_push_1 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %c512 = arith.constant 512 : index + %qk_init2_part_0 = pto.partition_view %41, offsets = [%c0, %c512], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%qk_init2_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %qk_init2_lo = pto.subview %34[%c0, %c0] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%qk_init2_lo : !pto.tile_buf) + 
%qk_init2_hi_off = arith.addi %c512, %c128 : index + %qk_init2_part_1 = pto.partition_view %41, offsets = [%c0, %qk_init2_hi_off], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%qk_init2_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %qk_init2_hi = pto.subview %34[%c0, %c128] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%qk_init2_hi : !pto.tile_buf) + %qk_push_init_2 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_init_2 = pto.partition_view %qk_push_init_2, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_init_2 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_init_2 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} + %c768 = arith.constant 768 : index + %qk_init3_part_0 = pto.partition_view %41, offsets = [%c0, %c768], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%qk_init3_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %qk_init3_lo = pto.subview %34[%c0, %c0] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%qk_init3_lo : !pto.tile_buf) + %qk_init3_hi_off = arith.addi %c768, %c128 : index + %qk_init3_part_1 = pto.partition_view %41, offsets = [%c0, %qk_init3_hi_off], sizes = [%c128_0, %c128_1] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%qk_init3_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + %qk_init3_hi = 
pto.subview %34[%c0, %c128] sizes [128, 128] : !pto.tile_buf -> !pto.tile_buf + pto.tmatmul ins(%30, %33 : !pto.tile_buf, !pto.tile_buf) outs(%qk_init3_hi : !pto.tile_buf) + %qk_push_init_3 = pto.talloc_to_aiv {id = 25, split = 0} -> !pto.tensor_view<128x256xf32> + %qk_push_part_init_3 = pto.partition_view %qk_push_init_3, offsets = [%c0, %c0], sizes = [%c128, %c256] : !pto.tensor_view<128x256xf32> -> !pto.partition_tensor_view<128x256xf32> + pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_init_3 : !pto.partition_tensor_view<128x256xf32>) + pto.tpush_to_aiv(%qk_push_init_3 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} %57 = pto.partition_view %42, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> pto.tload ins(%57 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) %c2 = arith.constant 2 : index - %c7 = arith.constant 7 : index - scf.for %arg5 = %c0 to %c7 step %c1 { + %c6 = arith.constant 6 : index + scf.for %arg5 = %c0 to %c6 step %c1 { %61 = arith.muli %arg5, %c2 : index - %c2_18 = arith.constant 2 : index - %62 = arith.addi %61, %c2_18 : index + %c4_18 = arith.constant 4 : index + %62 = arith.addi %61, %c4_18 : index %63 = arith.muli %62, %c256 : index %p_pop_0 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> %p_pop_part_0 = pto.partition_view %p_pop_0, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> @@ -168,8 +200,8 @@ module { pto.tpush_to_aiv(%qk_push_2 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} %74 = arith.muli %arg5, %c2 : index %75 = arith.addi %74, %c1 : index - %c2_23 = arith.constant 2 : index - %76 = arith.addi %75, %c2_23 : index + %c4_23 = arith.constant 4 : index + %76 = arith.addi %75, %c4_23 : index %77 = arith.muli %76, %c256 : index %p_pop_1 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> %p_pop_part_1 = 
pto.partition_view %p_pop_1, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> @@ -214,6 +246,50 @@ module { pto.tstore ins(%34 : !pto.tile_buf) outs(%qk_push_part_3 : !pto.partition_tensor_view<128x256xf32>) pto.tpush_to_aiv(%qk_push_3 : !pto.tensor_view<128x256xf32>) {id = 25, split = 0} } + %p_pop_tail_2 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_pop_tail_2_part_0 = pto.partition_view %p_pop_tail_2, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_tail_2_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + %c3072 = arith.constant 3072 : index + %pv_tail_2_part_0 = pto.partition_view %42, offsets = [%c3072, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv_tail_2_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %p_pop_tail_2_part_1 = pto.partition_view %p_pop_tail_2, offsets = [%c0, %c128], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_tail_2_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv(%p_pop_tail_2 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c3200 = arith.constant 3200 : index + %pv_tail_2_part_1 = pto.partition_view %42, offsets = [%c3200, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv_tail_2_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + 
pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_tail_2 = pto.talloc_to_aiv {id = 27, split = 1} -> !pto.tensor_view<128x128xf32> + %pv_push_tail_2_part = pto.partition_view %pv_push_tail_2, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_tail_2_part : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_tail_2 : !pto.tensor_view<128x128xf32>) {id = 27, split = 1} + %p_pop_tail_3 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> + %p_pop_tail_3_part_0 = pto.partition_view %p_pop_tail_3, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_tail_3_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + %c3328 = arith.constant 3328 : index + %pv_tail_3_part_0 = pto.partition_view %42, offsets = [%c3328, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv_tail_3_part_0 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul ins(%36, %38 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %p_pop_tail_3_part_1 = pto.partition_view %p_pop_tail_3, offsets = [%c0, %c128], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%p_pop_tail_3_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf) + pto.tmov ins(%35 : !pto.tile_buf) outs(%36 : !pto.tile_buf) + pto.tfree_from_aiv(%p_pop_tail_3 : !pto.tensor_view<128x256xf16>) {id = 30, split = 0} + %c3456 = arith.constant 3456 : 
index + %pv_tail_3_part_1 = pto.partition_view %42, offsets = [%c3456, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view -> !pto.partition_tensor_view<128x128xf16> + pto.tload ins(%pv_tail_3_part_1 : !pto.partition_tensor_view<128x128xf16>) outs(%37 : !pto.tile_buf) + pto.tmov ins(%37 : !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tmatmul.acc ins(%39, %36, %38 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + %pv_push_tail_3 = pto.talloc_to_aiv {id = 27, split = 1} -> !pto.tensor_view<128x128xf32> + %pv_push_tail_3_part = pto.partition_view %pv_push_tail_3, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x128xf32> -> !pto.partition_tensor_view<128x128xf32> + pto.tstore ins(%39 : !pto.tile_buf) outs(%pv_push_tail_3_part : !pto.partition_tensor_view<128x128xf32>) + pto.tpush_to_aiv(%pv_push_tail_3 : !pto.tensor_view<128x128xf32>) {id = 27, split = 1} %p_pop_2 = pto.tpop_from_aiv {id = 30, split = 0} -> !pto.tensor_view<128x256xf16> %p_pop_part_2 = pto.partition_view %p_pop_2, offsets = [%c0, %c0], sizes = [%c128, %c128_0] : !pto.tensor_view<128x256xf16> -> !pto.partition_tensor_view<128x128xf16> pto.tload ins(%p_pop_part_2 : !pto.partition_tensor_view<128x128xf16>) outs(%35 : !pto.tile_buf)