From b87ccf3b9cbc1db1d5d1c2a7a03671596ca21809 Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Thu, 7 May 2026 13:41:44 +0200 Subject: [PATCH] added benchmark and mlirs for 140tflops dls flash attention and ptoas mlir version --- .../FlashAttention/compile_and_run/README.md | 36 + .../benchmark_flashattention.py | 167 ++++ .../FlashAttention/compile_and_run/caller.cpp | 29 + .../compile_and_run/caller_140tflops.cpp | 30 + .../compile_and_run/compile_flashattention.sh | 54 ++ .../compile_and_run/fa_140tflops.pto | 828 ++++++++++++++++++ .../fa_patched_s1_256_q3072_s0_8192.pto | 402 +++++++++ 7 files changed, 1546 insertions(+) create mode 100644 test/samples/FlashAttention/compile_and_run/README.md create mode 100644 test/samples/FlashAttention/compile_and_run/benchmark_flashattention.py create mode 100644 test/samples/FlashAttention/compile_and_run/caller.cpp create mode 100644 test/samples/FlashAttention/compile_and_run/caller_140tflops.cpp create mode 100755 test/samples/FlashAttention/compile_and_run/compile_flashattention.sh create mode 100644 test/samples/FlashAttention/compile_and_run/fa_140tflops.pto create mode 100644 test/samples/FlashAttention/compile_and_run/fa_patched_s1_256_q3072_s0_8192.pto diff --git a/test/samples/FlashAttention/compile_and_run/README.md b/test/samples/FlashAttention/compile_and_run/README.md new file mode 100644 index 000000000..8e2bb7913 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/README.md @@ -0,0 +1,36 @@ +# FlashAttention compile and benchmark + +This directory contains two PTO FlashAttention variants: + +- `fa_140tflops.pto` +- `fa_patched_s1_256_q3072_s0_8192.pto` + +## Requirements + +- Run inside the configured Ascend/CANN container environment. +- `ptoas` and `bisheng` must already be available in `PATH`. +- `/sources/pto-isa/include` must exist. +- Python benchmark requires `torch_npu==2.9.0`. 
+ +## Compile + +From this directory, run: + +```bash +bash compile_flashattention.sh +``` + +This builds: + +- `/tmp/fa_140tflops.so` +- `/tmp/compiler_team_fa.so` + +## Benchmark + +After compiling, run: + +```bash +python3 benchmark_flashattention.py +``` + +The benchmark compares both PTO kernels against `torch_npu.npu_fused_infer_attention_score`, checks correctness against both fp32 reference attention and torch_npu output, and reports latency, TFLOP/s, and speedup. \ No newline at end of file diff --git a/test/samples/FlashAttention/compile_and_run/benchmark_flashattention.py b/test/samples/FlashAttention/compile_and_run/benchmark_flashattention.py new file mode 100644 index 000000000..79f495ee7 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/benchmark_flashattention.py @@ -0,0 +1,167 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
"""Benchmark two PTO FlashAttention kernels against torch_npu fused attention.

Loads the shared objects built by compile_flashattention.sh, checks their
output against an fp32 reference and against
torch_npu.npu_fused_infer_attention_score, then reports latency, TFLOP/s,
and speedup for each kernel.
"""

import ctypes
import math

import torch
import torch_npu  # noqa: F401

# (name, shared-object path, f32 scratch elements per core, pass (s0, s1) shapes)
KERNELS = [
    ("fa_140tflops", "/tmp/fa_140tflops.so", 524288, True),
    ("patched", "/tmp/compiler_team_fa.so", 229376, False),
]
DEVICE = "npu:0"
WARMUP_ITERS = 10
BENCH_ITERS = 100
NUM_CUBE_CORES = 24
RTOL = 1e-3
ATOL = 1e-3

# Problem shape: Q is (Q_ROWS, HEAD); K and V are (S1_TOTAL, HEAD).
Q_ROWS = 3072
HEAD = 128
S1_TOTAL = 8192
NUM_Q_BLOCKS = Q_ROWS // 32


def load_lib(lib_path, pass_shape):
    """Load a compiled PTO kernel and declare its ``call_kernel`` signature.

    Args:
        lib_path: Path to the shared object built by compile_flashattention.sh.
        pass_shape: When True, the kernel additionally takes (s0, s1) as two
            int64 trailing arguments.

    Returns:
        The loaded ``ctypes.CDLL`` with argtypes/restype configured.
    """
    lib = ctypes.CDLL(lib_path)
    argtypes = [
        ctypes.c_uint32,  # blockDim
        ctypes.c_void_p,  # stream
        ctypes.c_void_p,  # gm scratch buffer
        ctypes.c_void_p,  # q
        ctypes.c_void_p,  # k
        ctypes.c_void_p,  # v
        ctypes.c_void_p,  # o
    ]
    if pass_shape:
        argtypes += [ctypes.c_int64, ctypes.c_int64]
    lib.call_kernel.argtypes = argtypes
    lib.call_kernel.restype = None
    return lib


def ptr(t):
    """Return a tensor's device data pointer as a ctypes void pointer."""
    return ctypes.c_void_p(t.data_ptr())


def fused_attention(q_bsh, k_bsh, v_bsh):
    """Run torch_npu fused attention on (1, S, H)-layout inputs.

    Scale is 1/sqrt(head_dim), matching the fp32 reference below.
    """
    scale = 1.0 / math.sqrt(q_bsh.shape[-1])
    out, _ = torch_npu.npu_fused_infer_attention_score(
        q_bsh,
        k_bsh,
        v_bsh,
        num_heads=1,
        input_layout="BSH",
        scale=scale,
        next_tokens=65535,
    )
    return out


def fa_reference(q, k, v):
    """Compute non-causal attention in fp32 as the correctness reference."""
    scale = 1.0 / math.sqrt(q.shape[1])
    scores = q.float() @ k.float().T * scale
    return torch.softmax(scores, dim=-1) @ v.float()


def run_pto_kernel(lib, pass_shape, block_dim, gm, q, k, v, o):
    """Launch one PTO kernel on the current NPU stream.

    ``gm`` is the per-core global-memory scratch buffer; ``o`` receives the
    fp32 attention output.
    """
    stream = torch.npu.current_stream()._as_parameter_
    args = [block_dim, stream, ptr(gm), ptr(q), ptr(k), ptr(v), ptr(o)]
    if pass_shape:
        args += [q.shape[0], k.shape[0]]
    lib.call_kernel(*args)


def check_close(out_pto, out_fp32, out_torch_npu):
    """Compare the PTO output against both references.

    Returns:
        ("PASSED"|"FAILED", max abs error vs fp32 ref, max abs error vs
        torch_npu output). Max errors are reported even on failure.
    """
    max_err_fp32 = (out_pto - out_fp32).abs().max().item()
    max_err_torch_npu = (out_pto - out_torch_npu).abs().max().item()
    try:
        torch.testing.assert_close(out_pto, out_fp32, rtol=RTOL, atol=ATOL)
        torch.testing.assert_close(out_pto, out_torch_npu, rtol=RTOL, atol=ATOL)
        return "PASSED", max_err_fp32, max_err_torch_npu
    except AssertionError:
        return "FAILED", max_err_fp32, max_err_torch_npu


def bench(fn):
    """Return the mean latency of ``fn`` in microseconds.

    Warms up, then times BENCH_ITERS back-to-back launches with NPU events;
    ``elapsed_time`` is in milliseconds, hence the * 1000.0 to microseconds.
    """
    for _ in range(WARMUP_ITERS):
        fn()
    torch.npu.synchronize()

    start = torch.npu.Event(enable_timing=True)
    end = torch.npu.Event(enable_timing=True)
    start.record()
    for _ in range(BENCH_ITERS):
        fn()
    end.record()
    torch.npu.synchronize()
    return start.elapsed_time(end) * 1000.0 / BENCH_ITERS


def main():
    """Check correctness and benchmark every kernel in KERNELS."""
    device = torch.device(DEVICE)
    block_dim = min(NUM_Q_BLOCKS, NUM_CUBE_CORES)
    # Attention FLOPs: 2*Q*S1*H for Q@K^T plus 2*Q*S1*H for P@V.
    flops = 4 * Q_ROWS * HEAD * S1_TOTAL

    torch.manual_seed(0)
    q = torch.randn((Q_ROWS, HEAD), dtype=torch.float16, device=device)
    k = torch.randn((S1_TOTAL, HEAD), dtype=torch.float16, device=device)
    v = torch.randn((S1_TOTAL, HEAD), dtype=torch.float16, device=device)
    # torch_npu expects BSH layout; add a leading batch dim of 1.
    q_bsh = q.unsqueeze(0)
    k_bsh = k.unsqueeze(0)
    v_bsh = v.unsqueeze(0)

    def run_torch_npu():
        fused_attention(q_bsh, k_bsh, v_bsh)

    out_torch_npu = fused_attention(q_bsh, k_bsh, v_bsh).squeeze(0).float().cpu()
    out_fp32 = fa_reference(q, k, v).float().cpu()
    torch.npu.synchronize()

    torch_npu_us = bench(run_torch_npu)
    torch_npu_tflops = flops / (torch_npu_us * 1e-6) / 1e12

    print(
        f"PTO FA variants vs torch_npu fused attention: Q={Q_ROWS} S1={S1_TOTAL} H={HEAD} "
        f"blockDim={block_dim}"
    )
    print(f"  torch_npu: {torch_npu_us:8.2f} us {torch_npu_tflops:7.3f} TFLOP/s")

    for name, lib_path, gm_elems_per_block, pass_shape in KERNELS:
        lib = load_lib(lib_path, pass_shape)
        gm = torch.zeros(
            (gm_elems_per_block * block_dim,), dtype=torch.float32, device=device
        )
        o = torch.zeros((Q_ROWS, HEAD), dtype=torch.float32, device=device)

        def run_pto():
            run_pto_kernel(lib, pass_shape, block_dim, gm, q, k, v, o)

        # Correctness check against torch_npu fused attention.
        gm.zero_()
        o.zero_()
        run_pto()
        torch.npu.synchronize()
        out_pto = o.float().cpu()
        correctness, max_err_fp32, max_err_torch_npu = check_close(
            out_pto, out_fp32, out_torch_npu
        )

        pto_us = bench(run_pto)
        pto_tflops = flops / (pto_us * 1e-6) / 1e12
        print(
            f"  {name:12s}: {pto_us:8.2f} us {pto_tflops:7.3f} TFLOP/s "
            f"speedup={torch_npu_us / pto_us:.2f}x {correctness} "
            f"max_err(fp32={max_err_fp32:.3e}, torch_npu={max_err_torch_npu:.3e})"
        )


if __name__ == "__main__":
    main()
+ gm.zero_() + o.zero_() + run_pto() + torch.npu.synchronize() + out_pto = o.float().cpu() + correctness, max_err_fp32, max_err_torch_npu = check_close( + out_pto, out_fp32, out_torch_npu + ) + + pto_us = bench(run_pto) + pto_tflops = flops / (pto_us * 1e-6) / 1e12 + print( + f" {name:12s}: {pto_us:8.2f} us {pto_tflops:7.3f} TFLOP/s " + f"speedup={torch_npu_us / pto_us:.2f}x {correctness} " + f"max_err(fp32={max_err_fp32:.3e}, torch_npu={max_err_torch_npu:.3e})" + ) + + +if __name__ == "__main__": + main() diff --git a/test/samples/FlashAttention/compile_and_run/caller.cpp b/test/samples/FlashAttention/compile_and_run/caller.cpp new file mode 100644 index 000000000..02bae6e11 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/caller.cpp @@ -0,0 +1,29 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef KERNEL_CPP +#error "KERNEL_CPP must be defined at compile time." 
+#endif + +extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen); + +#include KERNEL_CPP + +extern "C" void call_kernel(uint32_t blockDim, void *stream, uint8_t *gmSlotBuffer, uint8_t *q, uint8_t *k, uint8_t *v, + uint8_t *o) +{ + void *fftsAddr = nullptr; + uint32_t fftsLen = 0; + (void)rtGetC2cCtrlAddr(reinterpret_cast(&fftsAddr), &fftsLen); + (void)fftsLen; + + call_both<<>>((__gm__ int64_t *)fftsAddr, (__gm__ float *)gmSlotBuffer, + (__gm__ half *)q, (__gm__ half *)k, (__gm__ half *)v, (__gm__ float *)o); +} diff --git a/test/samples/FlashAttention/compile_and_run/caller_140tflops.cpp b/test/samples/FlashAttention/compile_and_run/caller_140tflops.cpp new file mode 100644 index 000000000..faeb3a540 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/caller_140tflops.cpp @@ -0,0 +1,30 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef KERNEL_CPP +#error "KERNEL_CPP must be defined at compile time." 
+#endif + +extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen); + +#include KERNEL_CPP + +extern "C" void call_kernel(uint32_t blockDim, void *stream, uint8_t *gmSlotBuffer, uint8_t *q, uint8_t *k, uint8_t *v, + uint8_t *o, int64_t s0, int64_t s1) +{ + void *fftsAddr = nullptr; + uint32_t fftsLen = 0; + (void)rtGetC2cCtrlAddr(reinterpret_cast(&fftsAddr), &fftsLen); + (void)fftsLen; + + call_both<<>>((__gm__ int64_t *)fftsAddr, (__gm__ float *)gmSlotBuffer, + (__gm__ half *)gmSlotBuffer, (__gm__ half *)q, (__gm__ half *)k, + (__gm__ half *)v, (__gm__ float *)o, s0, s1); +} diff --git a/test/samples/FlashAttention/compile_and_run/compile_flashattention.sh b/test/samples/FlashAttention/compile_and_run/compile_flashattention.sh new file mode 100755 index 000000000..140d16d73 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/compile_flashattention.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
set -euo pipefail

# Always run from the directory holding this script and the .pto inputs, so
# relative paths work regardless of the caller's cwd.
cd "$(dirname "${BASH_SOURCE[0]}")"

# Lower the patched PTO kernel (level3 pipeline, auto sync insertion) to CCE C++.
ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync \
    fa_patched_s1_256_q3072_s0_8192.pto \
    >/tmp/compiler_team_fa.cpp

# Compile the generated kernel together with its host caller into a shared
# object the benchmark loads via ctypes.
# NOTE(review): both -std=c++17 and -std=gnu++17 are passed; the later
# gnu++17 wins — confirm whether the duplication is intentional.
bisheng \
    -I/sources/pto-isa/include \
    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
    -xcce -Xhost-start -Xhost-end \
    -mllvm -cce-aicore-stack-size=0x8000 \
    -mllvm -cce-aicore-function-stack-size=0x8000 \
    -mllvm -cce-aicore-record-overflow=true \
    -mllvm -cce-aicore-addr-transform \
    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
    -cce-enable-mix \
    --npu-arch=dav-2201 -DMEMORY_BASE \
    -std=gnu++17 \
    -DKERNEL_CPP="\"/tmp/compiler_team_fa.cpp\"" \
    "caller.cpp" \
    -o /tmp/compiler_team_fa.so

# Lower the 140 TFLOPS variant (default pto-level) to CCE C++.
ptoas --pto-arch=a3 --enable-insert-sync \
    fa_140tflops.pto \
    >/tmp/fa_140tflops.cpp

# Compile the 140 TFLOPS kernel with its shape-passing caller.
bisheng \
    -I/sources/pto-isa/include \
    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
    -xcce -Xhost-start -Xhost-end \
    -mllvm -cce-aicore-stack-size=0x8000 \
    -mllvm -cce-aicore-function-stack-size=0x8000 \
    -mllvm -cce-aicore-record-overflow=true \
    -mllvm -cce-aicore-addr-transform \
    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
    -cce-enable-mix \
    --npu-arch=dav-2201 -DMEMORY_BASE \
    -std=gnu++17 \
    -DKERNEL_CPP="\"/tmp/fa_140tflops.cpp\"" \
    "caller_140tflops.cpp" \
    -o /tmp/fa_140tflops.so
!pto.ptr, %arg19: !pto.ptr, %arg20: i64, %arg21: i64): + %200 = "arith.constant"() <{value = 0 : index}> : () -> index + %201 = "arith.constant"() <{value = 1 : index}> : () -> index + %202 = "arith.constant"() <{value = 128 : index}> : () -> index + %203 = "arith.constant"() <{value = 128 : index}> : () -> index + %204 = "arith.constant"() <{value = 256 : index}> : () -> index + %205 = "arith.constant"() <{value = 128 : index}> : () -> index + %206 = "arith.constant"() <{value = 524288 : index}> : () -> index + %207 = "arith.constant"() <{value = 1048576 : index}> : () -> index + %208 = "pto.get_block_idx"() : () -> i64 + %209 = "arith.index_cast"(%208) : (i64) -> index + %210 = "arith.index_cast"(%arg20) : (i64) -> index + %211 = "arith.index_cast"(%arg21) : (i64) -> index + %212 = "arith.divsi"(%211, %204) : (index, index) -> index + %213 = "arith.muli"(%209, %202) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %214 = "arith.muli"(%209, %206) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %215 = "pto.addptr"(%arg15, %214) : (!pto.ptr, index) -> !pto.ptr + %216 = "arith.muli"(%209, %207) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %217 = "pto.addptr"(%arg16, %216) : (!pto.ptr, index) -> !pto.ptr + %218 = "arith.constant"() <{value = 0 : index}> : () -> index + %219 = "pto.addptr"(%215, %218) : (!pto.ptr, index) -> !pto.ptr + %220 = "arith.constant"() <{value = 524288 : index}> : () -> index + %221 = "pto.addptr"(%217, %220) : (!pto.ptr, index) -> !pto.ptr + %222 = "arith.constant"() <{value = 393216 : index}> : () -> index + %223 = "pto.addptr"(%215, %222) : (!pto.ptr, index) -> !pto.ptr + %224 = "pto.make_tensor_view"(%219, %202, %204, %204, %201) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view<128x256xf32> + %225 = "pto.initialize_l2g2l_pipe"(%224) <{dir_mask = 1 : i8, flag_base = 0 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 131072 : i32}> 
: (!pto.tensor_view<128x256xf32>) -> !pto.pipe + %226 = "pto.make_tensor_view"(%221, %202, %204, %204, %201) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view<128x256xf16> + %227 = "pto.initialize_l2g2l_pipe"(%226) <{dir_mask = 2 : i8, flag_base = 2 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 65536 : i32}> : (!pto.tensor_view<128x256xf16>) -> !pto.pipe + %228 = "pto.make_tensor_view"(%223, %202, %203, %203, %201) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view<128x128xf32> + %229 = "pto.initialize_l2g2l_pipe"(%228) <{dir_mask = 1 : i8, flag_base = 4 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 65536 : i32}> : (!pto.tensor_view<128x128xf32>) -> !pto.pipe + %230 = "pto.make_tensor_view"(%arg17, %210, %203, %203, %201) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view + %231 = "pto.make_tensor_view"(%arg18, %203, %211, %201, %203) <{layout = #pto.layout, operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view + %232 = "pto.make_tensor_view"(%arg19, %211, %203, %203, %201) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view + %233 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %234 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %235 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %236 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %237 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %238 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %239 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %240 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %241 = "pto.alloc_tile"() <{operandSegmentSizes = 
array}> : () -> !pto.tile_buf + %242 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %243 = "pto.partition_view"(%230, %213, %200, %202, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%243, %233) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%233, %234) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %244 = "pto.declare_global"() : () -> !pto.tensor_view<128x256xf32> + %245 = "pto.declare_global"() : () -> !pto.tensor_view<128x256xf16> + %246 = "pto.declare_global"() : () -> !pto.tensor_view<128x128xf32> + %247 = "arith.constant"() <{value = 0 : index}> : () -> index + "pto.talloc"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %248 = "arith.muli"(%247, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %249 = "arith.constant"() <{value = 0 : index}> : () -> index + %250 = "arith.addi"(%248, %249) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %251 = "pto.partition_view"(%231, %200, %250, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%251, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %252 = "arith.constant"() <{value = 0 : index}> : () -> index + %253 = "pto.partition_view"(%244, %200, %252, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %253) <{atomicType = #pto, reluPreMode = 
#pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + %254 = "arith.muli"(%247, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %255 = "arith.constant"() <{value = 128 : index}> : () -> index + %256 = "arith.addi"(%254, %255) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %257 = "pto.partition_view"(%231, %200, %256, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%257, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %258 = "arith.constant"() <{value = 128 : index}> : () -> index + %259 = "pto.partition_view"(%244, %200, %258, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %259) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %260 = "arith.constant"() <{value = 1 : index}> : () -> index + "pto.talloc"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %261 = "arith.muli"(%260, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %262 = "arith.constant"() <{value = 0 : index}> : () -> index + %263 = "arith.addi"(%261, %262) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %264 = "pto.partition_view"(%231, %200, %263, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%264, %235) 
<{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %265 = "arith.constant"() <{value = 0 : index}> : () -> index + %266 = "pto.partition_view"(%244, %200, %265, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %266) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + %267 = "arith.muli"(%260, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %268 = "arith.constant"() <{value = 128 : index}> : () -> index + %269 = "arith.addi"(%267, %268) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %270 = "pto.partition_view"(%231, %200, %269, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%270, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %271 = "arith.constant"() <{value = 128 : index}> : () -> index + %272 = "pto.partition_view"(%244, %200, %271, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %272) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %273 = 
"arith.constant"() <{value = 2 : index}> : () -> index + "pto.talloc"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %274 = "arith.muli"(%273, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %275 = "arith.constant"() <{value = 0 : index}> : () -> index + %276 = "arith.addi"(%274, %275) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %277 = "pto.partition_view"(%231, %200, %276, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%277, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %278 = "arith.constant"() <{value = 0 : index}> : () -> index + %279 = "pto.partition_view"(%244, %200, %278, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %279) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + %280 = "arith.muli"(%273, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %281 = "arith.constant"() <{value = 128 : index}> : () -> index + %282 = "arith.addi"(%280, %281) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %283 = "pto.partition_view"(%231, %200, %282, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%283, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, 
!pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %284 = "arith.constant"() <{value = 128 : index}> : () -> index + %285 = "pto.partition_view"(%244, %200, %284, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %285) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %286 = "arith.constant"() <{value = 3 : index}> : () -> index + %287 = "arith.subi"(%212, %286) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "scf.for"(%200, %287, %201) ({ + ^bb0(%arg22: index): + %333 = "arith.addi"(%arg22, %286) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + %334 = "arith.muli"(%arg22, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %335 = "arith.constant"() <{value = 0 : index}> : () -> index + %336 = "arith.addi"(%334, %335) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %337 = "pto.partition_view"(%232, %336, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%337, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %338 = "arith.constant"() <{value = 0 : index}> : () -> index + %339 = "pto.partition_view"(%245, %200, %338, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%339, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, 
%239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.talloc"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %340 = "arith.muli"(%333, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %341 = "arith.constant"() <{value = 0 : index}> : () -> index + %342 = "arith.addi"(%340, %341) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %343 = "pto.partition_view"(%231, %200, %342, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%343, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %344 = "arith.constant"() <{value = 0 : index}> : () -> index + %345 = "pto.partition_view"(%244, %200, %344, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %345) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + %346 = "arith.muli"(%arg22, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %347 = "arith.constant"() <{value = 128 : index}> : () -> index + %348 = "arith.addi"(%346, %347) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %349 = "pto.partition_view"(%232, %348, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> 
!pto.partition_tensor_view<128x128xf16> + "pto.tload"(%349, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %350 = "arith.constant"() <{value = 128 : index}> : () -> index + %351 = "pto.partition_view"(%245, %200, %350, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%351, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul.acc"(%242, %239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tfree"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + "pto.talloc"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %352 = "pto.partition_view"(%246, %200, %200, %202, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%242, %352) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %353 = "arith.muli"(%333, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %354 = "arith.constant"() <{value = 128 : index}> : () -> index + %355 = "arith.addi"(%353, %354) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %356 = "pto.partition_view"(%231, %200, %355, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%356, %235) 
<{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %357 = "arith.constant"() <{value = 128 : index}> : () -> index + %358 = "pto.partition_view"(%244, %200, %357, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %358) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + "scf.yield"() : () -> () + }) : (index, index, index) -> () + %288 = "arith.constant"() <{value = 0 : index}> : () -> index + %289 = "arith.addi"(%287, %288) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + %290 = "arith.muli"(%289, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %291 = "arith.constant"() <{value = 0 : index}> : () -> index + %292 = "arith.addi"(%290, %291) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %293 = "pto.partition_view"(%232, %292, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%293, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %294 = "arith.constant"() <{value = 0 : index}> : () -> index + %295 = "pto.partition_view"(%245, %200, %294, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%295, %238) <{operandSegmentSizes 
= array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %296 = "arith.muli"(%289, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %297 = "arith.constant"() <{value = 128 : index}> : () -> index + %298 = "arith.addi"(%296, %297) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %299 = "pto.partition_view"(%232, %298, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%299, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %300 = "arith.constant"() <{value = 128 : index}> : () -> index + %301 = "pto.partition_view"(%245, %200, %300, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%301, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul.acc"(%242, %239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tfree"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + "pto.talloc"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %302 = "pto.partition_view"(%246, %200, %200, %202, %203) <{operandSegmentSizes = array}> : 
(!pto.tensor_view<128x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%242, %302) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %303 = "arith.constant"() <{value = 1 : index}> : () -> index + %304 = "arith.addi"(%287, %303) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + %305 = "arith.muli"(%304, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %306 = "arith.constant"() <{value = 0 : index}> : () -> index + %307 = "arith.addi"(%305, %306) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %308 = "pto.partition_view"(%232, %307, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%308, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %309 = "arith.constant"() <{value = 0 : index}> : () -> index + %310 = "pto.partition_view"(%245, %200, %309, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%310, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %311 = "arith.muli"(%304, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %312 = "arith.constant"() 
<{value = 128 : index}> : () -> index + %313 = "arith.addi"(%311, %312) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %314 = "pto.partition_view"(%232, %313, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%314, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %315 = "arith.constant"() <{value = 128 : index}> : () -> index + %316 = "pto.partition_view"(%245, %200, %315, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%316, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul.acc"(%242, %239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tfree"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + "pto.talloc"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %317 = "pto.partition_view"(%246, %200, %200, %202, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%242, %317) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %318 = "arith.constant"() <{value = 2 : index}> : () -> index + %319 = "arith.addi"(%287, %318) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%245, %227) 
<{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + %320 = "arith.muli"(%319, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %321 = "arith.constant"() <{value = 0 : index}> : () -> index + %322 = "arith.addi"(%320, %321) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %323 = "pto.partition_view"(%232, %322, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%323, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %324 = "arith.constant"() <{value = 0 : index}> : () -> index + %325 = "pto.partition_view"(%245, %200, %324, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%325, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %326 = "arith.muli"(%319, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %327 = "arith.constant"() <{value = 128 : index}> : () -> index + %328 = "arith.addi"(%326, %327) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %329 = "pto.partition_view"(%232, %328, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%329, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %330 = "arith.constant"() <{value = 128 : index}> : () -> index + %331 = 
"pto.partition_view"(%245, %200, %330, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%331, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul.acc"(%242, %239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tfree"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + "pto.talloc"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %332 = "pto.partition_view"(%246, %200, %200, %202, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%242, %332) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + "func.return"() : () -> () + }) {pto.kernel_kind = #pto.kernel_kind} : () -> () + "func.func"() <{function_type = (!pto.ptr, !pto.ptr, !pto.ptr, i64, i64) -> (), sym_name = "vector_kernel"}> ({ + ^bb0(%arg9: !pto.ptr, %arg10: !pto.ptr, %arg11: !pto.ptr, %arg12: i64, %arg13: i64): + %0 = "arith.constant"() <{value = 0 : index}> : () -> index + %1 = "arith.constant"() <{value = 1 : index}> : () -> index + %2 = "arith.constant"() <{value = 128 : index}> : () -> index + %3 = "arith.constant"() <{value = 64 : index}> : () -> index + %4 = "arith.constant"() <{value = 32 : index}> : () -> index + %5 = "arith.constant"() <{value = 128 : index}> : () -> index + %6 = "arith.constant"() <{value = 256 : 
index}> : () -> index + %7 = "arith.constant"() <{value = 128 : index}> : () -> index + %8 = "arith.constant"() <{value = 524288 : index}> : () -> index + %9 = "arith.constant"() <{value = 1048576 : index}> : () -> index + %10 = "pto.get_block_idx"() : () -> i64 + %11 = "arith.index_cast"(%10) : (i64) -> index + %12 = "pto.get_subblock_idx"() : () -> i64 + %13 = "arith.index_cast"(%12) : (i64) -> index + %14 = "arith.index_cast"(%arg12) : (i64) -> index + %15 = "arith.index_cast"(%arg13) : (i64) -> index + %16 = "arith.divsi"(%15, %6) : (index, index) -> index + %17 = "arith.muli"(%11, %2) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %18 = "arith.muli"(%13, %3) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %19 = "arith.addi"(%17, %18) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %20 = "arith.muli"(%11, %8) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %21 = "pto.addptr"(%arg9, %20) : (!pto.ptr, index) -> !pto.ptr + %22 = "arith.muli"(%11, %9) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %23 = "pto.addptr"(%arg10, %22) : (!pto.ptr, index) -> !pto.ptr + %24 = "arith.constant"() <{value = 0 : index}> : () -> index + %25 = "pto.addptr"(%21, %24) : (!pto.ptr, index) -> !pto.ptr + %26 = "arith.constant"() <{value = 524288 : index}> : () -> index + %27 = "pto.addptr"(%23, %26) : (!pto.ptr, index) -> !pto.ptr + %28 = "arith.constant"() <{value = 393216 : index}> : () -> index + %29 = "pto.addptr"(%21, %28) : (!pto.ptr, index) -> !pto.ptr + %30 = "pto.make_tensor_view"(%25, %3, %6, %6, %1) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view<64x256xf32> + %31 = "pto.initialize_l2g2l_pipe"(%30) <{dir_mask = 1 : i8, flag_base = 0 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 131072 : i32}> : (!pto.tensor_view<64x256xf32>) -> !pto.pipe + %32 = "pto.make_tensor_view"(%27, %3, %6, %6, %1) <{operandSegmentSizes = array}> : 
(!pto.ptr, index, index, index, index) -> !pto.tensor_view<64x256xf16> + %33 = "pto.initialize_l2g2l_pipe"(%32) <{dir_mask = 2 : i8, flag_base = 2 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 65536 : i32}> : (!pto.tensor_view<64x256xf16>) -> !pto.pipe + %34 = "pto.make_tensor_view"(%29, %3, %5, %5, %1) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view<64x128xf32> + %35 = "pto.initialize_l2g2l_pipe"(%34) <{dir_mask = 1 : i8, flag_base = 4 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 65536 : i32}> : (!pto.tensor_view<64x128xf32>) -> !pto.pipe + %36 = "pto.make_tensor_view"(%arg11, %14, %5, %5, %1) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view + %37 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %38 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %39 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %40 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %41 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %42 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %43 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %44 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %45 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %46 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %47 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %48 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %49 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %50 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %51 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf 
+ %52 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %53 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %54 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %55 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %56 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %57 = "arith.constant"() <{value = 0.0883883461 : f32}> : () -> f32 + %58 = "arith.constant"() <{value = 3 : index}> : () -> index + %59 = "pto.declare_global"() : () -> !pto.tensor_view<64x256xf32> + %60 = "pto.declare_global"() : () -> !pto.tensor_view<64x256xf16> + %61 = "pto.declare_global"() : () -> !pto.tensor_view<64x128xf32> + "pto.tpop"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %62 = "pto.partition_view"(%59, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + %63 = "pto.partition_view"(%59, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + "pto.tload"(%62, %37) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tload"(%63, %38) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tfree"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %45) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %45, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %48) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) 
-> () + "pto.trowmax"(%38, %39, %46) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %46, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %49) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.talloc"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "pto.tcvt"(%37, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %64 = "pto.partition_view"(%60, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %64) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tcvt"(%38, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %65 = "pto.partition_view"(%60, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %65) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tpush"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + %66 = "arith.constant"() <{value = 1 : index}> : () -> index + "pto.tpop"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %67 = "pto.partition_view"(%59, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + %68 = "pto.partition_view"(%59, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + "pto.tload"(%67, %37) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + 
"pto.tload"(%68, %38) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tfree"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %69 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %70 = "pto.treshape"(%52) : (!pto.tile_buf) -> !pto.tile_buf + %71 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %72 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %73 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%69, %71, %69) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%71, %69, %70) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%70, %70) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%69, %71) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%72, %70, %72) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%72, %73, %72) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %74 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %75 = "pto.treshape"(%55) : (!pto.tile_buf) -> !pto.tile_buf + %76 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %77 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %78 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%74, %76, %74) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%76, 
%74, %75) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%75, %75) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%74, %76) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%77, %75, %77) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%77, %78, %77) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.talloc"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "pto.tcvt"(%37, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %79 = "pto.partition_view"(%60, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %79) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tcvt"(%38, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %80 = "pto.partition_view"(%60, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %80) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tpush"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + %81 = "arith.constant"() <{value = 2 : index}> : () -> index + "pto.tpop"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %82 = "pto.partition_view"(%59, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + %83 = 
"pto.partition_view"(%59, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + "pto.tload"(%82, %37) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tload"(%83, %38) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tfree"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %84 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %85 = "pto.treshape"(%53) : (!pto.tile_buf) -> !pto.tile_buf + %86 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %87 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %88 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%84, %86, %84) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%86, %84, %85) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%85, %85) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%84, %86) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%87, %85, %87) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%87, %88, %87) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %89 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %90 = "pto.treshape"(%56) : (!pto.tile_buf) -> 
!pto.tile_buf + %91 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %92 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %93 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%89, %91, %89) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%91, %89, %90) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%90, %90) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%89, %91) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%92, %90, %92) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%92, %93, %92) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.talloc"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "pto.tcvt"(%37, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %94 = "pto.partition_view"(%60, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %94) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tcvt"(%38, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %95 = "pto.partition_view"(%60, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %95) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tpush"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + %96 = "arith.constant"() <{value = 3 : 
index}> : () -> index + %97 = "arith.subi"(%16, %96) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %98 = "arith.cmpi"(%97, %0) <{predicate = 4 : i64}> : (index, index) -> i1 + "scf.if"(%98) ({ + "pto.tpop"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %184 = "pto.partition_view"(%61, %0, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + %185 = "pto.partition_view"(%61, %4, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tload"(%184, %43) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tload"(%185, %44) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tmov"(%43, %41) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%44, %42) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tfree"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + "pto.tpop"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %186 = "pto.partition_view"(%59, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + %187 = "pto.partition_view"(%59, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + "pto.tload"(%186, %37) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tload"(%187, %38) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tfree"(%59, %31) <{split = 1 : i8}> : 
(!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %188 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %189 = "pto.treshape"(%51) : (!pto.tile_buf) -> !pto.tile_buf + %190 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %191 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %192 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%188, %190, %188) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%190, %188, %189) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%189, %189) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%188, %190) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%191, %189, %191) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%191, %192, %191) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %193 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %194 = "pto.treshape"(%54) : (!pto.tile_buf) -> !pto.tile_buf + %195 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %196 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %197 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%193, %195, %193) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%195, %193, %194) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%194, %194) : (!pto.tile_buf, !pto.tile_buf) -> () + 
"pto.tmov"(%193, %195) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%196, %194, %196) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%196, %197, %196) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.talloc"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "pto.tcvt"(%37, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %198 = "pto.partition_view"(%60, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %198) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tcvt"(%38, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %199 = "pto.partition_view"(%60, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %199) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tpush"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "scf.yield"() : () -> () + }, { + }) : (i1) -> () + "scf.for"(%1, %97, %1) ({ + ^bb0(%arg14: index): + %135 = "arith.addi"(%arg14, %96) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %136 = "pto.partition_view"(%61, %0, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> 
!pto.partition_tensor_view<32x128xf32> + %137 = "pto.partition_view"(%61, %4, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tload"(%136, %43) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tload"(%137, %44) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + %138 = "arith.remsi"(%arg14, %58) : (index, index) -> index + %139 = "arith.constant"() <{value = 0 : index}> : () -> index + %140 = "arith.cmpi"(%138, %139) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%140) ({ + "pto.trowexpandmul"(%41, %51, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %54, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %181 = "arith.remsi"(%arg14, %58) : (index, index) -> index + %182 = "arith.constant"() <{value = 1 : index}> : () -> index + %183 = "arith.cmpi"(%181, %182) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%183) ({ + "pto.trowexpandmul"(%41, %52, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %55, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + "pto.trowexpandmul"(%41, %53, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %56, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, 
!pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "pto.tfree"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + "pto.tpop"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %141 = "pto.partition_view"(%59, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + %142 = "pto.partition_view"(%59, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + "pto.tload"(%141, %37) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tload"(%142, %38) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tfree"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %143 = "arith.remsi"(%135, %58) : (index, index) -> index + %144 = "arith.constant"() <{value = 0 : index}> : () -> index + %145 = "arith.cmpi"(%143, %144) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%145) ({ + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %171 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %172 = "pto.treshape"(%51) : (!pto.tile_buf) -> !pto.tile_buf + %173 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %174 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %175 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%171, %173, %171) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%173, %171, %172) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%172, %172) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%171, %173) 
<{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%174, %172, %174) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%174, %175, %174) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %176 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %177 = "pto.treshape"(%54) : (!pto.tile_buf) -> !pto.tile_buf + %178 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %179 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %180 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%176, %178, %176) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%178, %176, %177) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%177, %177) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%176, %178) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%179, %177, %179) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%179, %180, %179) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %148 = "arith.remsi"(%135, %58) : (index, index) -> index + %149 = "arith.constant"() <{value = 1 : index}> : () -> index + %150 = "arith.cmpi"(%148, %149) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%150) ({ + 
"pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %161 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %162 = "pto.treshape"(%52) : (!pto.tile_buf) -> !pto.tile_buf + %163 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %164 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %165 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%161, %163, %161) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%163, %161, %162) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%162, %162) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%161, %163) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%164, %162, %164) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%164, %165, %164) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %166 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %167 = "pto.treshape"(%55) : (!pto.tile_buf) -> !pto.tile_buf + %168 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %169 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %170 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%166, %168, %166) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%168, %166, %167) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%167, %167) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%166, %168) <{operandSegmentSizes = 
array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%169, %167, %169) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%169, %170, %169) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %151 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %152 = "pto.treshape"(%53) : (!pto.tile_buf) -> !pto.tile_buf + %153 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %154 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %155 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%151, %153, %151) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%153, %151, %152) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%152, %152) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%151, %153) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%154, %152, %154) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%154, %155, %154) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %156 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %157 = "pto.treshape"(%56) : 
(!pto.tile_buf) -> !pto.tile_buf + %158 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %159 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %160 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%156, %158, %156) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%158, %156, %157) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%157, %157) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%156, %158) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%159, %157, %159) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%159, %160, %159) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "pto.talloc"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "pto.tcvt"(%37, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %146 = "pto.partition_view"(%60, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %146) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tcvt"(%38, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %147 = "pto.partition_view"(%60, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %147) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + 
"pto.tpush"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "scf.yield"() : () -> () + }) : (index, index, index) -> () + %99 = "arith.constant"() <{value = 0 : index}> : () -> index + %100 = "arith.addi"(%97, %99) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %101 = "pto.partition_view"(%61, %0, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + %102 = "pto.partition_view"(%61, %4, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tload"(%101, %43) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tload"(%102, %44) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + %103 = "arith.cmpi"(%100, %0) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%103) ({ + "pto.tmov"(%43, %41) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%44, %42) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %129 = "arith.remsi"(%100, %58) : (index, index) -> index + %130 = "arith.constant"() <{value = 0 : index}> : () -> index + %131 = "arith.cmpi"(%129, %130) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%131) ({ + "pto.trowexpandmul"(%41, %51, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %54, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %132 = 
"arith.remsi"(%100, %58) : (index, index) -> index + %133 = "arith.constant"() <{value = 1 : index}> : () -> index + %134 = "arith.cmpi"(%132, %133) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%134) ({ + "pto.trowexpandmul"(%41, %52, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %55, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + "pto.trowexpandmul"(%41, %53, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %56, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "pto.tfree"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %104 = "arith.constant"() <{value = 1 : index}> : () -> index + %105 = "arith.addi"(%97, %104) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %106 = "pto.partition_view"(%61, %0, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + %107 = "pto.partition_view"(%61, %4, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tload"(%106, %43) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tload"(%107, %44) <{operandSegmentSizes = array}> : 
(!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + %108 = "arith.cmpi"(%105, %0) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%108) ({ + "pto.tmov"(%43, %41) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%44, %42) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %123 = "arith.remsi"(%105, %58) : (index, index) -> index + %124 = "arith.constant"() <{value = 0 : index}> : () -> index + %125 = "arith.cmpi"(%123, %124) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%125) ({ + "pto.trowexpandmul"(%41, %51, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %54, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %126 = "arith.remsi"(%105, %58) : (index, index) -> index + %127 = "arith.constant"() <{value = 1 : index}> : () -> index + %128 = "arith.cmpi"(%126, %127) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%128) ({ + "pto.trowexpandmul"(%41, %52, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %55, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + "pto.trowexpandmul"(%41, %53, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %56, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () 
-> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "pto.tfree"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %109 = "arith.constant"() <{value = 2 : index}> : () -> index + %110 = "arith.addi"(%97, %109) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %111 = "pto.partition_view"(%61, %0, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + %112 = "pto.partition_view"(%61, %4, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tload"(%111, %43) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tload"(%112, %44) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + %113 = "arith.cmpi"(%110, %0) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%113) ({ + "pto.tmov"(%43, %41) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%44, %42) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %117 = "arith.remsi"(%110, %58) : (index, index) -> index + %118 = "arith.constant"() <{value = 0 : index}> : () -> index + %119 = "arith.cmpi"(%117, %118) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%119) ({ + "pto.trowexpandmul"(%41, %51, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %54, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) 
-> () + "scf.yield"() : () -> () + }, { + %120 = "arith.remsi"(%110, %58) : (index, index) -> index + %121 = "arith.constant"() <{value = 1 : index}> : () -> index + %122 = "arith.cmpi"(%120, %121) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%122) ({ + "pto.trowexpandmul"(%41, %52, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %55, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + "pto.trowexpandmul"(%41, %53, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %56, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "pto.tfree"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + "pto.trowexpanddiv"(%41, %48, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpanddiv"(%42, %49, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %114 = "pto.partition_view"(%36, %19, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tstore"(%41, %114) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x128xf32>) -> () + %115 = "arith.addi"(%19, %4) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %116 = "pto.partition_view"(%36, %115, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + 
"pto.tstore"(%42, %116) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x128xf32>) -> () + "func.return"() : () -> () + }) {pto.kernel_kind = #pto.kernel_kind} : () -> () + "func.func"() <{function_type = (memref<256xi64>, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, i64, i64) -> (), sym_name = "call_both"}> ({ + ^bb0(%arg0: memref<256xi64>, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: i64, %arg8: i64): + "pto.set_ffts"(%arg0) : (memref<256xi64>) -> () + "func.call"(%arg1, %arg2, %arg3, %arg4, %arg5, %arg7, %arg8) <{callee = @cube_kernel}> : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, i64, i64) -> () + "func.call"(%arg1, %arg2, %arg6, %arg7, %arg8) <{callee = @vector_kernel}> : (!pto.ptr, !pto.ptr, !pto.ptr, i64, i64) -> () + "func.return"() : () -> () + }) {pto.entry} : () -> () +}) : () -> () + diff --git a/test/samples/FlashAttention/compile_and_run/fa_patched_s1_256_q3072_s0_8192.pto b/test/samples/FlashAttention/compile_and_run/fa_patched_s1_256_q3072_s0_8192.pto new file mode 100644 index 000000000..df4401dd9 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/fa_patched_s1_256_q3072_s0_8192.pto @@ -0,0 +1,402 @@ +module { + func.func @cube_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c8192 = arith.constant 8192 : index + %c32_0 = arith.constant 32 : index + %c96 = arith.constant 96 : index + %0 = pto.get_block_num + %1 = arith.index_cast %0 : i64 to index + %2 = pto.get_block_idx + %3 = arith.index_cast %2 : i64 to index + %4 = arith.divsi %c96, %1 : index + %5 = arith.remsi %c96, %1 : index + %6 = arith.addi %4, %c1 : 
index + %7 = arith.muli %3, %6 : index + %8 = arith.addi %4, %c1 : index + %9 = arith.muli %5, %8 : index + %10 = arith.subi %3, %5 : index + %11 = arith.muli %10, %4 : index + %12 = arith.addi %9, %11 : index + %13 = arith.cmpi slt, %3, %5 : index + %14 = arith.select %13, %7, %12 : index + %15 = arith.cmpi slt, %3, %5 : index + %16 = arith.addi %4, %c1 : index + %17 = arith.select %15, %16, %4 : index + %18 = arith.addi %14, %17 : index + %c131072 = arith.constant 131072 : index + %19 = arith.muli %3, %c131072 : index + %20 = pto.addptr %arg0, %19 : -> + %c0_1 = arith.constant 0 : index + %21 = pto.addptr %20, %c0_1 : -> + %c65536 = arith.constant 65536 : index + %22 = pto.addptr %20, %c65536 : -> + %c98304 = arith.constant 98304 : index + %23 = pto.addptr %20, %c98304 : -> + %24 = pto.import_reserved_buffer{name = "fa_qk_c2v_fifo", peer_func = @vector_kernel} -> i32 + %25 = pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 32768, slot_num = 8, local_slot_num = 1} (%21 : !pto.ptr, %24 : i32) -> !pto.pipe + %26 = pto.import_reserved_buffer{name = "fa_pv_c2v_fifo", peer_func = @vector_kernel} -> i32 + %27 = pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 16384, slot_num = 8, local_slot_num = 1} (%22 : !pto.ptr, %26 : i32) -> !pto.pipe + %28 = pto.reserve_buffer{name = "fa_p_v2c_fifo", size = 16384, location = , auto = false, base = 262144} -> i32 + %c0_i32 = arith.constant 0 : i32 + pto.aic_initialize_pipe {id = 30, dir_mask = 2, slot_size = 16384, local_slot_num = 1, nosplit = false}(gm_slot_buffer = %23 : !pto.ptr, c2v_consumer_buf = %c0_i32 : i32, v2c_consumer_buf = %28 : i32) + %c0_i64 = arith.constant 0 : i64 + %c0_i64_2 = arith.constant 0 : i64 + %29 = pto.alloc_tile addr = %c0_i64_2 : !pto.tile_buf + %c0_i64_3 = arith.constant 0 : i64 + %30 = pto.alloc_tile addr = %c0_i64_3 : !pto.tile_buf + %c8192_i64 = arith.constant 8192 : i64 + %31 = pto.alloc_tile addr = %c8192_i64 : !pto.tile_buf + %32 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %c0_i64_4 = 
arith.constant 0 : i64 + %33 = pto.alloc_tile addr = %c0_i64_4 : !pto.tile_buf + %c73728_i64 = arith.constant 73728 : i64 + %34 = pto.alloc_tile addr = %c73728_i64 : !pto.tile_buf + %c8192_i64_5 = arith.constant 8192 : i64 + %35 = pto.alloc_tile addr = %c8192_i64_5 : !pto.tile_buf + %c90112_i64 = arith.constant 90112 : i64 + %36 = pto.alloc_tile addr = %c90112_i64 : !pto.tile_buf + %37 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %c32768_i64 = arith.constant 32768 : i64 + %38 = pto.alloc_tile addr = %c32768_i64 : !pto.tile_buf + %c3072 = arith.constant 3072 : index + %39 = pto.make_tensor_view %arg1, shape = [%c3072, %c128], strides = [%c128, %c1] : !pto.tensor_view + %40 = pto.make_tensor_view %arg2, shape = [%c128, %c8192], strides = [%c1, %c128] : !pto.tensor_view + %41 = pto.make_tensor_view %arg3, shape = [%c8192, %c128], strides = [%c128, %c1] : !pto.tensor_view + scf.for %arg4 = %14 to %18 step %c1 { + %42 = arith.muli %arg4, %c32 : index + %43 = pto.partition_view %39, offsets = [%42, %c0], sizes = [%c32, %c128] : !pto.tensor_view + pto.tload ins(%43 : !pto.partition_tensor_view<32x128xf16>) outs(%29 : !pto.tile_buf) + pto.tmov ins(%29 : !pto.tile_buf) outs(%30 : !pto.tile_buf) + %c0_6 = arith.constant 0 : index + %44 = pto.partition_view %40, offsets = [%c0, %c0_6], sizes = [%c128, %c256] : !pto.tensor_view + pto.tload ins(%44 : !pto.partition_tensor_view<128x256xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%32 : !pto.tile_buf) + pto.tmatmul ins(%30, %32 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tpush(%33, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + %c256_7 = arith.constant 256 : index + %45 = pto.partition_view %40, offsets = [%c0, %c256_7], sizes = [%c128, %c256] : !pto.tensor_view + pto.tload ins(%45 : !pto.partition_tensor_view<128x256xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%32 : !pto.tile_buf) + pto.tmatmul ins(%30, %32 : !pto.tile_buf, !pto.tile_buf) outs(%33 : 
!pto.tile_buf) + pto.tpush(%33, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + %46 = pto.partition_view %41, offsets = [%c0, %c0], sizes = [%c256, %c128] : !pto.tensor_view + pto.tload ins(%46 : !pto.partition_tensor_view<256x128xf16>) outs(%36 : !pto.tile_buf) + %c15 = arith.constant 15 : index + scf.for %arg5 = %c0 to %c15 step %c1 { + %50 = arith.muli %arg5, %c2 : index + %c2_8 = arith.constant 2 : index + %51 = arith.addi %50, %c2_8 : index + %52 = arith.muli %51, %c256 : index + %53 = pto.partition_view %40, offsets = [%c0, %52], sizes = [%c128, %c256] : !pto.tensor_view + pto.tload ins(%53 : !pto.partition_tensor_view<128x256xf16>) outs(%31 : !pto.tile_buf) + %54 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%54 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aiv {id = 30, split = 1} + pto.tmov ins(%36 : !pto.tile_buf) outs(%37 : !pto.tile_buf) + %55 = arith.addi %50, %c1 : index + %56 = arith.muli %55, %c256 : index + %57 = pto.partition_view %41, offsets = [%56, %c0], sizes = [%c256, %c128] : !pto.tensor_view + pto.tload ins(%57 : !pto.partition_tensor_view<256x128xf16>) outs(%36 : !pto.tile_buf) + pto.tmatmul ins(%35, %37 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tpush(%38, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmov ins(%31 : !pto.tile_buf) outs(%32 : !pto.tile_buf) + pto.tmatmul ins(%30, %32 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tpush(%33, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + %58 = arith.muli %arg5, %c2 : index + %59 = arith.addi %58, %c1 : index + %c2_9 = arith.constant 2 : index + %60 = arith.addi %59, %c2_9 : index + %61 = arith.muli %60, %c256 : index + %62 = pto.partition_view %40, offsets = [%c0, %61], sizes = [%c128, %c256] : !pto.tensor_view + pto.tload ins(%62 : !pto.partition_tensor_view<128x256xf16>) outs(%31 : !pto.tile_buf) + %63 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%63 : !pto.tile_buf) outs(%35 : 
!pto.tile_buf) + pto.tfree_from_aiv {id = 30, split = 1} + pto.tmov ins(%36 : !pto.tile_buf) outs(%37 : !pto.tile_buf) + %64 = arith.addi %59, %c1 : index + %65 = arith.muli %64, %c256 : index + %66 = pto.partition_view %41, offsets = [%65, %c0], sizes = [%c256, %c128] : !pto.tensor_view + pto.tload ins(%66 : !pto.partition_tensor_view<256x128xf16>) outs(%36 : !pto.tile_buf) + pto.tmatmul ins(%35, %37 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tpush(%38, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmov ins(%31 : !pto.tile_buf) outs(%32 : !pto.tile_buf) + pto.tmatmul ins(%30, %32 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tpush(%33, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + } + %47 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%47 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aiv {id = 30, split = 1} + pto.tmov ins(%36 : !pto.tile_buf) outs(%37 : !pto.tile_buf) + %c7936 = arith.constant 7936 : index + %48 = pto.partition_view %41, offsets = [%c7936, %c0], sizes = [%c256, %c128] : !pto.tensor_view + pto.tload ins(%48 : !pto.partition_tensor_view<256x128xf16>) outs(%36 : !pto.tile_buf) + pto.tmatmul ins(%35, %37 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tpush(%38, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + %49 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%49 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aiv {id = 30, split = 1} + pto.tmov ins(%36 : !pto.tile_buf) outs(%37 : !pto.tile_buf) + pto.tmatmul ins(%35, %37 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tpush(%38, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + } + return + } + func.func @vector_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %c16 = 
arith.constant 16 : index + %c128 = arith.constant 128 : index + %c32_0 = arith.constant 32 : index + %c96 = arith.constant 96 : index + %0 = pto.get_block_num + %1 = arith.index_cast %0 : i64 to index + %2 = pto.get_block_idx + %3 = arith.index_cast %2 : i64 to index + %4 = arith.divsi %c96, %1 : index + %5 = arith.remsi %c96, %1 : index + %6 = arith.addi %4, %c1 : index + %7 = arith.muli %3, %6 : index + %8 = arith.addi %4, %c1 : index + %9 = arith.muli %5, %8 : index + %10 = arith.subi %3, %5 : index + %11 = arith.muli %10, %4 : index + %12 = arith.addi %9, %11 : index + %13 = arith.cmpi slt, %3, %5 : index + %14 = arith.select %13, %7, %12 : index + %15 = arith.cmpi slt, %3, %5 : index + %16 = arith.addi %4, %c1 : index + %17 = arith.select %15, %16, %4 : index + %18 = arith.addi %14, %17 : index + %c131072 = arith.constant 131072 : index + %19 = arith.muli %3, %c131072 : index + %20 = pto.addptr %arg0, %19 : -> + %c0_1 = arith.constant 0 : index + %21 = pto.addptr %20, %c0_1 : -> + %c65536 = arith.constant 65536 : index + %22 = pto.addptr %20, %c65536 : -> + %c98304 = arith.constant 98304 : index + %23 = pto.addptr %20, %c98304 : -> + %24 = pto.reserve_buffer{name = "fa_qk_c2v_fifo", size = 32768, location = , auto = false, base = 0} -> i32 + %25 = pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 32768, slot_num = 8, local_slot_num = 1} (%21 : !pto.ptr, %24 : i32) -> !pto.pipe + %26 = pto.reserve_buffer{name = "fa_pv_c2v_fifo", size = 16384, location = , auto = false, base = 32768} -> i32 + %27 = pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 16384, slot_num = 8, local_slot_num = 1} (%22 : !pto.ptr, %26 : i32) -> !pto.pipe + %28 = pto.import_reserved_buffer{name = "fa_p_v2c_fifo", peer_func = @cube_kernel} -> i32 + %c0_i32 = arith.constant 0 : i32 + pto.aiv_initialize_pipe {id = 30, dir_mask = 2, slot_size = 16384, local_slot_num = 1, nosplit = false}(gm_slot_buffer = %23 : !pto.ptr, c2v_consumer_buf = %c0_i32 : i32, v2c_consumer_buf = %28 : i32) + %29 
= pto.get_subblock_idx + %30 = arith.index_cast %29 : i64 to index + %31 = arith.muli %30, %c16 : index + %c49152_i64 = arith.constant 49152 : i64 + %32 = pto.alloc_tile addr = %c49152_i64 : !pto.tile_buf + %c65536_i64 = arith.constant 65536 : i64 + %33 = pto.alloc_tile addr = %c65536_i64 : !pto.tile_buf + %c81920_i64 = arith.constant 81920 : i64 + %34 = pto.alloc_tile addr = %c81920_i64 : !pto.tile_buf + %c90112_i64 = arith.constant 90112 : i64 + %35 = pto.alloc_tile addr = %c90112_i64 : !pto.tile_buf + %c98304_i64 = arith.constant 98304 : i64 + %36 = pto.alloc_tile addr = %c98304_i64 : !pto.tile_buf + %c98816_i64 = arith.constant 98816 : i64 + %37 = pto.alloc_tile addr = %c98816_i64 : !pto.tile_buf + %c99328_i64 = arith.constant 99328 : i64 + %38 = pto.alloc_tile addr = %c99328_i64 : !pto.tile_buf + %c99840_i64 = arith.constant 99840 : i64 + %39 = pto.alloc_tile addr = %c99840_i64 : !pto.tile_buf + %c100352_i64 = arith.constant 100352 : i64 + %40 = pto.alloc_tile addr = %c100352_i64 : !pto.tile_buf + %c100864_i64 = arith.constant 100864 : i64 + %41 = pto.alloc_tile addr = %c100864_i64 : !pto.tile_buf + %cst = arith.constant 0.0883883461 : f32 + %cst_2 = arith.constant 1.000000e+00 : f32 + %c3072 = arith.constant 3072 : index + %42 = pto.make_tensor_view %arg1, shape = [%c3072, %c128], strides = [%c128, %c1] : !pto.tensor_view + scf.for %arg2 = %14 to %18 step %c1 { + %43 = arith.muli %arg2, %c32 : index + %c101376_i64 = arith.constant 101376 : i64 + %44 = pto.alloc_tile addr = %c101376_i64 : !pto.tile_buf + pto.tpop(%44, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%44, %cst : !pto.tile_buf, f32) outs(%44 : !pto.tile_buf) + pto.trowmax ins(%44, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %45 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %46 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %47 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf + %48 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %49 = 
pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.trowexpandsub ins(%44, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmuls ins(%45, %cst_2 : !pto.tile_buf, f32) outs(%46 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%25 : !pto.pipe) {split = 1} + %c101376_i64_3 = arith.constant 101376 : i64 + %50 = pto.alloc_tile addr = %c101376_i64_3 : !pto.tile_buf + pto.tpop(%50, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%50, %cst : !pto.tile_buf, f32) outs(%50 : !pto.tile_buf) + pto.trowmax ins(%50, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %51 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %52 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %53 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf + %54 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %55 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%51, %52 : !pto.tile_buf, !pto.tile_buf) outs(%51 : !pto.tile_buf) + pto.tsub ins(%52, %51 : !pto.tile_buf, !pto.tile_buf) outs(%53 : !pto.tile_buf) + pto.tmuls ins(%51, %cst_2 : !pto.tile_buf, f32) outs(%52 : !pto.tile_buf) + pto.trowexpandsub ins(%50, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%53 : !pto.tile_buf) outs(%53 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%54, %53 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%54, %55 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 
1} + pto.tfree(%25 : !pto.pipe) {split = 1} + %c101376_i64_4 = arith.constant 101376 : i64 + %56 = pto.alloc_tile addr = %c101376_i64_4 : !pto.tile_buf + pto.tpop(%56, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmov ins(%56 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + %c101376_i64_5 = arith.constant 101376 : i64 + %57 = pto.alloc_tile addr = %c101376_i64_5 : !pto.tile_buf + pto.tpop(%57, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%57, %cst : !pto.tile_buf, f32) outs(%57 : !pto.tile_buf) + pto.trowmax ins(%57, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %58 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %59 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %60 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf + %61 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %62 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%58, %59 : !pto.tile_buf, !pto.tile_buf) outs(%58 : !pto.tile_buf) + pto.tsub ins(%59, %58 : !pto.tile_buf, !pto.tile_buf) outs(%60 : !pto.tile_buf) + pto.tmuls ins(%58, %cst_2 : !pto.tile_buf, f32) outs(%59 : !pto.tile_buf) + pto.trowexpandsub ins(%57, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%60 : !pto.tile_buf) outs(%60 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%61, %60 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%61, %62 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%25 : !pto.pipe) {split = 1} + %c101376_i64_6 = arith.constant 101376 : i64 + %63 = pto.alloc_tile addr = %c101376_i64_6 : !pto.tile_buf + pto.tpop(%63, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + 
pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %63 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + %c101376_i64_7 = arith.constant 101376 : i64 + %64 = pto.alloc_tile addr = %c101376_i64_7 : !pto.tile_buf + pto.tpop(%64, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%64, %cst : !pto.tile_buf, f32) outs(%64 : !pto.tile_buf) + pto.trowmax ins(%64, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %65 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %66 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %67 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf + %68 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %69 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%65, %66 : !pto.tile_buf, !pto.tile_buf) outs(%65 : !pto.tile_buf) + pto.tsub ins(%66, %65 : !pto.tile_buf, !pto.tile_buf) outs(%67 : !pto.tile_buf) + pto.tmuls ins(%65, %cst_2 : !pto.tile_buf, f32) outs(%66 : !pto.tile_buf) + pto.trowexpandsub ins(%64, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%67 : !pto.tile_buf) outs(%67 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%68, %67 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%68, %69 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%25 : !pto.pipe) {split = 1} + %c15 = arith.constant 15 : index + scf.for %arg3 = %c1 to %c15 step %c1 { + %c101376_i64_10 = arith.constant 101376 : i64 + %74 = pto.alloc_tile addr = %c101376_i64_10 : !pto.tile_buf + pto.tpop(%74, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.trowexpandmul ins(%35, %40 : 
!pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %74 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + %c101376_i64_11 = arith.constant 101376 : i64 + %75 = pto.alloc_tile addr = %c101376_i64_11 : !pto.tile_buf + pto.tpop(%75, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%75, %cst : !pto.tile_buf, f32) outs(%75 : !pto.tile_buf) + pto.trowmax ins(%75, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %76 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %77 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %78 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf + %79 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %80 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%76, %77 : !pto.tile_buf, !pto.tile_buf) outs(%76 : !pto.tile_buf) + pto.tsub ins(%77, %76 : !pto.tile_buf, !pto.tile_buf) outs(%78 : !pto.tile_buf) + pto.tmuls ins(%76, %cst_2 : !pto.tile_buf, f32) outs(%77 : !pto.tile_buf) + pto.trowexpandsub ins(%75, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%78 : !pto.tile_buf) outs(%78 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%79, %78 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%79, %80 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%25 : !pto.pipe) {split = 1} + %c101376_i64_12 = arith.constant 101376 : i64 + %81 = pto.alloc_tile addr = %c101376_i64_12 : !pto.tile_buf + pto.tpop(%81, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %81 : !pto.tile_buf, !pto.tile_buf) 
outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + %c101376_i64_13 = arith.constant 101376 : i64 + %82 = pto.alloc_tile addr = %c101376_i64_13 : !pto.tile_buf + pto.tpop(%82, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%82, %cst : !pto.tile_buf, f32) outs(%82 : !pto.tile_buf) + pto.trowmax ins(%82, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %83 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %84 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %85 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf + %86 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %87 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%83, %84 : !pto.tile_buf, !pto.tile_buf) outs(%83 : !pto.tile_buf) + pto.tsub ins(%84, %83 : !pto.tile_buf, !pto.tile_buf) outs(%85 : !pto.tile_buf) + pto.tmuls ins(%83, %cst_2 : !pto.tile_buf, f32) outs(%84 : !pto.tile_buf) + pto.trowexpandsub ins(%82, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%85 : !pto.tile_buf) outs(%85 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%86, %85 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%86, %87 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%25 : !pto.pipe) {split = 1} + } + %c101376_i64_8 = arith.constant 101376 : i64 + %70 = pto.alloc_tile addr = %c101376_i64_8 : !pto.tile_buf + pto.tpop(%70, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.trowexpandmul ins(%35, %40 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %70 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + %c101376_i64_9 = arith.constant 101376 : 
i64 + %71 = pto.alloc_tile addr = %c101376_i64_9 : !pto.tile_buf + pto.tpop(%71, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %71 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + pto.trowexpanddiv ins(%35, %38 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + %72 = arith.addi %43, %31 : index + %73 = pto.partition_view %42, offsets = [%72, %c0], sizes = [%c16, %c128] : !pto.tensor_view + pto.tstore ins(%35 : !pto.tile_buf) outs(%73 : !pto.partition_tensor_view<16x128xf32>) + } + return + } + func.func @call_both(%arg0: memref<256xi64>, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.entry} { + pto.set_ffts %arg0 : memref<256xi64> + call @cube_kernel(%arg1, %arg2, %arg3, %arg4) : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr) -> () + call @vector_kernel(%arg1, %arg5) : (!pto.ptr, !pto.ptr) -> () + return + } +} + +