From b87ccf3b9cbc1db1d5d1c2a7a03671596ca21809 Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Thu, 7 May 2026 13:41:44 +0200 Subject: [PATCH] added benchmark and mlirs for 140tflops dls flash attention and ptoas mlir version --- .../FlashAttention/compile_and_run/README.md | 36 + .../benchmark_flashattention.py | 167 ++++ .../FlashAttention/compile_and_run/caller.cpp | 29 + .../compile_and_run/caller_140tflops.cpp | 30 + .../compile_and_run/compile_flashattention.sh | 54 ++ .../compile_and_run/fa_140tflops.pto | 828 ++++++++++++++++++ .../fa_patched_s1_256_q3072_s0_8192.pto | 402 +++++++++ 7 files changed, 1546 insertions(+) create mode 100644 test/samples/FlashAttention/compile_and_run/README.md create mode 100644 test/samples/FlashAttention/compile_and_run/benchmark_flashattention.py create mode 100644 test/samples/FlashAttention/compile_and_run/caller.cpp create mode 100644 test/samples/FlashAttention/compile_and_run/caller_140tflops.cpp create mode 100755 test/samples/FlashAttention/compile_and_run/compile_flashattention.sh create mode 100644 test/samples/FlashAttention/compile_and_run/fa_140tflops.pto create mode 100644 test/samples/FlashAttention/compile_and_run/fa_patched_s1_256_q3072_s0_8192.pto diff --git a/test/samples/FlashAttention/compile_and_run/README.md b/test/samples/FlashAttention/compile_and_run/README.md new file mode 100644 index 000000000..8e2bb7913 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/README.md @@ -0,0 +1,36 @@ +# FlashAttention compile and benchmark + +This directory contains two PTO FlashAttention variants: + +- `fa_140tflops.pto` +- `fa_patched_s1_256_q3072_s0_8192.pto` + +## Requirements + +- Run inside the configured Ascend/CANN container environment. +- `ptoas` and `bisheng` must already be available in `PATH`. +- `/sources/pto-isa/include` must exist. +- Python benchmark requires `torch_npu==2.9.0`. 
+ +## Compile + +From this directory, run: + +```bash +bash compile_flashattention.sh +``` + +This builds: + +- `/tmp/fa_140tflops.so` +- `/tmp/compiler_team_fa.so` + +## Benchmark + +After compiling, run: + +```bash +python3 benchmark_flashattention.py +``` + +The benchmark compares both PTO kernels against `torch_npu.npu_fused_infer_attention_score`, checks correctness against both fp32 reference attention and torch_npu output, and reports latency, TFLOP/s, and speedup. \ No newline at end of file diff --git a/test/samples/FlashAttention/compile_and_run/benchmark_flashattention.py b/test/samples/FlashAttention/compile_and_run/benchmark_flashattention.py new file mode 100644 index 000000000..79f495ee7 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/benchmark_flashattention.py @@ -0,0 +1,167 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
"""Benchmark two PTO FlashAttention kernels against torch_npu fused attention.

Loads the shared objects built by compile_flashattention.sh, checks their
output against an fp32 reference and against
torch_npu.npu_fused_infer_attention_score, then reports latency, TFLOP/s,
and speedup for each kernel.
"""

import ctypes
import math

import torch
import torch_npu  # noqa: F401

# (name, shared-object path, f32 scratch elements per core, pass (s0, s1) shapes)
KERNELS = [
    ("fa_140tflops", "/tmp/fa_140tflops.so", 524288, True),
    ("patched", "/tmp/compiler_team_fa.so", 229376, False),
]
DEVICE = "npu:0"
WARMUP_ITERS = 10
BENCH_ITERS = 100
NUM_CUBE_CORES = 24
RTOL = 1e-3
ATOL = 1e-3

# Problem shape: Q is (Q_ROWS, HEAD); K and V are (S1_TOTAL, HEAD).
Q_ROWS = 3072
HEAD = 128
S1_TOTAL = 8192
NUM_Q_BLOCKS = Q_ROWS // 32


def load_lib(lib_path, pass_shape):
    """Load a compiled PTO kernel and declare its ``call_kernel`` signature.

    Args:
        lib_path: Path to the shared object built by compile_flashattention.sh.
        pass_shape: When True, the kernel additionally takes (s0, s1) as two
            int64 trailing arguments.

    Returns:
        The loaded ``ctypes.CDLL`` with argtypes/restype configured.
    """
    lib = ctypes.CDLL(lib_path)
    argtypes = [
        ctypes.c_uint32,  # blockDim
        ctypes.c_void_p,  # stream
        ctypes.c_void_p,  # gm scratch buffer
        ctypes.c_void_p,  # q
        ctypes.c_void_p,  # k
        ctypes.c_void_p,  # v
        ctypes.c_void_p,  # o
    ]
    if pass_shape:
        argtypes += [ctypes.c_int64, ctypes.c_int64]
    lib.call_kernel.argtypes = argtypes
    lib.call_kernel.restype = None
    return lib


def ptr(t):
    """Return a tensor's device data pointer as a ctypes void pointer."""
    return ctypes.c_void_p(t.data_ptr())


def fused_attention(q_bsh, k_bsh, v_bsh):
    """Run torch_npu fused attention on (1, S, H)-layout inputs.

    Scale is 1/sqrt(head_dim), matching the fp32 reference below.
    """
    scale = 1.0 / math.sqrt(q_bsh.shape[-1])
    out, _ = torch_npu.npu_fused_infer_attention_score(
        q_bsh,
        k_bsh,
        v_bsh,
        num_heads=1,
        input_layout="BSH",
        scale=scale,
        next_tokens=65535,
    )
    return out


def fa_reference(q, k, v):
    """Compute non-causal attention in fp32 as the correctness reference."""
    scale = 1.0 / math.sqrt(q.shape[1])
    scores = q.float() @ k.float().T * scale
    return torch.softmax(scores, dim=-1) @ v.float()


def run_pto_kernel(lib, pass_shape, block_dim, gm, q, k, v, o):
    """Launch one PTO kernel on the current NPU stream.

    ``gm`` is the per-core global-memory scratch buffer; ``o`` receives the
    fp32 attention output.
    """
    stream = torch.npu.current_stream()._as_parameter_
    args = [block_dim, stream, ptr(gm), ptr(q), ptr(k), ptr(v), ptr(o)]
    if pass_shape:
        args += [q.shape[0], k.shape[0]]
    lib.call_kernel(*args)


def check_close(out_pto, out_fp32, out_torch_npu):
    """Compare the PTO output against both references.

    Returns:
        ("PASSED"|"FAILED", max abs error vs fp32 ref, max abs error vs
        torch_npu output). Max errors are reported even on failure.
    """
    max_err_fp32 = (out_pto - out_fp32).abs().max().item()
    max_err_torch_npu = (out_pto - out_torch_npu).abs().max().item()
    try:
        torch.testing.assert_close(out_pto, out_fp32, rtol=RTOL, atol=ATOL)
        torch.testing.assert_close(out_pto, out_torch_npu, rtol=RTOL, atol=ATOL)
        return "PASSED", max_err_fp32, max_err_torch_npu
    except AssertionError:
        return "FAILED", max_err_fp32, max_err_torch_npu


def bench(fn):
    """Return the mean latency of ``fn`` in microseconds.

    Warms up, then times BENCH_ITERS back-to-back launches with NPU events;
    ``elapsed_time`` is in milliseconds, hence the * 1000.0 to microseconds.
    """
    for _ in range(WARMUP_ITERS):
        fn()
    torch.npu.synchronize()

    start = torch.npu.Event(enable_timing=True)
    end = torch.npu.Event(enable_timing=True)
    start.record()
    for _ in range(BENCH_ITERS):
        fn()
    end.record()
    torch.npu.synchronize()
    return start.elapsed_time(end) * 1000.0 / BENCH_ITERS


def main():
    """Check correctness and benchmark every kernel in KERNELS."""
    device = torch.device(DEVICE)
    block_dim = min(NUM_Q_BLOCKS, NUM_CUBE_CORES)
    # Attention FLOPs: 2*Q*S1*H for Q@K^T plus 2*Q*S1*H for P@V.
    flops = 4 * Q_ROWS * HEAD * S1_TOTAL

    torch.manual_seed(0)
    q = torch.randn((Q_ROWS, HEAD), dtype=torch.float16, device=device)
    k = torch.randn((S1_TOTAL, HEAD), dtype=torch.float16, device=device)
    v = torch.randn((S1_TOTAL, HEAD), dtype=torch.float16, device=device)
    # torch_npu expects BSH layout; add a leading batch dim of 1.
    q_bsh = q.unsqueeze(0)
    k_bsh = k.unsqueeze(0)
    v_bsh = v.unsqueeze(0)

    def run_torch_npu():
        fused_attention(q_bsh, k_bsh, v_bsh)

    out_torch_npu = fused_attention(q_bsh, k_bsh, v_bsh).squeeze(0).float().cpu()
    out_fp32 = fa_reference(q, k, v).float().cpu()
    torch.npu.synchronize()

    torch_npu_us = bench(run_torch_npu)
    torch_npu_tflops = flops / (torch_npu_us * 1e-6) / 1e12

    print(
        f"PTO FA variants vs torch_npu fused attention: Q={Q_ROWS} S1={S1_TOTAL} H={HEAD} "
        f"blockDim={block_dim}"
    )
    print(f"  torch_npu: {torch_npu_us:8.2f} us {torch_npu_tflops:7.3f} TFLOP/s")

    for name, lib_path, gm_elems_per_block, pass_shape in KERNELS:
        lib = load_lib(lib_path, pass_shape)
        gm = torch.zeros(
            (gm_elems_per_block * block_dim,), dtype=torch.float32, device=device
        )
        o = torch.zeros((Q_ROWS, HEAD), dtype=torch.float32, device=device)

        def run_pto():
            run_pto_kernel(lib, pass_shape, block_dim, gm, q, k, v, o)

        # Correctness check against torch_npu fused attention.
        gm.zero_()
        o.zero_()
        run_pto()
        torch.npu.synchronize()
        out_pto = o.float().cpu()
        correctness, max_err_fp32, max_err_torch_npu = check_close(
            out_pto, out_fp32, out_torch_npu
        )

        pto_us = bench(run_pto)
        pto_tflops = flops / (pto_us * 1e-6) / 1e12
        print(
            f"  {name:12s}: {pto_us:8.2f} us {pto_tflops:7.3f} TFLOP/s "
            f"speedup={torch_npu_us / pto_us:.2f}x {correctness} "
            f"max_err(fp32={max_err_fp32:.3e}, torch_npu={max_err_torch_npu:.3e})"
        )


if __name__ == "__main__":
    main()
+ gm.zero_() + o.zero_() + run_pto() + torch.npu.synchronize() + out_pto = o.float().cpu() + correctness, max_err_fp32, max_err_torch_npu = check_close( + out_pto, out_fp32, out_torch_npu + ) + + pto_us = bench(run_pto) + pto_tflops = flops / (pto_us * 1e-6) / 1e12 + print( + f" {name:12s}: {pto_us:8.2f} us {pto_tflops:7.3f} TFLOP/s " + f"speedup={torch_npu_us / pto_us:.2f}x {correctness} " + f"max_err(fp32={max_err_fp32:.3e}, torch_npu={max_err_torch_npu:.3e})" + ) + + +if __name__ == "__main__": + main() diff --git a/test/samples/FlashAttention/compile_and_run/caller.cpp b/test/samples/FlashAttention/compile_and_run/caller.cpp new file mode 100644 index 000000000..02bae6e11 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/caller.cpp @@ -0,0 +1,29 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef KERNEL_CPP +#error "KERNEL_CPP must be defined at compile time." 
+#endif + +extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen); + +#include KERNEL_CPP + +extern "C" void call_kernel(uint32_t blockDim, void *stream, uint8_t *gmSlotBuffer, uint8_t *q, uint8_t *k, uint8_t *v, + uint8_t *o) +{ + void *fftsAddr = nullptr; + uint32_t fftsLen = 0; + (void)rtGetC2cCtrlAddr(reinterpret_cast(&fftsAddr), &fftsLen); + (void)fftsLen; + + call_both<<>>((__gm__ int64_t *)fftsAddr, (__gm__ float *)gmSlotBuffer, + (__gm__ half *)q, (__gm__ half *)k, (__gm__ half *)v, (__gm__ float *)o); +} diff --git a/test/samples/FlashAttention/compile_and_run/caller_140tflops.cpp b/test/samples/FlashAttention/compile_and_run/caller_140tflops.cpp new file mode 100644 index 000000000..faeb3a540 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/caller_140tflops.cpp @@ -0,0 +1,30 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef KERNEL_CPP +#error "KERNEL_CPP must be defined at compile time." 
+#endif + +extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen); + +#include KERNEL_CPP + +extern "C" void call_kernel(uint32_t blockDim, void *stream, uint8_t *gmSlotBuffer, uint8_t *q, uint8_t *k, uint8_t *v, + uint8_t *o, int64_t s0, int64_t s1) +{ + void *fftsAddr = nullptr; + uint32_t fftsLen = 0; + (void)rtGetC2cCtrlAddr(reinterpret_cast(&fftsAddr), &fftsLen); + (void)fftsLen; + + call_both<<>>((__gm__ int64_t *)fftsAddr, (__gm__ float *)gmSlotBuffer, + (__gm__ half *)gmSlotBuffer, (__gm__ half *)q, (__gm__ half *)k, + (__gm__ half *)v, (__gm__ float *)o, s0, s1); +} diff --git a/test/samples/FlashAttention/compile_and_run/compile_flashattention.sh b/test/samples/FlashAttention/compile_and_run/compile_flashattention.sh new file mode 100755 index 000000000..140d16d73 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/compile_flashattention.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
set -euo pipefail

# Always run from the directory holding this script and the .pto inputs, so
# relative paths work regardless of the caller's cwd.
cd "$(dirname "${BASH_SOURCE[0]}")"

# Lower the patched PTO kernel (level3 pipeline, auto sync insertion) to CCE C++.
ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync \
    fa_patched_s1_256_q3072_s0_8192.pto \
    >/tmp/compiler_team_fa.cpp

# Compile the generated kernel together with its host caller into a shared
# object the benchmark loads via ctypes.
# NOTE(review): both -std=c++17 and -std=gnu++17 are passed; the later
# gnu++17 wins — confirm whether the duplication is intentional.
bisheng \
    -I/sources/pto-isa/include \
    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
    -xcce -Xhost-start -Xhost-end \
    -mllvm -cce-aicore-stack-size=0x8000 \
    -mllvm -cce-aicore-function-stack-size=0x8000 \
    -mllvm -cce-aicore-record-overflow=true \
    -mllvm -cce-aicore-addr-transform \
    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
    -cce-enable-mix \
    --npu-arch=dav-2201 -DMEMORY_BASE \
    -std=gnu++17 \
    -DKERNEL_CPP="\"/tmp/compiler_team_fa.cpp\"" \
    "caller.cpp" \
    -o /tmp/compiler_team_fa.so

# Lower the 140 TFLOPS variant (default pto-level) to CCE C++.
ptoas --pto-arch=a3 --enable-insert-sync \
    fa_140tflops.pto \
    >/tmp/fa_140tflops.cpp

# Compile the 140 TFLOPS kernel with its shape-passing caller.
bisheng \
    -I/sources/pto-isa/include \
    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
    -xcce -Xhost-start -Xhost-end \
    -mllvm -cce-aicore-stack-size=0x8000 \
    -mllvm -cce-aicore-function-stack-size=0x8000 \
    -mllvm -cce-aicore-record-overflow=true \
    -mllvm -cce-aicore-addr-transform \
    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
    -cce-enable-mix \
    --npu-arch=dav-2201 -DMEMORY_BASE \
    -std=gnu++17 \
    -DKERNEL_CPP="\"/tmp/fa_140tflops.cpp\"" \
    "caller_140tflops.cpp" \
    -o /tmp/fa_140tflops.so
!pto.ptr, %arg19: !pto.ptr, %arg20: i64, %arg21: i64): + %200 = "arith.constant"() <{value = 0 : index}> : () -> index + %201 = "arith.constant"() <{value = 1 : index}> : () -> index + %202 = "arith.constant"() <{value = 128 : index}> : () -> index + %203 = "arith.constant"() <{value = 128 : index}> : () -> index + %204 = "arith.constant"() <{value = 256 : index}> : () -> index + %205 = "arith.constant"() <{value = 128 : index}> : () -> index + %206 = "arith.constant"() <{value = 524288 : index}> : () -> index + %207 = "arith.constant"() <{value = 1048576 : index}> : () -> index + %208 = "pto.get_block_idx"() : () -> i64 + %209 = "arith.index_cast"(%208) : (i64) -> index + %210 = "arith.index_cast"(%arg20) : (i64) -> index + %211 = "arith.index_cast"(%arg21) : (i64) -> index + %212 = "arith.divsi"(%211, %204) : (index, index) -> index + %213 = "arith.muli"(%209, %202) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %214 = "arith.muli"(%209, %206) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %215 = "pto.addptr"(%arg15, %214) : (!pto.ptr, index) -> !pto.ptr + %216 = "arith.muli"(%209, %207) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %217 = "pto.addptr"(%arg16, %216) : (!pto.ptr, index) -> !pto.ptr + %218 = "arith.constant"() <{value = 0 : index}> : () -> index + %219 = "pto.addptr"(%215, %218) : (!pto.ptr, index) -> !pto.ptr + %220 = "arith.constant"() <{value = 524288 : index}> : () -> index + %221 = "pto.addptr"(%217, %220) : (!pto.ptr, index) -> !pto.ptr + %222 = "arith.constant"() <{value = 393216 : index}> : () -> index + %223 = "pto.addptr"(%215, %222) : (!pto.ptr, index) -> !pto.ptr + %224 = "pto.make_tensor_view"(%219, %202, %204, %204, %201) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view<128x256xf32> + %225 = "pto.initialize_l2g2l_pipe"(%224) <{dir_mask = 1 : i8, flag_base = 0 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 131072 : i32}> 
: (!pto.tensor_view<128x256xf32>) -> !pto.pipe + %226 = "pto.make_tensor_view"(%221, %202, %204, %204, %201) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view<128x256xf16> + %227 = "pto.initialize_l2g2l_pipe"(%226) <{dir_mask = 2 : i8, flag_base = 2 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 65536 : i32}> : (!pto.tensor_view<128x256xf16>) -> !pto.pipe + %228 = "pto.make_tensor_view"(%223, %202, %203, %203, %201) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view<128x128xf32> + %229 = "pto.initialize_l2g2l_pipe"(%228) <{dir_mask = 1 : i8, flag_base = 4 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 65536 : i32}> : (!pto.tensor_view<128x128xf32>) -> !pto.pipe + %230 = "pto.make_tensor_view"(%arg17, %210, %203, %203, %201) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view + %231 = "pto.make_tensor_view"(%arg18, %203, %211, %201, %203) <{layout = #pto.layout, operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view + %232 = "pto.make_tensor_view"(%arg19, %211, %203, %203, %201) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view + %233 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %234 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %235 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %236 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %237 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %238 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %239 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %240 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %241 = "pto.alloc_tile"() <{operandSegmentSizes = 
array}> : () -> !pto.tile_buf + %242 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %243 = "pto.partition_view"(%230, %213, %200, %202, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%243, %233) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%233, %234) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %244 = "pto.declare_global"() : () -> !pto.tensor_view<128x256xf32> + %245 = "pto.declare_global"() : () -> !pto.tensor_view<128x256xf16> + %246 = "pto.declare_global"() : () -> !pto.tensor_view<128x128xf32> + %247 = "arith.constant"() <{value = 0 : index}> : () -> index + "pto.talloc"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %248 = "arith.muli"(%247, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %249 = "arith.constant"() <{value = 0 : index}> : () -> index + %250 = "arith.addi"(%248, %249) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %251 = "pto.partition_view"(%231, %200, %250, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%251, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %252 = "arith.constant"() <{value = 0 : index}> : () -> index + %253 = "pto.partition_view"(%244, %200, %252, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %253) <{atomicType = #pto, reluPreMode = 
#pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + %254 = "arith.muli"(%247, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %255 = "arith.constant"() <{value = 128 : index}> : () -> index + %256 = "arith.addi"(%254, %255) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %257 = "pto.partition_view"(%231, %200, %256, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%257, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %258 = "arith.constant"() <{value = 128 : index}> : () -> index + %259 = "pto.partition_view"(%244, %200, %258, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %259) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %260 = "arith.constant"() <{value = 1 : index}> : () -> index + "pto.talloc"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %261 = "arith.muli"(%260, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %262 = "arith.constant"() <{value = 0 : index}> : () -> index + %263 = "arith.addi"(%261, %262) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %264 = "pto.partition_view"(%231, %200, %263, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%264, %235) 
<{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %265 = "arith.constant"() <{value = 0 : index}> : () -> index + %266 = "pto.partition_view"(%244, %200, %265, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %266) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + %267 = "arith.muli"(%260, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %268 = "arith.constant"() <{value = 128 : index}> : () -> index + %269 = "arith.addi"(%267, %268) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %270 = "pto.partition_view"(%231, %200, %269, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%270, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %271 = "arith.constant"() <{value = 128 : index}> : () -> index + %272 = "pto.partition_view"(%244, %200, %271, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %272) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %273 = 
"arith.constant"() <{value = 2 : index}> : () -> index + "pto.talloc"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %274 = "arith.muli"(%273, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %275 = "arith.constant"() <{value = 0 : index}> : () -> index + %276 = "arith.addi"(%274, %275) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %277 = "pto.partition_view"(%231, %200, %276, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%277, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %278 = "arith.constant"() <{value = 0 : index}> : () -> index + %279 = "pto.partition_view"(%244, %200, %278, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %279) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + %280 = "arith.muli"(%273, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %281 = "arith.constant"() <{value = 128 : index}> : () -> index + %282 = "arith.addi"(%280, %281) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %283 = "pto.partition_view"(%231, %200, %282, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%283, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, 
!pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %284 = "arith.constant"() <{value = 128 : index}> : () -> index + %285 = "pto.partition_view"(%244, %200, %284, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %285) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %286 = "arith.constant"() <{value = 3 : index}> : () -> index + %287 = "arith.subi"(%212, %286) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "scf.for"(%200, %287, %201) ({ + ^bb0(%arg22: index): + %333 = "arith.addi"(%arg22, %286) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + %334 = "arith.muli"(%arg22, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %335 = "arith.constant"() <{value = 0 : index}> : () -> index + %336 = "arith.addi"(%334, %335) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %337 = "pto.partition_view"(%232, %336, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%337, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %338 = "arith.constant"() <{value = 0 : index}> : () -> index + %339 = "pto.partition_view"(%245, %200, %338, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%339, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, 
%239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.talloc"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + %340 = "arith.muli"(%333, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %341 = "arith.constant"() <{value = 0 : index}> : () -> index + %342 = "arith.addi"(%340, %341) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %343 = "pto.partition_view"(%231, %200, %342, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%343, %235) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %344 = "arith.constant"() <{value = 0 : index}> : () -> index + %345 = "pto.partition_view"(%244, %200, %344, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %345) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + %346 = "arith.muli"(%arg22, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %347 = "arith.constant"() <{value = 128 : index}> : () -> index + %348 = "arith.addi"(%346, %347) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %349 = "pto.partition_view"(%232, %348, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> 
!pto.partition_tensor_view<128x128xf16> + "pto.tload"(%349, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %350 = "arith.constant"() <{value = 128 : index}> : () -> index + %351 = "pto.partition_view"(%245, %200, %350, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%351, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul.acc"(%242, %239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tfree"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + "pto.talloc"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %352 = "pto.partition_view"(%246, %200, %200, %202, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%242, %352) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %353 = "arith.muli"(%333, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %354 = "arith.constant"() <{value = 128 : index}> : () -> index + %355 = "arith.addi"(%353, %354) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %356 = "pto.partition_view"(%231, %200, %355, %203, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%356, %235) 
<{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%235, %236) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%234, %236, %237) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %357 = "arith.constant"() <{value = 128 : index}> : () -> index + %358 = "pto.partition_view"(%244, %200, %357, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%237, %358) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%244, %225) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf32>, !pto.pipe) -> () + "scf.yield"() : () -> () + }) : (index, index, index) -> () + %288 = "arith.constant"() <{value = 0 : index}> : () -> index + %289 = "arith.addi"(%287, %288) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + %290 = "arith.muli"(%289, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %291 = "arith.constant"() <{value = 0 : index}> : () -> index + %292 = "arith.addi"(%290, %291) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %293 = "pto.partition_view"(%232, %292, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%293, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %294 = "arith.constant"() <{value = 0 : index}> : () -> index + %295 = "pto.partition_view"(%245, %200, %294, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%295, %238) <{operandSegmentSizes 
= array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %296 = "arith.muli"(%289, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %297 = "arith.constant"() <{value = 128 : index}> : () -> index + %298 = "arith.addi"(%296, %297) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %299 = "pto.partition_view"(%232, %298, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%299, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %300 = "arith.constant"() <{value = 128 : index}> : () -> index + %301 = "pto.partition_view"(%245, %200, %300, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%301, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul.acc"(%242, %239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tfree"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + "pto.talloc"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %302 = "pto.partition_view"(%246, %200, %200, %202, %203) <{operandSegmentSizes = array}> : 
(!pto.tensor_view<128x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%242, %302) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %303 = "arith.constant"() <{value = 1 : index}> : () -> index + %304 = "arith.addi"(%287, %303) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + %305 = "arith.muli"(%304, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %306 = "arith.constant"() <{value = 0 : index}> : () -> index + %307 = "arith.addi"(%305, %306) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %308 = "pto.partition_view"(%232, %307, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%308, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %309 = "arith.constant"() <{value = 0 : index}> : () -> index + %310 = "pto.partition_view"(%245, %200, %309, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%310, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %311 = "arith.muli"(%304, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %312 = "arith.constant"() 
<{value = 128 : index}> : () -> index + %313 = "arith.addi"(%311, %312) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %314 = "pto.partition_view"(%232, %313, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%314, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %315 = "arith.constant"() <{value = 128 : index}> : () -> index + %316 = "pto.partition_view"(%245, %200, %315, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%316, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul.acc"(%242, %239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tfree"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + "pto.talloc"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %317 = "pto.partition_view"(%246, %200, %200, %202, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%242, %317) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %318 = "arith.constant"() <{value = 2 : index}> : () -> index + %319 = "arith.addi"(%287, %318) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%245, %227) 
<{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + %320 = "arith.muli"(%319, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %321 = "arith.constant"() <{value = 0 : index}> : () -> index + %322 = "arith.addi"(%320, %321) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %323 = "pto.partition_view"(%232, %322, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%323, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %324 = "arith.constant"() <{value = 0 : index}> : () -> index + %325 = "pto.partition_view"(%245, %200, %324, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%325, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul"(%239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %326 = "arith.muli"(%319, %204) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %327 = "arith.constant"() <{value = 128 : index}> : () -> index + %328 = "arith.addi"(%326, %327) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %329 = "pto.partition_view"(%232, %328, %200, %205, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%329, %240) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + %330 = "arith.constant"() <{value = 128 : index}> : () -> index + %331 = 
"pto.partition_view"(%245, %200, %330, %202, %205) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf16> + "pto.tload"(%331, %238) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<128x128xf16>, !pto.tile_buf) -> () + "pto.tmov"(%238, %239) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%240, %241) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmatmul.acc"(%242, %239, %241, %242) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tfree"(%245, %227) <{split = 1 : i8}> : (!pto.tensor_view<128x256xf16>, !pto.pipe) -> () + "pto.talloc"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + %332 = "pto.partition_view"(%246, %200, %200, %202, %203) <{operandSegmentSizes = array}> : (!pto.tensor_view<128x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<128x128xf32> + "pto.tstore"(%242, %332) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<128x128xf32>) -> () + "pto.tpush"(%246, %229) <{split = 1 : i8}> : (!pto.tensor_view<128x128xf32>, !pto.pipe) -> () + "func.return"() : () -> () + }) {pto.kernel_kind = #pto.kernel_kind} : () -> () + "func.func"() <{function_type = (!pto.ptr, !pto.ptr, !pto.ptr, i64, i64) -> (), sym_name = "vector_kernel"}> ({ + ^bb0(%arg9: !pto.ptr, %arg10: !pto.ptr, %arg11: !pto.ptr, %arg12: i64, %arg13: i64): + %0 = "arith.constant"() <{value = 0 : index}> : () -> index + %1 = "arith.constant"() <{value = 1 : index}> : () -> index + %2 = "arith.constant"() <{value = 128 : index}> : () -> index + %3 = "arith.constant"() <{value = 64 : index}> : () -> index + %4 = "arith.constant"() <{value = 32 : index}> : () -> index + %5 = "arith.constant"() <{value = 128 : index}> : () -> index + %6 = "arith.constant"() <{value = 256 : 
index}> : () -> index + %7 = "arith.constant"() <{value = 128 : index}> : () -> index + %8 = "arith.constant"() <{value = 524288 : index}> : () -> index + %9 = "arith.constant"() <{value = 1048576 : index}> : () -> index + %10 = "pto.get_block_idx"() : () -> i64 + %11 = "arith.index_cast"(%10) : (i64) -> index + %12 = "pto.get_subblock_idx"() : () -> i64 + %13 = "arith.index_cast"(%12) : (i64) -> index + %14 = "arith.index_cast"(%arg12) : (i64) -> index + %15 = "arith.index_cast"(%arg13) : (i64) -> index + %16 = "arith.divsi"(%15, %6) : (index, index) -> index + %17 = "arith.muli"(%11, %2) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %18 = "arith.muli"(%13, %3) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %19 = "arith.addi"(%17, %18) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %20 = "arith.muli"(%11, %8) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %21 = "pto.addptr"(%arg9, %20) : (!pto.ptr, index) -> !pto.ptr + %22 = "arith.muli"(%11, %9) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %23 = "pto.addptr"(%arg10, %22) : (!pto.ptr, index) -> !pto.ptr + %24 = "arith.constant"() <{value = 0 : index}> : () -> index + %25 = "pto.addptr"(%21, %24) : (!pto.ptr, index) -> !pto.ptr + %26 = "arith.constant"() <{value = 524288 : index}> : () -> index + %27 = "pto.addptr"(%23, %26) : (!pto.ptr, index) -> !pto.ptr + %28 = "arith.constant"() <{value = 393216 : index}> : () -> index + %29 = "pto.addptr"(%21, %28) : (!pto.ptr, index) -> !pto.ptr + %30 = "pto.make_tensor_view"(%25, %3, %6, %6, %1) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view<64x256xf32> + %31 = "pto.initialize_l2g2l_pipe"(%30) <{dir_mask = 1 : i8, flag_base = 0 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 131072 : i32}> : (!pto.tensor_view<64x256xf32>) -> !pto.pipe + %32 = "pto.make_tensor_view"(%27, %3, %6, %6, %1) <{operandSegmentSizes = array}> : 
(!pto.ptr, index, index, index, index) -> !pto.tensor_view<64x256xf16> + %33 = "pto.initialize_l2g2l_pipe"(%32) <{dir_mask = 2 : i8, flag_base = 2 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 65536 : i32}> : (!pto.tensor_view<64x256xf16>) -> !pto.pipe + %34 = "pto.make_tensor_view"(%29, %3, %5, %5, %1) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view<64x128xf32> + %35 = "pto.initialize_l2g2l_pipe"(%34) <{dir_mask = 1 : i8, flag_base = 4 : i32, operandSegmentSizes = array, slot_num = 8 : i32, slot_size = 65536 : i32}> : (!pto.tensor_view<64x128xf32>) -> !pto.pipe + %36 = "pto.make_tensor_view"(%arg11, %14, %5, %5, %1) <{operandSegmentSizes = array}> : (!pto.ptr, index, index, index, index) -> !pto.tensor_view + %37 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %38 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %39 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %40 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %41 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %42 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %43 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %44 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %45 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %46 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %47 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %48 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %49 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %50 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %51 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf 
+ %52 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %53 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %54 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %55 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %56 = "pto.alloc_tile"() <{operandSegmentSizes = array}> : () -> !pto.tile_buf + %57 = "arith.constant"() <{value = 0.0883883461 : f32}> : () -> f32 + %58 = "arith.constant"() <{value = 3 : index}> : () -> index + %59 = "pto.declare_global"() : () -> !pto.tensor_view<64x256xf32> + %60 = "pto.declare_global"() : () -> !pto.tensor_view<64x256xf16> + %61 = "pto.declare_global"() : () -> !pto.tensor_view<64x128xf32> + "pto.tpop"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %62 = "pto.partition_view"(%59, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + %63 = "pto.partition_view"(%59, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + "pto.tload"(%62, %37) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tload"(%63, %38) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tfree"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %45) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %45, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %48) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) 
-> () + "pto.trowmax"(%38, %39, %46) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %46, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %49) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.talloc"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "pto.tcvt"(%37, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %64 = "pto.partition_view"(%60, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %64) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tcvt"(%38, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %65 = "pto.partition_view"(%60, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %65) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tpush"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + %66 = "arith.constant"() <{value = 1 : index}> : () -> index + "pto.tpop"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %67 = "pto.partition_view"(%59, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + %68 = "pto.partition_view"(%59, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + "pto.tload"(%67, %37) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + 
"pto.tload"(%68, %38) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tfree"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %69 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %70 = "pto.treshape"(%52) : (!pto.tile_buf) -> !pto.tile_buf + %71 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %72 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %73 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%69, %71, %69) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%71, %69, %70) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%70, %70) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%69, %71) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%72, %70, %72) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%72, %73, %72) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %74 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %75 = "pto.treshape"(%55) : (!pto.tile_buf) -> !pto.tile_buf + %76 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %77 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %78 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%74, %76, %74) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%76, 
%74, %75) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%75, %75) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%74, %76) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%77, %75, %77) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%77, %78, %77) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.talloc"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "pto.tcvt"(%37, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %79 = "pto.partition_view"(%60, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %79) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tcvt"(%38, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %80 = "pto.partition_view"(%60, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %80) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tpush"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + %81 = "arith.constant"() <{value = 2 : index}> : () -> index + "pto.tpop"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %82 = "pto.partition_view"(%59, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + %83 = 
"pto.partition_view"(%59, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + "pto.tload"(%82, %37) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tload"(%83, %38) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tfree"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %84 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %85 = "pto.treshape"(%53) : (!pto.tile_buf) -> !pto.tile_buf + %86 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %87 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %88 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%84, %86, %84) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%86, %84, %85) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%85, %85) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%84, %86) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%87, %85, %87) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%87, %88, %87) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %89 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %90 = "pto.treshape"(%56) : (!pto.tile_buf) -> 
!pto.tile_buf + %91 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %92 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %93 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%89, %91, %89) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%91, %89, %90) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%90, %90) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%89, %91) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%92, %90, %92) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%92, %93, %92) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.talloc"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "pto.tcvt"(%37, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %94 = "pto.partition_view"(%60, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %94) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tcvt"(%38, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %95 = "pto.partition_view"(%60, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %95) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tpush"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + %96 = "arith.constant"() <{value = 3 : 
index}> : () -> index + %97 = "arith.subi"(%16, %96) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %98 = "arith.cmpi"(%97, %0) <{predicate = 4 : i64}> : (index, index) -> i1 + "scf.if"(%98) ({ + "pto.tpop"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %184 = "pto.partition_view"(%61, %0, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + %185 = "pto.partition_view"(%61, %4, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tload"(%184, %43) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tload"(%185, %44) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tmov"(%43, %41) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%44, %42) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tfree"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + "pto.tpop"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %186 = "pto.partition_view"(%59, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + %187 = "pto.partition_view"(%59, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + "pto.tload"(%186, %37) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tload"(%187, %38) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tfree"(%59, %31) <{split = 1 : i8}> : 
(!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %188 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %189 = "pto.treshape"(%51) : (!pto.tile_buf) -> !pto.tile_buf + %190 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %191 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %192 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%188, %190, %188) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%190, %188, %189) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%189, %189) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%188, %190) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%191, %189, %191) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%191, %192, %191) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %193 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %194 = "pto.treshape"(%54) : (!pto.tile_buf) -> !pto.tile_buf + %195 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %196 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %197 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%193, %195, %193) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%195, %193, %194) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%194, %194) : (!pto.tile_buf, !pto.tile_buf) -> () + 
"pto.tmov"(%193, %195) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%196, %194, %196) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%196, %197, %196) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.talloc"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "pto.tcvt"(%37, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %198 = "pto.partition_view"(%60, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %198) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tcvt"(%38, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %199 = "pto.partition_view"(%60, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %199) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tpush"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "scf.yield"() : () -> () + }, { + }) : (i1) -> () + "scf.for"(%1, %97, %1) ({ + ^bb0(%arg14: index): + %135 = "arith.addi"(%arg14, %96) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %136 = "pto.partition_view"(%61, %0, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> 
!pto.partition_tensor_view<32x128xf32> + %137 = "pto.partition_view"(%61, %4, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tload"(%136, %43) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tload"(%137, %44) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + %138 = "arith.remsi"(%arg14, %58) : (index, index) -> index + %139 = "arith.constant"() <{value = 0 : index}> : () -> index + %140 = "arith.cmpi"(%138, %139) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%140) ({ + "pto.trowexpandmul"(%41, %51, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %54, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %181 = "arith.remsi"(%arg14, %58) : (index, index) -> index + %182 = "arith.constant"() <{value = 1 : index}> : () -> index + %183 = "arith.cmpi"(%181, %182) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%183) ({ + "pto.trowexpandmul"(%41, %52, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %55, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + "pto.trowexpandmul"(%41, %53, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %56, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, 
!pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "pto.tfree"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + "pto.tpop"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %141 = "pto.partition_view"(%59, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + %142 = "pto.partition_view"(%59, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf32> + "pto.tload"(%141, %37) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tload"(%142, %38) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x256xf32>, !pto.tile_buf) -> () + "pto.tfree"(%59, %31) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf32>, !pto.pipe) -> () + %143 = "arith.remsi"(%135, %58) : (index, index) -> index + %144 = "arith.constant"() <{value = 0 : index}> : () -> index + %145 = "arith.cmpi"(%143, %144) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%145) ({ + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %171 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %172 = "pto.treshape"(%51) : (!pto.tile_buf) -> !pto.tile_buf + %173 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %174 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %175 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%171, %173, %171) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%173, %171, %172) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%172, %172) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%171, %173) 
<{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%174, %172, %174) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%174, %175, %174) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %176 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %177 = "pto.treshape"(%54) : (!pto.tile_buf) -> !pto.tile_buf + %178 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %179 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %180 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%176, %178, %176) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%178, %176, %177) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%177, %177) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%176, %178) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%179, %177, %179) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%179, %180, %179) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %148 = "arith.remsi"(%135, %58) : (index, index) -> index + %149 = "arith.constant"() <{value = 1 : index}> : () -> index + %150 = "arith.cmpi"(%148, %149) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%150) ({ + 
"pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %161 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %162 = "pto.treshape"(%52) : (!pto.tile_buf) -> !pto.tile_buf + %163 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %164 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %165 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%161, %163, %161) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%163, %161, %162) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%162, %162) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%161, %163) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%164, %162, %164) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%164, %165, %164) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %166 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %167 = "pto.treshape"(%55) : (!pto.tile_buf) -> !pto.tile_buf + %168 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %169 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %170 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%166, %168, %166) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%168, %166, %167) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%167, %167) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%166, %168) <{operandSegmentSizes = 
array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%169, %167, %169) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%169, %170, %169) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + "pto.tmuls"(%37, %57, %37) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%37, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %151 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %152 = "pto.treshape"(%53) : (!pto.tile_buf) -> !pto.tile_buf + %153 = "pto.treshape"(%45) : (!pto.tile_buf) -> !pto.tile_buf + %154 = "pto.treshape"(%48) : (!pto.tile_buf) -> !pto.tile_buf + %155 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%151, %153, %151) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%153, %151, %152) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%152, %152) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%151, %153) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%154, %152, %154) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%37, %47, %37) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%37, %37) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%37, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%154, %155, %154) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tmuls"(%38, %57, %38) : (!pto.tile_buf, f32, !pto.tile_buf) -> () + "pto.trowmax"(%38, %39, %47) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %156 = "pto.treshape"(%47) : (!pto.tile_buf) -> !pto.tile_buf + %157 = "pto.treshape"(%56) : 
(!pto.tile_buf) -> !pto.tile_buf + %158 = "pto.treshape"(%46) : (!pto.tile_buf) -> !pto.tile_buf + %159 = "pto.treshape"(%49) : (!pto.tile_buf) -> !pto.tile_buf + %160 = "pto.treshape"(%50) : (!pto.tile_buf) -> !pto.tile_buf + "pto.tmax"(%156, %158, %156) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tsub"(%158, %156, %157) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%157, %157) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%156, %158) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmul"(%159, %157, %159) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandsub"(%38, %47, %38) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.texp"(%38, %38) : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.trowsum"(%38, %39, %50) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%159, %160, %159) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "pto.talloc"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "pto.tcvt"(%37, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %146 = "pto.partition_view"(%60, %0, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %146) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + "pto.tcvt"(%38, %40) <{rmode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + %147 = "pto.partition_view"(%60, %4, %0, %4, %6) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x256xf16>, index, index, index, index) -> !pto.partition_tensor_view<32x256xf16> + "pto.tstore"(%40, %147) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x256xf16>) -> () + 
"pto.tpush"(%60, %33) <{split = 1 : i8}> : (!pto.tensor_view<64x256xf16>, !pto.pipe) -> () + "scf.yield"() : () -> () + }) : (index, index, index) -> () + %99 = "arith.constant"() <{value = 0 : index}> : () -> index + %100 = "arith.addi"(%97, %99) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %101 = "pto.partition_view"(%61, %0, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + %102 = "pto.partition_view"(%61, %4, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tload"(%101, %43) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tload"(%102, %44) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + %103 = "arith.cmpi"(%100, %0) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%103) ({ + "pto.tmov"(%43, %41) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%44, %42) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %129 = "arith.remsi"(%100, %58) : (index, index) -> index + %130 = "arith.constant"() <{value = 0 : index}> : () -> index + %131 = "arith.cmpi"(%129, %130) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%131) ({ + "pto.trowexpandmul"(%41, %51, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %54, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %132 = 
"arith.remsi"(%100, %58) : (index, index) -> index + %133 = "arith.constant"() <{value = 1 : index}> : () -> index + %134 = "arith.cmpi"(%132, %133) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%134) ({ + "pto.trowexpandmul"(%41, %52, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %55, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + "pto.trowexpandmul"(%41, %53, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %56, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "pto.tfree"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %104 = "arith.constant"() <{value = 1 : index}> : () -> index + %105 = "arith.addi"(%97, %104) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %106 = "pto.partition_view"(%61, %0, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + %107 = "pto.partition_view"(%61, %4, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tload"(%106, %43) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tload"(%107, %44) <{operandSegmentSizes = array}> : 
(!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + %108 = "arith.cmpi"(%105, %0) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%108) ({ + "pto.tmov"(%43, %41) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%44, %42) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %123 = "arith.remsi"(%105, %58) : (index, index) -> index + %124 = "arith.constant"() <{value = 0 : index}> : () -> index + %125 = "arith.cmpi"(%123, %124) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%125) ({ + "pto.trowexpandmul"(%41, %51, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %54, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %126 = "arith.remsi"(%105, %58) : (index, index) -> index + %127 = "arith.constant"() <{value = 1 : index}> : () -> index + %128 = "arith.cmpi"(%126, %127) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%128) ({ + "pto.trowexpandmul"(%41, %52, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %55, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + "pto.trowexpandmul"(%41, %53, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %56, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () 
-> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "pto.tfree"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %109 = "arith.constant"() <{value = 2 : index}> : () -> index + %110 = "arith.addi"(%97, %109) <{overflowFlags = #arith.overflow}> : (index, index) -> index + "pto.tpop"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + %111 = "pto.partition_view"(%61, %0, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + %112 = "pto.partition_view"(%61, %4, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view<64x128xf32>, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tload"(%111, %43) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + "pto.tload"(%112, %44) <{operandSegmentSizes = array}> : (!pto.partition_tensor_view<32x128xf32>, !pto.tile_buf) -> () + %113 = "arith.cmpi"(%110, %0) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%113) ({ + "pto.tmov"(%43, %41) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "pto.tmov"(%44, %42) <{operandSegmentSizes = array, reluPreMode = #pto}> : (!pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + %117 = "arith.remsi"(%110, %58) : (index, index) -> index + %118 = "arith.constant"() <{value = 0 : index}> : () -> index + %119 = "arith.cmpi"(%117, %118) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%119) ({ + "pto.trowexpandmul"(%41, %51, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %54, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) 
-> () + "scf.yield"() : () -> () + }, { + %120 = "arith.remsi"(%110, %58) : (index, index) -> index + %121 = "arith.constant"() <{value = 1 : index}> : () -> index + %122 = "arith.cmpi"(%120, %121) <{predicate = 0 : i64}> : (index, index) -> i1 + "scf.if"(%122) ({ + "pto.trowexpandmul"(%41, %52, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %55, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }, { + "pto.trowexpandmul"(%41, %53, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%41, %43, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpandmul"(%42, %56, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.tadd"(%42, %44, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "scf.yield"() : () -> () + }) : (i1) -> () + "pto.tfree"(%61, %35) <{split = 1 : i8}> : (!pto.tensor_view<64x128xf32>, !pto.pipe) -> () + "pto.trowexpanddiv"(%41, %48, %41) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + "pto.trowexpanddiv"(%42, %49, %42) : (!pto.tile_buf, !pto.tile_buf, !pto.tile_buf) -> () + %114 = "pto.partition_view"(%36, %19, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + "pto.tstore"(%41, %114) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x128xf32>) -> () + %115 = "arith.addi"(%19, %4) <{overflowFlags = #arith.overflow}> : (index, index) -> index + %116 = "pto.partition_view"(%36, %115, %0, %4, %5) <{operandSegmentSizes = array}> : (!pto.tensor_view, index, index, index, index) -> !pto.partition_tensor_view<32x128xf32> + 
"pto.tstore"(%42, %116) <{atomicType = #pto, reluPreMode = #pto, stPhase = #pto}> : (!pto.tile_buf, !pto.partition_tensor_view<32x128xf32>) -> () + "func.return"() : () -> () + }) {pto.kernel_kind = #pto.kernel_kind} : () -> () + "func.func"() <{function_type = (memref<256xi64>, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, i64, i64) -> (), sym_name = "call_both"}> ({ + ^bb0(%arg0: memref<256xi64>, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: i64, %arg8: i64): + "pto.set_ffts"(%arg0) : (memref<256xi64>) -> () + "func.call"(%arg1, %arg2, %arg3, %arg4, %arg5, %arg7, %arg8) <{callee = @cube_kernel}> : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, i64, i64) -> () + "func.call"(%arg1, %arg2, %arg6, %arg7, %arg8) <{callee = @vector_kernel}> : (!pto.ptr, !pto.ptr, !pto.ptr, i64, i64) -> () + "func.return"() : () -> () + }) {pto.entry} : () -> () +}) : () -> () + diff --git a/test/samples/FlashAttention/compile_and_run/fa_patched_s1_256_q3072_s0_8192.pto b/test/samples/FlashAttention/compile_and_run/fa_patched_s1_256_q3072_s0_8192.pto new file mode 100644 index 000000000..df4401dd9 --- /dev/null +++ b/test/samples/FlashAttention/compile_and_run/fa_patched_s1_256_q3072_s0_8192.pto @@ -0,0 +1,402 @@ +module { + func.func @cube_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c8192 = arith.constant 8192 : index + %c32_0 = arith.constant 32 : index + %c96 = arith.constant 96 : index + %0 = pto.get_block_num + %1 = arith.index_cast %0 : i64 to index + %2 = pto.get_block_idx + %3 = arith.index_cast %2 : i64 to index + %4 = arith.divsi %c96, %1 : index + %5 = arith.remsi %c96, %1 : index + %6 = arith.addi %4, %c1 : 
index + %7 = arith.muli %3, %6 : index + %8 = arith.addi %4, %c1 : index + %9 = arith.muli %5, %8 : index + %10 = arith.subi %3, %5 : index + %11 = arith.muli %10, %4 : index + %12 = arith.addi %9, %11 : index + %13 = arith.cmpi slt, %3, %5 : index + %14 = arith.select %13, %7, %12 : index + %15 = arith.cmpi slt, %3, %5 : index + %16 = arith.addi %4, %c1 : index + %17 = arith.select %15, %16, %4 : index + %18 = arith.addi %14, %17 : index + %c131072 = arith.constant 131072 : index + %19 = arith.muli %3, %c131072 : index + %20 = pto.addptr %arg0, %19 : -> + %c0_1 = arith.constant 0 : index + %21 = pto.addptr %20, %c0_1 : -> + %c65536 = arith.constant 65536 : index + %22 = pto.addptr %20, %c65536 : -> + %c98304 = arith.constant 98304 : index + %23 = pto.addptr %20, %c98304 : -> + %24 = pto.import_reserved_buffer{name = "fa_qk_c2v_fifo", peer_func = @vector_kernel} -> i32 + %25 = pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 32768, slot_num = 8, local_slot_num = 1} (%21 : !pto.ptr, %24 : i32) -> !pto.pipe + %26 = pto.import_reserved_buffer{name = "fa_pv_c2v_fifo", peer_func = @vector_kernel} -> i32 + %27 = pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 16384, slot_num = 8, local_slot_num = 1} (%22 : !pto.ptr, %26 : i32) -> !pto.pipe + %28 = pto.reserve_buffer{name = "fa_p_v2c_fifo", size = 16384, location = , auto = false, base = 262144} -> i32 + %c0_i32 = arith.constant 0 : i32 + pto.aic_initialize_pipe {id = 30, dir_mask = 2, slot_size = 16384, local_slot_num = 1, nosplit = false}(gm_slot_buffer = %23 : !pto.ptr, c2v_consumer_buf = %c0_i32 : i32, v2c_consumer_buf = %28 : i32) + %c0_i64 = arith.constant 0 : i64 + %c0_i64_2 = arith.constant 0 : i64 + %29 = pto.alloc_tile addr = %c0_i64_2 : !pto.tile_buf + %c0_i64_3 = arith.constant 0 : i64 + %30 = pto.alloc_tile addr = %c0_i64_3 : !pto.tile_buf + %c8192_i64 = arith.constant 8192 : i64 + %31 = pto.alloc_tile addr = %c8192_i64 : !pto.tile_buf + %32 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %c0_i64_4 = 
arith.constant 0 : i64 + %33 = pto.alloc_tile addr = %c0_i64_4 : !pto.tile_buf + %c73728_i64 = arith.constant 73728 : i64 + %34 = pto.alloc_tile addr = %c73728_i64 : !pto.tile_buf + %c8192_i64_5 = arith.constant 8192 : i64 + %35 = pto.alloc_tile addr = %c8192_i64_5 : !pto.tile_buf + %c90112_i64 = arith.constant 90112 : i64 + %36 = pto.alloc_tile addr = %c90112_i64 : !pto.tile_buf + %37 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf + %c32768_i64 = arith.constant 32768 : i64 + %38 = pto.alloc_tile addr = %c32768_i64 : !pto.tile_buf + %c3072 = arith.constant 3072 : index + %39 = pto.make_tensor_view %arg1, shape = [%c3072, %c128], strides = [%c128, %c1] : !pto.tensor_view + %40 = pto.make_tensor_view %arg2, shape = [%c128, %c8192], strides = [%c1, %c128] : !pto.tensor_view + %41 = pto.make_tensor_view %arg3, shape = [%c8192, %c128], strides = [%c128, %c1] : !pto.tensor_view + scf.for %arg4 = %14 to %18 step %c1 { + %42 = arith.muli %arg4, %c32 : index + %43 = pto.partition_view %39, offsets = [%42, %c0], sizes = [%c32, %c128] : !pto.tensor_view + pto.tload ins(%43 : !pto.partition_tensor_view<32x128xf16>) outs(%29 : !pto.tile_buf) + pto.tmov ins(%29 : !pto.tile_buf) outs(%30 : !pto.tile_buf) + %c0_6 = arith.constant 0 : index + %44 = pto.partition_view %40, offsets = [%c0, %c0_6], sizes = [%c128, %c256] : !pto.tensor_view + pto.tload ins(%44 : !pto.partition_tensor_view<128x256xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%32 : !pto.tile_buf) + pto.tmatmul ins(%30, %32 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tpush(%33, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + %c256_7 = arith.constant 256 : index + %45 = pto.partition_view %40, offsets = [%c0, %c256_7], sizes = [%c128, %c256] : !pto.tensor_view + pto.tload ins(%45 : !pto.partition_tensor_view<128x256xf16>) outs(%31 : !pto.tile_buf) + pto.tmov ins(%31 : !pto.tile_buf) outs(%32 : !pto.tile_buf) + pto.tmatmul ins(%30, %32 : !pto.tile_buf, !pto.tile_buf) outs(%33 : 
!pto.tile_buf) + pto.tpush(%33, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + %46 = pto.partition_view %41, offsets = [%c0, %c0], sizes = [%c256, %c128] : !pto.tensor_view + pto.tload ins(%46 : !pto.partition_tensor_view<256x128xf16>) outs(%36 : !pto.tile_buf) + %c15 = arith.constant 15 : index + scf.for %arg5 = %c0 to %c15 step %c1 { + %50 = arith.muli %arg5, %c2 : index + %c2_8 = arith.constant 2 : index + %51 = arith.addi %50, %c2_8 : index + %52 = arith.muli %51, %c256 : index + %53 = pto.partition_view %40, offsets = [%c0, %52], sizes = [%c128, %c256] : !pto.tensor_view + pto.tload ins(%53 : !pto.partition_tensor_view<128x256xf16>) outs(%31 : !pto.tile_buf) + %54 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%54 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aiv {id = 30, split = 1} + pto.tmov ins(%36 : !pto.tile_buf) outs(%37 : !pto.tile_buf) + %55 = arith.addi %50, %c1 : index + %56 = arith.muli %55, %c256 : index + %57 = pto.partition_view %41, offsets = [%56, %c0], sizes = [%c256, %c128] : !pto.tensor_view + pto.tload ins(%57 : !pto.partition_tensor_view<256x128xf16>) outs(%36 : !pto.tile_buf) + pto.tmatmul ins(%35, %37 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tpush(%38, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmov ins(%31 : !pto.tile_buf) outs(%32 : !pto.tile_buf) + pto.tmatmul ins(%30, %32 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tpush(%33, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + %58 = arith.muli %arg5, %c2 : index + %59 = arith.addi %58, %c1 : index + %c2_9 = arith.constant 2 : index + %60 = arith.addi %59, %c2_9 : index + %61 = arith.muli %60, %c256 : index + %62 = pto.partition_view %40, offsets = [%c0, %61], sizes = [%c128, %c256] : !pto.tensor_view + pto.tload ins(%62 : !pto.partition_tensor_view<128x256xf16>) outs(%31 : !pto.tile_buf) + %63 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%63 : !pto.tile_buf) outs(%35 : 
!pto.tile_buf) + pto.tfree_from_aiv {id = 30, split = 1} + pto.tmov ins(%36 : !pto.tile_buf) outs(%37 : !pto.tile_buf) + %64 = arith.addi %59, %c1 : index + %65 = arith.muli %64, %c256 : index + %66 = pto.partition_view %41, offsets = [%65, %c0], sizes = [%c256, %c128] : !pto.tensor_view + pto.tload ins(%66 : !pto.partition_tensor_view<256x128xf16>) outs(%36 : !pto.tile_buf) + pto.tmatmul ins(%35, %37 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tpush(%38, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmov ins(%31 : !pto.tile_buf) outs(%32 : !pto.tile_buf) + pto.tmatmul ins(%30, %32 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tpush(%33, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + } + %47 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%47 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aiv {id = 30, split = 1} + pto.tmov ins(%36 : !pto.tile_buf) outs(%37 : !pto.tile_buf) + %c7936 = arith.constant 7936 : index + %48 = pto.partition_view %41, offsets = [%c7936, %c0], sizes = [%c256, %c128] : !pto.tensor_view + pto.tload ins(%48 : !pto.partition_tensor_view<256x128xf16>) outs(%36 : !pto.tile_buf) + pto.tmatmul ins(%35, %37 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tpush(%38, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + %49 = pto.tpop_from_aiv {id = 30, split = 1} -> !pto.tile_buf + pto.tmov ins(%49 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree_from_aiv {id = 30, split = 1} + pto.tmov ins(%36 : !pto.tile_buf) outs(%37 : !pto.tile_buf) + pto.tmatmul ins(%35, %37 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tpush(%38, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + } + return + } + func.func @vector_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %c16 = 
arith.constant 16 : index + %c128 = arith.constant 128 : index + %c32_0 = arith.constant 32 : index + %c96 = arith.constant 96 : index + %0 = pto.get_block_num + %1 = arith.index_cast %0 : i64 to index + %2 = pto.get_block_idx + %3 = arith.index_cast %2 : i64 to index + %4 = arith.divsi %c96, %1 : index + %5 = arith.remsi %c96, %1 : index + %6 = arith.addi %4, %c1 : index + %7 = arith.muli %3, %6 : index + %8 = arith.addi %4, %c1 : index + %9 = arith.muli %5, %8 : index + %10 = arith.subi %3, %5 : index + %11 = arith.muli %10, %4 : index + %12 = arith.addi %9, %11 : index + %13 = arith.cmpi slt, %3, %5 : index + %14 = arith.select %13, %7, %12 : index + %15 = arith.cmpi slt, %3, %5 : index + %16 = arith.addi %4, %c1 : index + %17 = arith.select %15, %16, %4 : index + %18 = arith.addi %14, %17 : index + %c131072 = arith.constant 131072 : index + %19 = arith.muli %3, %c131072 : index + %20 = pto.addptr %arg0, %19 : -> + %c0_1 = arith.constant 0 : index + %21 = pto.addptr %20, %c0_1 : -> + %c65536 = arith.constant 65536 : index + %22 = pto.addptr %20, %c65536 : -> + %c98304 = arith.constant 98304 : index + %23 = pto.addptr %20, %c98304 : -> + %24 = pto.reserve_buffer{name = "fa_qk_c2v_fifo", size = 32768, location = , auto = false, base = 0} -> i32 + %25 = pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 32768, slot_num = 8, local_slot_num = 1} (%21 : !pto.ptr, %24 : i32) -> !pto.pipe + %26 = pto.reserve_buffer{name = "fa_pv_c2v_fifo", size = 16384, location = , auto = false, base = 32768} -> i32 + %27 = pto.initialize_l2g2l_pipe{dir_mask = 1, slot_size = 16384, slot_num = 8, local_slot_num = 1} (%22 : !pto.ptr, %26 : i32) -> !pto.pipe + %28 = pto.import_reserved_buffer{name = "fa_p_v2c_fifo", peer_func = @cube_kernel} -> i32 + %c0_i32 = arith.constant 0 : i32 + pto.aiv_initialize_pipe {id = 30, dir_mask = 2, slot_size = 16384, local_slot_num = 1, nosplit = false}(gm_slot_buffer = %23 : !pto.ptr, c2v_consumer_buf = %c0_i32 : i32, v2c_consumer_buf = %28 : i32) + %29 
= pto.get_subblock_idx + %30 = arith.index_cast %29 : i64 to index + %31 = arith.muli %30, %c16 : index + %c49152_i64 = arith.constant 49152 : i64 + %32 = pto.alloc_tile addr = %c49152_i64 : !pto.tile_buf + %c65536_i64 = arith.constant 65536 : i64 + %33 = pto.alloc_tile addr = %c65536_i64 : !pto.tile_buf + %c81920_i64 = arith.constant 81920 : i64 + %34 = pto.alloc_tile addr = %c81920_i64 : !pto.tile_buf + %c90112_i64 = arith.constant 90112 : i64 + %35 = pto.alloc_tile addr = %c90112_i64 : !pto.tile_buf + %c98304_i64 = arith.constant 98304 : i64 + %36 = pto.alloc_tile addr = %c98304_i64 : !pto.tile_buf + %c98816_i64 = arith.constant 98816 : i64 + %37 = pto.alloc_tile addr = %c98816_i64 : !pto.tile_buf + %c99328_i64 = arith.constant 99328 : i64 + %38 = pto.alloc_tile addr = %c99328_i64 : !pto.tile_buf + %c99840_i64 = arith.constant 99840 : i64 + %39 = pto.alloc_tile addr = %c99840_i64 : !pto.tile_buf + %c100352_i64 = arith.constant 100352 : i64 + %40 = pto.alloc_tile addr = %c100352_i64 : !pto.tile_buf + %c100864_i64 = arith.constant 100864 : i64 + %41 = pto.alloc_tile addr = %c100864_i64 : !pto.tile_buf + %cst = arith.constant 0.0883883461 : f32 + %cst_2 = arith.constant 1.000000e+00 : f32 + %c3072 = arith.constant 3072 : index + %42 = pto.make_tensor_view %arg1, shape = [%c3072, %c128], strides = [%c128, %c1] : !pto.tensor_view + scf.for %arg2 = %14 to %18 step %c1 { + %43 = arith.muli %arg2, %c32 : index + %c101376_i64 = arith.constant 101376 : i64 + %44 = pto.alloc_tile addr = %c101376_i64 : !pto.tile_buf + pto.tpop(%44, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%44, %cst : !pto.tile_buf, f32) outs(%44 : !pto.tile_buf) + pto.trowmax ins(%44, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %45 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %46 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %47 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf + %48 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %49 = 
pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.trowexpandsub ins(%44, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmuls ins(%45, %cst_2 : !pto.tile_buf, f32) outs(%46 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%38 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%25 : !pto.pipe) {split = 1} + %c101376_i64_3 = arith.constant 101376 : i64 + %50 = pto.alloc_tile addr = %c101376_i64_3 : !pto.tile_buf + pto.tpop(%50, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%50, %cst : !pto.tile_buf, f32) outs(%50 : !pto.tile_buf) + pto.trowmax ins(%50, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %51 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %52 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %53 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf + %54 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %55 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%51, %52 : !pto.tile_buf, !pto.tile_buf) outs(%51 : !pto.tile_buf) + pto.tsub ins(%52, %51 : !pto.tile_buf, !pto.tile_buf) outs(%53 : !pto.tile_buf) + pto.tmuls ins(%51, %cst_2 : !pto.tile_buf, f32) outs(%52 : !pto.tile_buf) + pto.trowexpandsub ins(%50, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%53 : !pto.tile_buf) outs(%53 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%54, %53 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%54, %55 : !pto.tile_buf, !pto.tile_buf) outs(%54 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 
1} + pto.tfree(%25 : !pto.pipe) {split = 1} + %c101376_i64_4 = arith.constant 101376 : i64 + %56 = pto.alloc_tile addr = %c101376_i64_4 : !pto.tile_buf + pto.tpop(%56, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmov ins(%56 : !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + %c101376_i64_5 = arith.constant 101376 : i64 + %57 = pto.alloc_tile addr = %c101376_i64_5 : !pto.tile_buf + pto.tpop(%57, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%57, %cst : !pto.tile_buf, f32) outs(%57 : !pto.tile_buf) + pto.trowmax ins(%57, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %58 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %59 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %60 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf + %61 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %62 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%58, %59 : !pto.tile_buf, !pto.tile_buf) outs(%58 : !pto.tile_buf) + pto.tsub ins(%59, %58 : !pto.tile_buf, !pto.tile_buf) outs(%60 : !pto.tile_buf) + pto.tmuls ins(%58, %cst_2 : !pto.tile_buf, f32) outs(%59 : !pto.tile_buf) + pto.trowexpandsub ins(%57, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%60 : !pto.tile_buf) outs(%60 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%61, %60 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%61, %62 : !pto.tile_buf, !pto.tile_buf) outs(%61 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%25 : !pto.pipe) {split = 1} + %c101376_i64_6 = arith.constant 101376 : i64 + %63 = pto.alloc_tile addr = %c101376_i64_6 : !pto.tile_buf + pto.tpop(%63, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + 
pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %63 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + %c101376_i64_7 = arith.constant 101376 : i64 + %64 = pto.alloc_tile addr = %c101376_i64_7 : !pto.tile_buf + pto.tpop(%64, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%64, %cst : !pto.tile_buf, f32) outs(%64 : !pto.tile_buf) + pto.trowmax ins(%64, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %65 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %66 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %67 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf + %68 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %69 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%65, %66 : !pto.tile_buf, !pto.tile_buf) outs(%65 : !pto.tile_buf) + pto.tsub ins(%66, %65 : !pto.tile_buf, !pto.tile_buf) outs(%67 : !pto.tile_buf) + pto.tmuls ins(%65, %cst_2 : !pto.tile_buf, f32) outs(%66 : !pto.tile_buf) + pto.trowexpandsub ins(%64, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%67 : !pto.tile_buf) outs(%67 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%68, %67 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%68, %69 : !pto.tile_buf, !pto.tile_buf) outs(%68 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%25 : !pto.pipe) {split = 1} + %c15 = arith.constant 15 : index + scf.for %arg3 = %c1 to %c15 step %c1 { + %c101376_i64_10 = arith.constant 101376 : i64 + %74 = pto.alloc_tile addr = %c101376_i64_10 : !pto.tile_buf + pto.tpop(%74, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.trowexpandmul ins(%35, %40 : 
!pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %74 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + %c101376_i64_11 = arith.constant 101376 : i64 + %75 = pto.alloc_tile addr = %c101376_i64_11 : !pto.tile_buf + pto.tpop(%75, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%75, %cst : !pto.tile_buf, f32) outs(%75 : !pto.tile_buf) + pto.trowmax ins(%75, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %76 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %77 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %78 = pto.treshape %40 : !pto.tile_buf -> !pto.tile_buf + %79 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %80 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%76, %77 : !pto.tile_buf, !pto.tile_buf) outs(%76 : !pto.tile_buf) + pto.tsub ins(%77, %76 : !pto.tile_buf, !pto.tile_buf) outs(%78 : !pto.tile_buf) + pto.tmuls ins(%76, %cst_2 : !pto.tile_buf, f32) outs(%77 : !pto.tile_buf) + pto.trowexpandsub ins(%75, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%78 : !pto.tile_buf) outs(%78 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%79, %78 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%79, %80 : !pto.tile_buf, !pto.tile_buf) outs(%79 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%25 : !pto.pipe) {split = 1} + %c101376_i64_12 = arith.constant 101376 : i64 + %81 = pto.alloc_tile addr = %c101376_i64_12 : !pto.tile_buf + pto.tpop(%81, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %81 : !pto.tile_buf, !pto.tile_buf) 
outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + %c101376_i64_13 = arith.constant 101376 : i64 + %82 = pto.alloc_tile addr = %c101376_i64_13 : !pto.tile_buf + pto.tpop(%82, %25 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.tmuls ins(%82, %cst : !pto.tile_buf, f32) outs(%82 : !pto.tile_buf) + pto.trowmax ins(%82, %32 : !pto.tile_buf, !pto.tile_buf) outs(%37 : !pto.tile_buf) + %83 = pto.treshape %37 : !pto.tile_buf -> !pto.tile_buf + %84 = pto.treshape %36 : !pto.tile_buf -> !pto.tile_buf + %85 = pto.treshape %41 : !pto.tile_buf -> !pto.tile_buf + %86 = pto.treshape %38 : !pto.tile_buf -> !pto.tile_buf + %87 = pto.treshape %39 : !pto.tile_buf -> !pto.tile_buf + pto.tmax ins(%83, %84 : !pto.tile_buf, !pto.tile_buf) outs(%83 : !pto.tile_buf) + pto.tsub ins(%84, %83 : !pto.tile_buf, !pto.tile_buf) outs(%85 : !pto.tile_buf) + pto.tmuls ins(%83, %cst_2 : !pto.tile_buf, f32) outs(%84 : !pto.tile_buf) + pto.trowexpandsub ins(%82, %37 : !pto.tile_buf, !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.texp ins(%85 : !pto.tile_buf) outs(%85 : !pto.tile_buf) + pto.texp ins(%33 : !pto.tile_buf) outs(%33 : !pto.tile_buf) + pto.tmul ins(%86, %85 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) + pto.trowsum ins(%33, %32 : !pto.tile_buf, !pto.tile_buf) outs(%39 : !pto.tile_buf) + pto.tadd ins(%86, %87 : !pto.tile_buf, !pto.tile_buf) outs(%86 : !pto.tile_buf) + pto.tcvt ins(%33 {rmode = #pto} : !pto.tile_buf) outs(%34 : !pto.tile_buf) + pto.tpush_to_aic(%34 : !pto.tile_buf) {id = 30, split = 1} + pto.tfree(%25 : !pto.pipe) {split = 1} + } + %c101376_i64_8 = arith.constant 101376 : i64 + %70 = pto.alloc_tile addr = %c101376_i64_8 : !pto.tile_buf + pto.tpop(%70, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.trowexpandmul ins(%35, %40 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %70 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + %c101376_i64_9 = arith.constant 101376 : 
i64 + %71 = pto.alloc_tile addr = %c101376_i64_9 : !pto.tile_buf + pto.tpop(%71, %27 : !pto.tile_buf, !pto.pipe) {split = 1} + pto.trowexpandmul ins(%35, %41 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tadd ins(%35, %71 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + pto.tfree(%27 : !pto.pipe) {split = 1} + pto.trowexpanddiv ins(%35, %38 : !pto.tile_buf, !pto.tile_buf) outs(%35 : !pto.tile_buf) + %72 = arith.addi %43, %31 : index + %73 = pto.partition_view %42, offsets = [%72, %c0], sizes = [%c16, %c128] : !pto.tensor_view + pto.tstore ins(%35 : !pto.tile_buf) outs(%73 : !pto.partition_tensor_view<16x128xf32>) + } + return + } + func.func @call_both(%arg0: memref<256xi64>, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.entry} { + pto.set_ffts %arg0 : memref<256xi64> + call @cube_kernel(%arg1, %arg2, %arg3, %arg4) : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr) -> () + call @vector_kernel(%arg1, %arg5) : (!pto.ptr, !pto.ptr) -> () + return + } +} + +