From 4b39dab3651906328dc7d28dbf440647881405eb Mon Sep 17 00:00:00 2001 From: Ilya Panfilov Date: Mon, 11 May 2026 11:00:43 -0400 Subject: [PATCH 1/3] CK JIT integration --- .github/workflows/rocm-wheels-build.yml | 1 + .gitmodules | 3 + 3rdparty/ck_jit | 1 + ci/_utils.sh | 43 +- ci/ck_jit_prebuild.txt | 531 ++++++++++++++++++ ci/jax.sh | 3 + ci/pytorch.sh | 16 + setup.py | 12 +- .../common/ck_fused_attn/CMakeLists.txt | 85 ++- .../common/ck_fused_attn/aiter_prebuilt.cmake | 4 +- 10 files changed, 662 insertions(+), 37 deletions(-) create mode 160000 3rdparty/ck_jit create mode 100644 ci/ck_jit_prebuild.txt diff --git a/.github/workflows/rocm-wheels-build.yml b/.github/workflows/rocm-wheels-build.yml index c1a8ea087..63bd96578 100644 --- a/.github/workflows/rocm-wheels-build.yml +++ b/.github/workflows/rocm-wheels-build.yml @@ -87,6 +87,7 @@ jobs: 3rdparty/aotriton \ 3rdparty/aiter \ 3rdparty/QoLA \ + 3rdparty/ck_jit \ 3rdparty/hipify_torch - name: Derive Docker image tag diff --git a/.gitmodules b/.gitmodules index c81bdb590..d4b96f58f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -26,3 +26,6 @@ [submodule "3rdparty/QoLA"] path = 3rdparty/QoLA url = https://github.com/Micky774/QoLA.git +[submodule "3rdparty/ck_jit"] + path = 3rdparty/ck_jit + url = https://github.com/ipanfilo/ck_jit.git diff --git a/3rdparty/ck_jit b/3rdparty/ck_jit new file mode 160000 index 000000000..83f602449 --- /dev/null +++ b/3rdparty/ck_jit @@ -0,0 +1 @@ +Subproject commit 83f602449c525910fb98a0329c8b3a862711833d diff --git a/ci/_utils.sh b/ci/_utils.sh index b4aae9cc7..b9d453ced 100644 --- a/ci/_utils.sh +++ b/ci/_utils.sh @@ -237,11 +237,15 @@ start_message() { python --version } -configure_omp_threads() { +get_cpu_count() { n_vcpus=$(lscpu | grep "^CPU(s):" | awk '{print $2}') cpus_per_core=$(lscpu | grep "Thread(s) per core:" | awk '{print $NF}') - n_physical_cores=$((n_vcpus / cpus_per_core)) + echo $((n_vcpus / cpus_per_core)) +} + +configure_omp_threads() { + n_physical_cores=`get_cpu_count` n_parallel_jobs=$1 if [ -z ${OMP_NUM_THREADS} ]; then @@ -269,3 +273,38 @@ pytest_run() { pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@" || test_run_error "[$_test_variant_tag] $1" echo "Done [$_test_variant_tag] $1 in `time_elapsed $_start_ts`" } + +PYTHON_TE_IMPORT="import sys; sys.path[:] = [p for p in sys.path if p not in ['', '.']]; import transformer_engine" +ck_jit_prebuild() { + _prebuild_list="${TE_PATH}ci/ck_jit_prebuild.txt" + if [ ! -f "$_prebuild_list" ]; then + echo "ck_jit_prebuild: blob list not found: $_prebuild_list" >&2 + return 1 + fi + _gpu_arch=$(rocminfo | grep -E "^ *Name: *gfx" | head -1 | sed "s/.*gfx/gfx/;s/ .*//" 2>/dev/null) + if [ -n "$_gpu_arch" ]; then + _arch_arg="--arch $_gpu_arch" + else + echo "ck_jit_prebuild: GPU architecture not detected, omitting --arch" >&2 + _arch_arg="" + fi + _te_install_dir=$(python -c "${PYTHON_TE_IMPORT}; import os; print(os.path.dirname(transformer_engine.__file__))" 2>/dev/null) + if [ -z "$_te_install_dir" ]; then + echo "ck_jit_prebuild: failed to determine transformer_engine installation directory" >&2 + return 1 + fi + _prebuild_py="$_te_install_dir/lib/ck_jit/ck_jit_prebuild.py" + if [ ! -f "$_prebuild_py" ]; then + echo "ck_jit_prebuild: prebuild script not found: $_prebuild_py" >&2 + return 1 + fi + _cpu_count=$(get_cpu_count) + if [ -n "$_cpu_count" -a "$_cpu_count" != "0" ]; then + _jobs_arg="--jobs $((_cpu_count/2))" + fi + if [ "$1" = "build" ]; then + echo "Building CK JIT cache for arch=${_gpu_arch:-}..." + python "$_prebuild_py" build --blob-list "$_prebuild_list" $_arch_arg $_jobs_arg > /dev/null + fi + python "$_prebuild_py" cache | grep Cache +} diff --git a/ci/ck_jit_prebuild.txt b/ci/ck_jit_prebuild.txt new file mode 100644 index 000000000..f7becce10 --- /dev/null +++ b/ci/ck_jit_prebuild.txt @@ -0,0 +1,531 @@ +fmha_bwd_convert_dq_d128_bf16_b64x0_batch_o2_npad_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d128_bf16_b64x0_batch_o2_npad_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d128_bf16_b64x0_batch_o2_pd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d128_bf16_b64x0_batch_o2_pd_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d128_bf16_b64x0_batch_o2_ps_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d128_bf16_b64x0_batch_o2_ps_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d128_bf16_b64x0_group_o2_ps_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d128_bf16_b64x0_group_o2_ps_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d128_bf16_b64x128_batch_o2_npad_deterministic_gfx950.so +fmha_bwd_convert_dq_d128_bf16_b64x128_batch_o2_npad_deterministic_gfx9.so +fmha_bwd_convert_dq_d128_bf16_b64x128_group_o2_ps_deterministic_gfx950.so +fmha_bwd_convert_dq_d128_bf16_b64x128_group_o2_ps_deterministic_gfx9.so +fmha_bwd_convert_dq_d128_bf16_b64x192_batch_o2_npad_deterministic_gfx950.so +fmha_bwd_convert_dq_d128_bf16_b64x192_group_o2_ps_deterministic_gfx950.so +fmha_bwd_convert_dq_d128_fp16_b64x0_batch_o2_npad_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d128_fp16_b64x0_batch_o2_npad_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d128_fp16_b64x0_batch_o2_pd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d128_fp16_b64x0_batch_o2_pd_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d128_fp16_b64x0_batch_o2_ps_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d128_fp16_b64x0_group_o2_psd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d128_fp16_b64x0_group_o2_ps_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d128_fp16_b64x128_batch_o2_npad_deterministic_gfx950.so +fmha_bwd_convert_dq_d128_fp16_b64x128_batch_o2_npad_deterministic_gfx9.so +fmha_bwd_convert_dq_d128_fp16_b64x128_group_o2_ps_deterministic_gfx950.so +fmha_bwd_convert_dq_d128_fp16_b64x128_group_o2_ps_deterministic_gfx9.so +fmha_bwd_convert_dq_d128_fp16_b64x192_batch_o2_npad_deterministic_gfx950.so +fmha_bwd_convert_dq_d128_fp16_b64x192_group_o2_ps_deterministic_gfx950.so +fmha_bwd_convert_dq_d256_bf16_b64x0_batch_o2_npad_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d256_bf16_b64x0_batch_o2_npad_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d256_bf16_b64x0_batch_o2_pd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d256_bf16_b64x0_batch_o2_pd_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d256_bf16_b64x0_batch_o2_psd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d256_bf16_b64x0_batch_o2_psd_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d256_bf16_b64x0_batch_o2_ps_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d256_bf16_b64x0_batch_o2_ps_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d256_bf16_b64x0_group_o2_ps_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d256_bf16_b64x0_group_o2_ps_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d256_fp16_b64x0_batch_o2_pd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d256_fp16_b64x0_batch_o2_pd_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d256_fp16_b64x0_batch_o2_psd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d256_fp16_b64x0_batch_o2_psd_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d256_fp16_b64x0_batch_o2_ps_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d256_fp16_b64x0_batch_o2_ps_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d32_bf16_b64x0_batch_o2_pd_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d32_bf16_b64x0_batch_o2_ps_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d32_fp16_b64x0_batch_o2_npad_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d32_fp16_b64x0_batch_o2_npad_ndeterministic_gfx9.so.0PNcSK +fmha_bwd_convert_dq_d32_fp16_b64x0_batch_o2_npad_ndeterministic_gfx9.so.HqtomN +fmha_bwd_convert_dq_d32_fp16_b64x0_batch_o2_npad_ndeterministic_gfx9.so.PHbI9W +fmha_bwd_convert_dq_d32_fp16_b64x0_batch_o2_pd_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d32_fp16_b64x0_batch_o2_ps_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d64_bf16_b64x0_batch_o2_npad_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d64_bf16_b64x0_batch_o2_npad_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d64_bf16_b64x0_batch_o2_pd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d64_bf16_b64x0_batch_o2_psd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d64_bf16_b64x0_group_o2_ps_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d64_bf16_b64x0_group_o2_ps_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d64_bf16_b64x128_batch_o2_pd_deterministic_gfx950.so +fmha_bwd_convert_dq_d64_fp16_b64x0_batch_o2_npad_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d64_fp16_b64x0_batch_o2_npad_ndeterministic_gfx9.so +fmha_bwd_convert_dq_d64_fp16_b64x0_batch_o2_pd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d64_fp16_b64x0_batch_o2_psd_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d64_fp16_b64x0_group_o2_ps_ndeterministic_gfx950.so +fmha_bwd_convert_dq_d64_fp16_b64x0_group_o2_ps_ndeterministic_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_alibi_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_alibi_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx9.so.lzVzb4 +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so.vW7AN6 +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_batch_b16x16x128x16x128x16x16x128x128_r1x1x1_r1x1x1_r1x1x1_w16x16x32_w16x16x16_o2_maxq16_npad_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x16x128x16x128x16x16x128x128_r1x1x1_r1x1x1_r1x1x1_w16x16x32_w16x16x16_o2_maxq16_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_alibi_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_alibi_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_bias_dbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_bias_dbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_bf16_batch_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_bf16_group_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_bf16_group_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_group_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_bf16_group_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_group_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_bf16_group_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_bf16_group_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_batch_b16x16x128x16x128x16x16x128x128_r1x1x1_r1x1x1_r1x1x1_w16x16x32_w16x16x16_o2_maxq16_npad_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x16x128x16x128x16x16x128x128_r1x1x1_r1x1x1_r1x1x1_w16x16x32_w16x16x16_o2_maxq16_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_bias_dbias_mask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_bias_dbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_bias_dbias_nmask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_bias_dbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_fp16_batch_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d128_fp16_group_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_fp16_group_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_fp16_group_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_group_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_group_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_group_b16x192x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d128_fp16_group_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_fp16_group_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_deterministic_trload_gfx950.so +fmha_bwd_d128_fp16_group_b32x128x128x32x128x32x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_alibi_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_alibi_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_mask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_deterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_mask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_bias_dbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_alibi_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_bias_dbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_bias_dbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_pd8dv8_nbias_ndbias_mask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_batch_b32x16x64x32x64x32x16x64x64_r1x1x1_r1x1x1_r1x1x1_w16x16x32_w16x16x16_o2_maxq32_pd8dv8_nbias_ndbias_nmask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_pdv8_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_alibi_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_alibi_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_bias_dbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_alibi_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_bias_dbias_mask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_bias_dbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_bias_dbias_nmask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_bias_dbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_nmask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_pd8dv8_nbias_ndbias_mask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_pd8dv8_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_pd8dv8_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_batch_b32x16x64x32x64x32x16x64x64_r1x1x1_r1x1x1_r1x1x1_w16x16x32_w16x16x16_o2_maxq32_pd8dv8_nbias_ndbias_nmask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_dropout_wg16_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx950.so +fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_ntrload_gfx9.so +fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_mask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_nmask_dropout_wg16_ndeterministic_trload_gfx950.so +fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x32_o1_maxq0_npad_nbias_ndbias_nmask_ndropout_ndeterministic_trload_gfx950.so +fmha_bwd_dot_do_o_d128_bf16_b64_batch_o2_npad_gfx950.so +fmha_bwd_dot_do_o_d128_bf16_b64_batch_o2_npad_gfx9.so +fmha_bwd_dot_do_o_d128_bf16_b64_batch_o2_pdv_gfx950.so +fmha_bwd_dot_do_o_d128_bf16_b64_batch_o2_pdv_gfx9.so +fmha_bwd_dot_do_o_d128_bf16_b64_batch_o2_psdv_gfx950.so +fmha_bwd_dot_do_o_d128_bf16_b64_batch_o2_psdv_gfx9.so +fmha_bwd_dot_do_o_d128_bf16_b64_batch_o2_ps_gfx950.so +fmha_bwd_dot_do_o_d128_bf16_b64_batch_o2_ps_gfx9.so +fmha_bwd_dot_do_o_d128_bf16_b64_group_o2_ps_gfx950.so +fmha_bwd_dot_do_o_d128_bf16_b64_group_o2_ps_gfx9.so +fmha_bwd_dot_do_o_d128_fp16_b64_batch_o2_npad_gfx950.so +fmha_bwd_dot_do_o_d128_fp16_b64_batch_o2_npad_gfx9.so +fmha_bwd_dot_do_o_d128_fp16_b64_batch_o2_pdv_gfx950.so +fmha_bwd_dot_do_o_d128_fp16_b64_batch_o2_pdv_gfx9.so +fmha_bwd_dot_do_o_d128_fp16_b64_batch_o2_psdv_gfx950.so +fmha_bwd_dot_do_o_d128_fp16_b64_batch_o2_psdv_gfx9.so +fmha_bwd_dot_do_o_d128_fp16_b64_batch_o2_ps_gfx950.so +fmha_bwd_dot_do_o_d128_fp16_b64_batch_o2_ps_gfx9.so +fmha_bwd_dot_do_o_d128_fp16_b64_group_o2_psdv_gfx950.so +fmha_bwd_dot_do_o_d128_fp16_b64_group_o2_psdv_gfx9.so +fmha_bwd_dot_do_o_d128_fp16_b64_group_o2_ps_gfx950.so +fmha_bwd_dot_do_o_d128_fp16_b64_group_o2_ps_gfx9.so +fmha_bwd_dot_do_o_d256_bf16_b64_batch_o2_npad_gfx950.so +fmha_bwd_dot_do_o_d256_bf16_b64_batch_o2_npad_gfx9.so +fmha_bwd_dot_do_o_d256_bf16_b64_batch_o2_pdv_gfx950.so +fmha_bwd_dot_do_o_d256_bf16_b64_batch_o2_pdv_gfx9.so +fmha_bwd_dot_do_o_d256_bf16_b64_batch_o2_psdv_gfx950.so +fmha_bwd_dot_do_o_d256_bf16_b64_batch_o2_psdv_gfx9.so +fmha_bwd_dot_do_o_d256_bf16_b64_batch_o2_ps_gfx950.so +fmha_bwd_dot_do_o_d256_bf16_b64_batch_o2_ps_gfx9.so +fmha_bwd_dot_do_o_d256_bf16_b64_group_o2_ps_gfx950.so +fmha_bwd_dot_do_o_d256_bf16_b64_group_o2_ps_gfx9.so +fmha_bwd_dot_do_o_d256_fp16_b64_batch_o2_pdv_gfx950.so +fmha_bwd_dot_do_o_d256_fp16_b64_batch_o2_pdv_gfx9.so +fmha_bwd_dot_do_o_d256_fp16_b64_batch_o2_psdv_gfx950.so +fmha_bwd_dot_do_o_d256_fp16_b64_batch_o2_psdv_gfx9.so +fmha_bwd_dot_do_o_d256_fp16_b64_batch_o2_ps_gfx950.so +fmha_bwd_dot_do_o_d256_fp16_b64_batch_o2_ps_gfx9.so +fmha_bwd_dot_do_o_d32_bf16_b64_batch_o2_pdv_gfx9.so +fmha_bwd_dot_do_o_d32_bf16_b64_batch_o2_ps_gfx9.so +fmha_bwd_dot_do_o_d32_fp16_b64_batch_o2_npad_gfx9.so +fmha_bwd_dot_do_o_d32_fp16_b64_batch_o2_npad_gfx9.so.AaNC0l +fmha_bwd_dot_do_o_d32_fp16_b64_batch_o2_npad_gfx9.so.LxYqV2 +fmha_bwd_dot_do_o_d32_fp16_b64_batch_o2_npad_gfx9.so.ub6U1V +fmha_bwd_dot_do_o_d32_fp16_b64_batch_o2_pdv_gfx9.so +fmha_bwd_dot_do_o_d32_fp16_b64_batch_o2_ps_gfx9.so +fmha_bwd_dot_do_o_d64_bf16_b64_batch_o2_npad_gfx950.so +fmha_bwd_dot_do_o_d64_bf16_b64_batch_o2_npad_gfx9.so +fmha_bwd_dot_do_o_d64_bf16_b64_batch_o2_pdv_gfx950.so +fmha_bwd_dot_do_o_d64_bf16_b64_batch_o2_pdv_gfx9.so +fmha_bwd_dot_do_o_d64_bf16_b64_batch_o2_psdv_gfx950.so +fmha_bwd_dot_do_o_d64_bf16_b64_group_o2_psdv_gfx950.so +fmha_bwd_dot_do_o_d64_bf16_b64_group_o2_psdv_gfx9.so +fmha_bwd_dot_do_o_d64_bf16_b64_group_o2_ps_gfx950.so +fmha_bwd_dot_do_o_d64_bf16_b64_group_o2_ps_gfx9.so +fmha_bwd_dot_do_o_d64_fp16_b64_batch_o2_npad_gfx950.so +fmha_bwd_dot_do_o_d64_fp16_b64_batch_o2_npad_gfx9.so +fmha_bwd_dot_do_o_d64_fp16_b64_batch_o2_pdv_gfx950.so +fmha_bwd_dot_do_o_d64_fp16_b64_batch_o2_psdv_gfx950.so +fmha_bwd_dot_do_o_d64_fp16_b64_group_o2_ps_gfx950.so +fmha_bwd_dot_do_o_d64_fp16_b64_group_o2_ps_gfx9.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_alibi_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_alibi_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_alibi_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_alibi_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so.uJYGn1 +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so.rIT0RY +fmha_fwd_d128_bf16_batch_b128x64x32x128x16x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_trload_vr_npad_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b128x64x32x128x16x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_trload_vr_npad_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b128x64x32x128x16x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_trload_vr_pddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b16x32x64x128x32x128_r1x1x1_r1x1x1_w16x16x32_w16x16x32_qr_async_trload_vr_npad_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b16x32x64x128x32x128_r1x1x1_r1x1x1_w16x16x32_w16x16x32_qr_async_trload_vr_npad_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b16x32x64x128x32x128_r1x1x1_r1x1x1_w16x16x32_w16x16x32_qr_async_trload_vr_pddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_bf16_batch_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_batch_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_bf16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_bf16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_group_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_bf16_group_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_bf16_group_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_bf16_group_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b128x64x32x128x16x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_trload_vr_npad_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b128x64x32x128x16x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_trload_vr_npad_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b128x64x32x128x16x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_trload_vr_pddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b128x64x32x128x16x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_trload_vr_pddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b16x32x64x128x32x128_r1x1x1_r1x1x1_w16x16x32_w16x16x32_qr_async_trload_vr_npad_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b16x32x64x128x32x128_r1x1x1_r1x1x1_w16x16x32_w16x16x32_qr_async_trload_vr_pddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_trload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_batch_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_batch_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_group_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_group_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_group_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d128_fp16_group_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d128_fp16_group_b64x128x32x128x32x128_r4x1x1_r4x1x1_w16x16x32_w16x16x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d192_bf16_batch_b128x128x32x128x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d192_bf16_batch_b128x128x32x128x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d192_bf16_batch_b128x128x32x128x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d192_bf16_batch_b128x128x32x128x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d192_bf16_batch_b128x128x32x192x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_o1_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d192_bf16_batch_b128x128x32x192x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_o1_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d192_fp16_batch_b128x128x32x128x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d192_fp16_batch_b128x128x32x128x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d192_fp16_batch_b128x128x32x128x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d192_fp16_batch_b128x128x32x128x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d192_fp16_batch_b128x128x32x192x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_o1_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d192_fp16_batch_b128x128x32x192x32x192_r4x1x1_r4x1x1_w32x32x16_w32x32x16_o1_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_bf16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_bf16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_bf16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_bf16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_bf16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_bf16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_bf16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_bf16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_bf16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_bf16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_bf16_group_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_bf16_group_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_bf16_group_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_bf16_group_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_fp16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_fp16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_fp16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_fp16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_fp16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_fp16_batch_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d256_fp16_group_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d256_fp16_group_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_pssk_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d32_bf16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d32_bf16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d32_bf16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d32_bf16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d32_bf16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d32_bf16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d32_bf16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d32_bf16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d32_bf16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d32_bf16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d32_fp16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d32_fp16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d32_fp16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d32_fp16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d32_fp16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d32_fp16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d32_fp16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d32_fp16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d32_fp16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d32_fp16_batch_b128x64x16x32x32x32_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_alibi_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_alibi_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_psskddv_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_bf16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_bf16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_alibi_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_alibi_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so.5n3I1a +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_batch_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_vr_npad_nlogits_bias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_mask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_dropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d64_fp16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d64_fp16_group_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psskddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d96_bf16_batch_b128x128x32x128x32x96_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d96_bf16_batch_b128x128x32x128x32x96_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so +fmha_fwd_d96_fp16_batch_b128x128x32x128x32x96_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx950.so +fmha_fwd_d96_fp16_batch_b128x128x32x128x32x96_r4x1x1_r4x1x1_w32x32x16_w32x32x16_qr_async_vr_psddv_nlogits_nbias_nmask_lse_ndropout_nskip_nqscale_ntrload_nsink_gfx9.so diff --git a/ci/jax.sh b/ci/jax.sh index 0e1e356f6..0b3b8156f 100755 --- a/ci/jax.sh +++ b/ci/jax.sh @@ -106,6 +106,7 @@ install_prerequisites pip list | egrep "flax|fidle|jax|ml_dtypes|numpy|transformer_e|typing_ext" #check_test_jobs_requested #test $? -eq 0 && init_test_jobs `python -c "import jax; print(len([d for d in jax.devices() if 'rocm' in d.client.platform_version]))"` +ck_jit_prebuild build for _fus_attn in auto ck aotriton; do configure_fused_attn_env $_fus_attn || continue @@ -139,4 +140,6 @@ if [ -n "$TEST_JOBS_MODE" -a -n "$TEST_MGPU" ]; then configure_fused_attn_env $_fus_attn && run_test_config_mgpu done fi + +ck_jit_prebuild list return_run_results diff --git a/ci/pytorch.sh b/ci/pytorch.sh index 20d1b4e57..1bdf72ba9 100755 --- a/ci/pytorch.sh +++ b/ci/pytorch.sh @@ -133,11 +133,22 @@ if [ -n "$SINGLE_CONFIG" ]; then exit $? fi +check_flash_attn_installed() { + _result=$(python -c "${PYTHON_TE_IMPORT}; from transformer_engine.pytorch.attention.dot_product_attention.utils import FlashAttentionUtils; print(FlashAttentionUtils.is_installed)" 2>/dev/null) + if [ "$_result" = "True" ]; then + return 0 + else + echo "Flash attention is not installed" >&2 + return 1 + fi +} + #Master script mode: prepare testing prerequisites first start_message install_prerequisites pip list | egrep "flash|ml_dtypes|numpy|torch|transformer_e|typing_ext" #check_test_jobs_requested && init_test_jobs `python -c "import torch; print(torch.cuda.device_count())"` +ck_jit_prebuild build for _fus_attn in auto flash ck aotriton unfused; do configure_fused_attn_env $_fus_attn || continue @@ -160,6 +171,10 @@ for _fus_attn in auto flash ck aotriton unfused; do _DEFAULT_FUSED_ATTN="auto" fi + if [ $_fus_attn = flash ]; then + check_flash_attn_installed || continue + fi + if [ -n "$TEST_JOBS_MODE" ]; then test -n "$TEST_SGPU" && run_test_job "$_fus_attn" else @@ -182,4 +197,5 @@ if [ $TEST_LEVEL -ge 3 ]; then fi fi +ck_jit_prebuild list return_run_results diff --git a/setup.py b/setup.py index c66af39df..e3b9ed01a 100644 --- a/setup.py +++ b/setup.py @@ -76,16 +76,12 @@ def setup_common_extension() -> CMakeExtension: os.getenv("MPI_HOME") is not None ), "MPI_HOME must be set when compiling with NVTE_UB_WITH_MPI=1" cmake_flags.append("-DNVTE_UB_WITH_MPI=ON") - + if rocm_build(): cmake_flags.append("-DUSE_ROCM=ON") - if os.getenv("NVTE_AOTRITON_PATH"): - aotriton_path = Path(os.getenv("NVTE_AOTRITON_PATH")) - cmake_flags.append(f"-DAOTRITON_PATH={aotriton_path}") - cmake_flags.append(f"-DCK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT={os.getenv('NVTE_CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT', 3)}") - if os.getenv("NVTE_CK_FUSED_ATTN_PATH"): - ck_path = Path(os.getenv("NVTE_CK_FUSED_ATTN_PATH")) - cmake_flags.append(f"-DAITER_MHA_PATH={ck_path}") + cmake_flags.append( + f"-DCK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT={os.getenv('NVTE_CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT', '3')}" + ) if int(os.getenv("NVTE_FUSED_ATTN_AOTRITON", "1"))==0 or int(os.getenv("NVTE_FUSED_ATTN", "1"))==0: cmake_flags.append("-DUSE_FUSED_ATTN_AOTRITON=OFF") diff --git a/transformer_engine/common/ck_fused_attn/CMakeLists.txt b/transformer_engine/common/ck_fused_attn/CMakeLists.txt index 4640374aa..2c0d3b92b 100644 --- a/transformer_engine/common/ck_fused_attn/CMakeLists.txt +++ b/transformer_engine/common/ck_fused_attn/CMakeLists.txt @@ -6,7 +6,7 @@ set(CMAKE_CXX_STANDARD 17) project(ck_fused_attn LANGUAGES HIP CXX) -set(AITER_MHA_INSTALL_PREFIX "transformer_engine" CACHE STRING "aiter mha shared lib install prefix in TE") +set(AITER_MHA_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/transformer_engine/lib") #Corresponding runtime check is in nvte_get_fused_attn_backend() list(FIND CMAKE_HIP_ARCHITECTURES "gfx1250" _gfx1250_idx) @@ -67,22 +67,48 @@ else() message(WARNING "Python interpreter not found; skipping AITER API validation.") endif() -if(DEFINED AITER_MHA_PATH) - message(STATUS "[AITER-BUILD] Using AITER_MHA_PATH=${AITER_MHA_PATH}") - # use pre-built te_libmha_fwd.so te_libmha_bwd.so - set(__AITER_MHA_PATH ${AITER_MHA_PATH}) -else() - set(__AITER_MHA_PATH "") +set(__AITER_CACHE_DIR "") +set(__AITER_MHA_PATH "") +set(__QOLA_INCLUDE_DIR "") +if(DEFINED ENV{AITER_MHA_PATH}) + message(STATUS "[AITER-BUILD] Using AITER_MHA_PATH=$ENV{AITER_MHA_PATH}") + # use pre-built libraries and includes from a location specified by the user + set(__AITER_CACHE_DIR $ENV{AITER_MHA_PATH}) +elseif(NOT "$ENV{NVTE_CK_JIT}" STREQUAL "1") #disable for CK_JIT for now + # use pre-built cache include("${CMAKE_CURRENT_LIST_DIR}/aiter_prebuilt.cmake") - get_prebuilt_aiter(__AITER_MHA_PATH) + get_prebuilt_aiter(__AITER_CACHE_DIR) +endif() - if(__AITER_MHA_PATH STREQUAL "") - # If not available, fallback: Build from source via QoLA - list(JOIN CMAKE_HIP_ARCHITECTURES ";" GPU_ARCHS_STR) - message(STATUS "[AITER-BUILD] Building AITER kernels for ${GPU_ARCHS_STR} via QoLA.") - set(__QOLA_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../3rdparty/QoLA") +if(__AITER_CACHE_DIR STREQUAL "") + # If not available, fallback: Build from source via QoLA + list(JOIN CMAKE_HIP_ARCHITECTURES ";" GPU_ARCHS_STR) + message(STATUS "[AITER-BUILD] Building AITER kernels for ${GPU_ARCHS_STR} via QoLA.") + set(__QOLA_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../3rdparty/QoLA") + set(__QOLA_MANIFEST "${CMAKE_CURRENT_LIST_DIR}/qola_manifest.toml") + if(NOT "$ENV{NVTE_CK_JIT}" STREQUAL "0") + set(__CK_JIT_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/ck_jit") + set(__QOLA_BUILD_DIR "${__CK_JIT_BUILD_DIR}/qola") #Need it under ck_jit to clean on full build + if(DEFINED ENV{NVTE_CK_JIT_DIR}) + set(__CK_JIT_SOURCE_DIR $ENV{NVTE_CK_JIT_DIR}) + else() + set(__CK_JIT_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../3rdparty/ck_jit") + endif() + execute_process( + COMMAND ${Python_EXECUTABLE} "${__CK_JIT_SOURCE_DIR}/ck_jit_build.py" full + --with-qola + --qola-dir ${__QOLA_DIR} + --qola-manifest ${__QOLA_MANIFEST} + --qola-output "${__QOLA_BUILD_DIR}" + --gpu-archs "${GPU_ARCHS_STR}" + --aiter-dir ${__AITER_SOURCE_DIR} + --tmp-dir "${__CK_JIT_BUILD_DIR}" + --install-dir ${AITER_MHA_INSTALL_DIR} + --jit-name "te_ck_jit" + RESULT_VARIABLE QOLA_BUILD_RESULT + ) + else() set(__QOLA_BUILD_DIR "${__QOLA_DIR}/build") - set(__QOLA_MANIFEST "${CMAKE_CURRENT_LIST_DIR}/qola_manifest.toml") execute_process( COMMAND ${CMAKE_COMMAND} -E env "PYTHONPATH=${__QOLA_DIR}:$ENV{PYTHONPATH}" ${Python_EXECUTABLE} -m qola.cli build @@ -92,22 +118,30 @@ else() --arch "${GPU_ARCHS_STR}" RESULT_VARIABLE QOLA_BUILD_RESULT ) - if(NOT QOLA_BUILD_RESULT EQUAL 0) - message(FATAL_ERROR "[AITER-BUILD] QoLA build failed.") - endif() + endif() + if(NOT QOLA_BUILD_RESULT EQUAL 0) + message(FATAL_ERROR "[AITER-BUILD] QoLA build failed.") + endif() + if("$ENV{NVTE_CK_JIT}" STREQUAL "1") + set(__AITER_MHA_PATH ${AITER_MHA_INSTALL_DIR}) + set(__QOLA_INCLUDE_DIR "${__QOLA_BUILD_DIR}/include") + else() # Copy the final .so libs and exported public headers into the aiter # prebuilt cache so downstream consumers see a self-contained tree. - get_default_aiter_cache_dir(__QOLA_CACHE_DIR) - set(__QOLA_CACHE_LIB "${__QOLA_CACHE_DIR}/lib") + get_default_aiter_cache_dir(__AITER_CACHE_DIR) + set(__QOLA_CACHE_LIB "${__AITER_CACHE_DIR}/lib") file(MAKE_DIRECTORY ${__QOLA_CACHE_LIB}) file(GLOB __QOLA_BUILT_LIBS "${__QOLA_BUILD_DIR}/lib/*.so") file(COPY ${__QOLA_BUILT_LIBS} DESTINATION ${__QOLA_CACHE_LIB}) - file(COPY "${__QOLA_BUILD_DIR}/include" DESTINATION "${__QOLA_CACHE_DIR}") + file(COPY "${__QOLA_BUILD_DIR}/include" DESTINATION "${__AITER_CACHE_DIR}") set(__AITER_MHA_PATH "${__QOLA_CACHE_LIB}") - else() - message(STATUS "[AITER-BUILD] Using pre-built AITER from ${__AITER_MHA_PATH}") + set(__QOLA_INCLUDE_DIR "${__AITER_CACHE_DIR}/include") endif() +else() + message(STATUS "[AITER-BUILD] Using pre-built AITER from ${__AITER_CACHE_DIR}") + set(__AITER_MHA_PATH "${__AITER_CACHE_DIR}/lib") + set(__QOLA_INCLUDE_DIR "${__AITER_CACHE_DIR}/include") endif() set(ck_fused_attn_SOURCES) @@ -129,7 +163,6 @@ list(APPEND CK_FUSED_ATTN_COMPILE_OPTIONS # Public QoLA headers ship alongside the .so libs in ${__AITER_MHA_PATH}/../include # (emitted by qola.cli build, or copied from the QoLA build dir above for the # source-build path). -set(__QOLA_INCLUDE_DIR "${__AITER_MHA_PATH}/../include") if(NOT EXISTS "${__QOLA_INCLUDE_DIR}/qola_config.h") message(FATAL_ERROR "Could not find QoLA public headers at ${__QOLA_INCLUDE_DIR}.") endif() @@ -146,5 +179,7 @@ target_link_libraries(ck_fused_attn PUBLIC ${ck_fused_attn_LINKER_LIBS}) target_compile_options(ck_fused_attn PRIVATE ${CK_FUSED_ATTN_COMPILE_OPTIONS}) set_target_properties(ck_fused_attn PROPERTIES INSTALL_RPATH "$ORIGIN") -install(FILES ${__AITER_MHA_PATH}/te_libmha_fwd.so ${__AITER_MHA_PATH}/te_libmha_bwd.so DESTINATION ${CMAKE_INSTALL_PREFIX}/${AITER_MHA_INSTALL_PREFIX}/lib) -install(TARGETS ck_fused_attn DESTINATION ${CMAKE_INSTALL_PREFIX}/${AITER_MHA_INSTALL_PREFIX}/lib) +if (NOT "${__AITER_MHA_PATH}" STREQUAL "${AITER_MHA_INSTALL_DIR}") + install(FILES ${__AITER_MHA_PATH}/te_libmha_fwd.so ${__AITER_MHA_PATH}/te_libmha_bwd.so DESTINATION ${AITER_MHA_INSTALL_DIR}) +endif() +install(TARGETS ck_fused_attn DESTINATION ${AITER_MHA_INSTALL_DIR}) diff --git a/transformer_engine/common/ck_fused_attn/aiter_prebuilt.cmake b/transformer_engine/common/ck_fused_attn/aiter_prebuilt.cmake index ea0396116..da6182105 100644 --- a/transformer_engine/common/ck_fused_attn/aiter_prebuilt.cmake +++ b/transformer_engine/common/ck_fused_attn/aiter_prebuilt.cmake @@ -48,7 +48,7 @@ function(get_prebuilt_aiter PREBUILT_DIR_VAR) is_aiter_cache_valid("${ROCM_VER_PARAM}" RESULT) if(RESULT) get_aiter_cache_key("${ROCM_VER_PARAM}" _UNUSED CACHE_DIR) - set(${PREBUILT_DIR_VAR} "${CACHE_DIR}/lib" PARENT_SCOPE) + set(${PREBUILT_DIR_VAR} "${CACHE_DIR}" PARENT_SCOPE) return() endif() endforeach() @@ -62,7 +62,7 @@ function(get_prebuilt_aiter PREBUILT_DIR_VAR) download_aiter_prebuilt("${ROCM_VER_PARAM}" RESULT) if(RESULT) get_aiter_cache_key("${ROCM_VER_PARAM}" _UNUSED CACHE_DIR) - set(${PREBUILT_DIR_VAR} "${CACHE_DIR}/lib" PARENT_SCOPE) + set(${PREBUILT_DIR_VAR} "${CACHE_DIR}" PARENT_SCOPE) return() endif() endforeach() From 98b67975ed67b8b5c071957386c025f6fef3efb0 Mon Sep 17 00:00:00 2001 From: Ilya Panfilov Date: Mon, 11 May 2026 14:20:25 -0400 Subject: [PATCH 2/3] Properly enable CK_JIT by default --- .../common/ck_fused_attn/CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/transformer_engine/common/ck_fused_attn/CMakeLists.txt b/transformer_engine/common/ck_fused_attn/CMakeLists.txt index 2c0d3b92b..c45500f58 100644 --- a/transformer_engine/common/ck_fused_attn/CMakeLists.txt +++ b/transformer_engine/common/ck_fused_attn/CMakeLists.txt @@ -70,11 +70,16 @@ endif() set(__AITER_CACHE_DIR "") set(__AITER_MHA_PATH "") set(__QOLA_INCLUDE_DIR "") +if(NOT "$ENV{NVTE_CK_JIT}" STREQUAL "0") + set(__USE_CK_JIT TRUE) +else() + set(__USE_CK_JIT FALSE) +endif() if(DEFINED ENV{AITER_MHA_PATH}) message(STATUS "[AITER-BUILD] Using AITER_MHA_PATH=$ENV{AITER_MHA_PATH}") # use pre-built libraries and includes from a location specified by the user set(__AITER_CACHE_DIR $ENV{AITER_MHA_PATH}) -elseif(NOT "$ENV{NVTE_CK_JIT}" STREQUAL "1") #disable for CK_JIT for now +elseif(NOT __USE_CK_JIT) #disable for CK_JIT for now # use pre-built cache include("${CMAKE_CURRENT_LIST_DIR}/aiter_prebuilt.cmake") get_prebuilt_aiter(__AITER_CACHE_DIR) @@ -86,7 +91,8 @@ if(__AITER_CACHE_DIR STREQUAL "") message(STATUS "[AITER-BUILD] Building AITER kernels for ${GPU_ARCHS_STR} via QoLA.") set(__QOLA_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../3rdparty/QoLA") set(__QOLA_MANIFEST "${CMAKE_CURRENT_LIST_DIR}/qola_manifest.toml") - if(NOT "$ENV{NVTE_CK_JIT}" STREQUAL "0") + if(__USE_CK_JIT) + message(STATUS "[AITER-BUILD] CK_JIT is enabled; will build AITER kernels via CK_JIT.") set(__CK_JIT_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/ck_jit") set(__QOLA_BUILD_DIR "${__CK_JIT_BUILD_DIR}/qola") #Need it under ck_jit to clean on full build if(DEFINED ENV{NVTE_CK_JIT_DIR}) @@ -139,7 +145,6 @@ if(__AITER_CACHE_DIR STREQUAL "") set(__QOLA_INCLUDE_DIR "${__AITER_CACHE_DIR}/include") endif() else() - message(STATUS "[AITER-BUILD] Using pre-built AITER from ${__AITER_CACHE_DIR}") set(__AITER_MHA_PATH "${__AITER_CACHE_DIR}/lib") set(__QOLA_INCLUDE_DIR "${__AITER_CACHE_DIR}/include") endif() From 0d4448591da46d62ba1d237163029bae3f9145dc Mon Sep 17 00:00:00 2001 From: Ilya Panfilov Date: Mon, 11 May 2026 17:43:42 -0400 Subject: [PATCH 3/3] Properly enable CK_JIT by default 2 --- transformer_engine/common/ck_fused_attn/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/common/ck_fused_attn/CMakeLists.txt b/transformer_engine/common/ck_fused_attn/CMakeLists.txt index c45500f58..5dac57cc6 100644 --- a/transformer_engine/common/ck_fused_attn/CMakeLists.txt +++ b/transformer_engine/common/ck_fused_attn/CMakeLists.txt @@ -99,7 +99,7 @@ if(__AITER_CACHE_DIR STREQUAL "") set(__CK_JIT_SOURCE_DIR $ENV{NVTE_CK_JIT_DIR}) else() set(__CK_JIT_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../3rdparty/ck_jit") - endif() + endif() execute_process( COMMAND ${Python_EXECUTABLE} "${__CK_JIT_SOURCE_DIR}/ck_jit_build.py" full --with-qola @@ -129,7 +129,7 @@ if(__AITER_CACHE_DIR STREQUAL "") message(FATAL_ERROR "[AITER-BUILD] QoLA build failed.") endif() - if("$ENV{NVTE_CK_JIT}" STREQUAL "1") + if(__USE_CK_JIT) set(__AITER_MHA_PATH ${AITER_MHA_INSTALL_DIR}) set(__QOLA_INCLUDE_DIR "${__QOLA_BUILD_DIR}/include") else()