From 2a66cacc0743bd35e0ea4fa7dc5541d6e0ff51d9 Mon Sep 17 00:00:00 2001
From: Cathal Lawlor <cathal.lawlor@arm.com>
Date: Fri, 29 May 2026 16:54:42 +0100
Subject: [PATCH] Apply fp16 split independent fixes

Squash the fp16 split independent fixes into a single IP review patch while preserving the original branch history separately.

Signed-off-by: Cathal Lawlor <cathal.lawlor@arm.com>
---
 .../core/mlas/lib/kleidiai/qgemm_kleidiai.cpp | 22 +++++-----
 .../test/contrib_ops/attention_op_test.cc     |  6 ++-
 onnxruntime/test/mlas/unittest/test_conv2d.h  | 41 ++++++++++---------
 3 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
index 9e70b0742217a..2e5306d442476 100644
--- a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
+++ b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
@@ -101,7 +101,7 @@ MLASCALL
 ArmKleidiAI::MlasDynamicQGemmBatch(
     const MLAS_GEMM_DYN_QUANT_SHAPE_PARAMS& Shape,
     const MLAS_GEMM_DYN_QUANT_DATA_PARAMS* DataParams,
-    const size_t BatchSize,
+    const size_t BatchN,
     MLAS_THREADPOOL* ThreadPool
 ) {
 
@@ -112,7 +112,7 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
     size_t m_step = qgemm_gemm.ukernel.get_m_step();
     size_t n_step = qgemm_gemm.ukernel.get_n_step();
 
-    if (BatchSize == 0 || Shape.M == 0 || Shape.N == 0 || Shape.K == 0) {
+    if (BatchN == 0 || Shape.M == 0 || Shape.N == 0 || Shape.K == 0) {
         return;
     }
 
@@ -123,7 +123,7 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
         MLAS_THROW_EX(std::runtime_error, "Dynamic QGEMM requires valid DataParams.");
     }
 
-    for (size_t batch_idx = 0; batch_idx < BatchSize; ++batch_idx) {
+    for (size_t batch_idx = 0; batch_idx < BatchN; ++batch_idx) {
         const auto& params = DataParams[batch_idx];
 
         if (params.A == nullptr) {
@@ -151,24 +151,24 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
     const size_t LhsPackedStride = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(Shape.M, Shape.K, mr, kr, sr);
     std::byte* LhsPackedData = nullptr;
 
-    if (g_kai_tls_qgemm.lhs_packed.capacity() < LhsPackedStride * BatchSize) {
+    if (g_kai_tls_qgemm.lhs_packed.capacity() < LhsPackedStride * BatchN) {
 
-        g_kai_tls_qgemm.lhs_packed.reserve(LhsPackedStride * BatchSize);
+        g_kai_tls_qgemm.lhs_packed.reserve(LhsPackedStride * BatchN);
     }
-    g_kai_tls_qgemm.lhs_packed.resize(LhsPackedStride * BatchSize);
+    g_kai_tls_qgemm.lhs_packed.resize(LhsPackedStride * BatchN);
     LhsPackedData = g_kai_tls_qgemm.lhs_packed.data();
 
     // Per-batch table of LHS base pointers.
-    if (g_kai_tls_qgemm.lhs_base_table.capacity() < BatchSize) {
+    if (g_kai_tls_qgemm.lhs_base_table.capacity() < BatchN) {
 
-        g_kai_tls_qgemm.lhs_base_table.reserve(BatchSize);
+        g_kai_tls_qgemm.lhs_base_table.reserve(BatchN);
     }
-    g_kai_tls_qgemm.lhs_base_table.resize(BatchSize);
+    g_kai_tls_qgemm.lhs_base_table.resize(BatchN);
     // Capture the shared batch table pointer so worker threads use the same backing storage.
     const std::byte** tls_lhs_base = g_kai_tls_qgemm.lhs_base_table.data();
     // B batches require no packing.
     // We have already decided the matmul variant we are using before having values for M, N, and K.
-    MlasTrySimpleParallel(ThreadPool, BatchSize, [&](ptrdiff_t batch_idx) {
+    MlasTrySimpleParallel(ThreadPool, BatchN, [&](ptrdiff_t batch_idx) {
 
         std::byte* lhs = nullptr;
         if (DataParams[batch_idx].Workspace && DataParams[batch_idx].WorkspaceSize >= LhsPackedStride) {
@@ -184,7 +184,7 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
 
     // Tile iteration dimensions.
     std::array<size_t, 3> dim;
-    dim[0] = BatchSize;                  // B
+    dim[0] = BatchN;                     // B
     dim[1] = MlasDivRoundup(Shape.M, m_step);  // M
     dim[2] = MlasDivRoundup(Shape.N, n_step);  // N
 
diff --git a/onnxruntime/test/contrib_ops/attention_op_test.cc b/onnxruntime/test/contrib_ops/attention_op_test.cc
index 6268e425ebb61..ad4c8941da782 100644
--- a/onnxruntime/test/contrib_ops/attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/attention_op_test.cc
@@ -521,8 +521,12 @@ TEST(ContribOpAttentionTest, AttentionBatch1_Float16) {
       3.154296875, 0.1082763671875, 4.25, 5.6484375,
       3.970703125, 0.072998046875, 4.25, 5.6484375};
 
+  // WebGPU Attention does not support mask_index input.
+  constexpr bool disable_webgpu = true;
   RunAttentionTest(input_data, weight_data, bias_data, mask_index_data, output_data,
-                   batch_size, sequence_length, hidden_size, number_of_heads, true);
+                   batch_size, sequence_length, hidden_size, number_of_heads, true /*use_float16*/,
+                   false, false, 0, nullptr, nullptr, AttentionMaskType::MASK_1D_KEY_SEQ_LEN, 0, 0,
+                   false /*disable_cpu*/, false /*disable_cuda*/, false /*disable_dml*/, disable_webgpu);
 }
 
 TEST(ContribOpAttentionTest, AttentionBatch2) {
diff --git a/onnxruntime/test/mlas/unittest/test_conv2d.h b/onnxruntime/test/mlas/unittest/test_conv2d.h
index ef65da4adb031..5f5d5e6b500c3 100644
--- a/onnxruntime/test/mlas/unittest/test_conv2d.h
+++ b/onnxruntime/test/mlas/unittest/test_conv2d.h
@@ -7,7 +7,7 @@
 
 #include "test_util.h"
 
-#if defined(MLAS_TARGET_AMD64)
+#if defined(MLAS_TARGET_AMD64) || defined(USE_KLEIDIAI)
 #include "core/mlas/lib/mlasi.h"
 #endif
 
@@ -304,6 +304,25 @@ class MlasConv2DTest : public MlasTestBase {
 
   MlasConv2DTest() : threadpool_(Threaded ? GetMlasThreadPool() : nullptr) {}
 
+  void TestKleidiAiLhsPointerCacheByPadGrowth() {
+#if defined(USE_KLEIDIAI)
+    if (GetMlasPlatform().MlasConvPrepareOverride == nullptr) {
+      return;
+    }
+
+    // KleidiAI Conv caches LHS indirection tables by the per-thread padding
+    // buffer address. Grow the padding buffer, then reuse the original shape so
+    // stale pad pointers in the old cache entry would corrupt the result.
+    for (int i = 0; i < 4; ++i) {
+      Test(1, 1, 64, 11, 11, 32, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1);
+      Test(1, 1, 320, 11, 11, 32, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1);
+      Test(1, 1, 64, 11, 11, 32, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1);
+    }
+#else
+    return;
+#endif
+  }
+
 #if defined(MLAS_TARGET_AMD64)
   void TestMobileClipAvx512DispatchSelection(size_t GroupCount,
                                              size_t InputHeight,
@@ -705,24 +724,7 @@ class MlasConv2DTest : public MlasTestBase {
       }
     }
 
-    //
-    // Regression test: exercise a KleidiAI Conv2D path when KleidiAI is enabled.
-    // See https://github.com/microsoft/onnxruntime/issues/26669.
-    //
-    // The KleidiAI implementation uses an internal per-thread padding buffer for out-of-bounds pixels
-    // when constructing the LHS indirection table. Historically, if the buffer was too small for a later
-    // convolution (larger CI), resizing could invalidate cached indirection pointers and lead to
-    // non-deterministic corruption.
-    //
-    // This sequence forces pad-buffer growth by running a smaller-CI convolution followed by a larger-CI
-    // convolution (with padding to ensure pad pointers are used), then runs the smaller-CI convolution again.
-    // Repeat a few times to increase the likelihood of triggering a reallocation and verify the path.
-    //
-    for (int i = 0; i < 4; ++i) {
-      Test(1, 1, 64, 11, 11, 32, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1);   // smaller CI
-      Test(1, 1, 320, 11, 11, 32, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1);  // larger CI forces pad buffer growth
-      Test(1, 1, 64, 11, 11, 32, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1);   // sanity: back to smaller CI after growth
-    }
+    TestKleidiAiLhsPointerCacheByPadGrowth();
   }
 
   void ExecuteShort(void) override {
@@ -735,5 +737,6 @@ class MlasConv2DTest : public MlasTestBase {
     TestMobileClipBetaActivationRegression(128, 32, 32);
     TestMobileClipBetaActivationRegression(256, 16, 16);
     TestBatchedConv3DWorkingBufferUsesThreadTileSize();
+    TestKleidiAiLhsPointerCacheByPadGrowth();
   }
 };