diff --git a/Android.bp b/Android.bp index 39ba145195..f97978deaa 100644 --- a/Android.bp +++ b/Android.bp @@ -462,6 +462,7 @@ cc_library_static { "src/cpu/kernels/CpuScatterKernel.cpp", "src/cpu/kernels/CpuSoftmaxKernel.cpp", "src/cpu/kernels/CpuSubKernel.cpp", + "src/cpu/kernels/CpuTopKVKernel.cpp", "src/cpu/kernels/CpuTransposeKernel.cpp", "src/cpu/kernels/CpuWeightsReshapeKernel.cpp", "src/cpu/kernels/CpuWinogradConv2dKernel.cpp", @@ -611,6 +612,10 @@ cc_library_static { "src/cpu/kernels/sub/neon/qasymm8.cpp", "src/cpu/kernels/sub/neon/qasymm8_signed.cpp", "src/cpu/kernels/sub/neon/qsymm16.cpp", + "src/cpu/kernels/topkv/generic/neon/fp16.cpp", + "src/cpu/kernels/topkv/generic/neon/fp32.cpp", + "src/cpu/kernels/topkv/generic/neon/qasymm8.cpp", + "src/cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp", "src/cpu/operators/CpuActivation.cpp", "src/cpu/operators/CpuAdd.cpp", "src/cpu/operators/CpuAddMulAdd.cpp", @@ -649,6 +654,7 @@ cc_library_static { "src/cpu/operators/CpuScatter.cpp", "src/cpu/operators/CpuSoftmax.cpp", "src/cpu/operators/CpuSub.cpp", + "src/cpu/operators/CpuTopKV.cpp", "src/cpu/operators/CpuTranspose.cpp", "src/cpu/operators/CpuWinogradConv2d.cpp", "src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp", @@ -978,6 +984,7 @@ cc_library_static { "src/runtime/NEON/functions/NEStackLayer.cpp", "src/runtime/NEON/functions/NEStridedSlice.cpp", "src/runtime/NEON/functions/NETile.cpp", + "src/runtime/NEON/functions/NETopKV.cpp", "src/runtime/NEON/functions/NETranspose.cpp", "src/runtime/NEON/functions/NEUnstack.cpp", "src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp", diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h index 49a6e9fefb..82388da2d7 100644 --- a/arm_compute/runtime/NEON/NEFunctions.h +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2025 Arm Limited. + * Copyright (c) 2016-2026 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -112,6 +112,7 @@ #include "arm_compute/runtime/NEON/functions/NEStackLayer.h" #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h" #include "arm_compute/runtime/NEON/functions/NETile.h" +#include "arm_compute/runtime/NEON/functions/NETopKV.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/runtime/NEON/functions/NEUnstack.h" #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" diff --git a/arm_compute/runtime/NEON/functions/NETopKV.h b/arm_compute/runtime/NEON/functions/NETopKV.h new file mode 100644 index 0000000000..8f63b2a62e --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NETopKV.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2026 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NETOPKV_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NETOPKV_H
+
+/** @file
+ * @publicapi
+ */
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+
+/** Basic function to run cpu::kernels::CpuTopKVKernel
+ *
+ * @note For each batch element, the function reports whether the target class is among the top-k prediction scores.
+ */
+class NETopKV : public IFunction
+{
+public:
+    /** Constructor */
+    NETopKV();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NETopKV(const NETopKV &) = delete;
+    /** Default move constructor */
+    NETopKV(NETopKV &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NETopKV &operator=(const NETopKV &) = delete;
+    /** Default move assignment operator */
+    NETopKV &operator=(NETopKV &&);
+    /** Destructor */
+    ~NETopKV();
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  predictions A batch_size x classes tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED
+     * @param[in]  targets     A batch_size 1D tensor of class ids. Data types supported: U32
+     * @param[out] output      Computed precision at @p k as a bool 1D tensor. Data types supported: U8
+     * @param[in]  k           Number of top elements to look at for computing precision.
+     */
+    void configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k);
+
+    /** Static function to check if given info will lead to a valid configuration.
+ * + * Similar to @ref NETopKV::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k); + + // Inherited methods overridden + void run() override; + +private: + struct Impl; + std::unique_ptr _impl; +}; +} // namespace arm_compute +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NETOPKV_H diff --git a/filelist.json b/filelist.json index 9cf4b72eb1..ae92ef763a 100644 --- a/filelist.json +++ b/filelist.json @@ -2453,6 +2453,26 @@ ] } }, + "TopKV": { + "files": { + "common": [ + "src/cpu/kernels/CpuTopKVKernel.cpp", + "src/cpu/operators/CpuTopKV.cpp", + "src/runtime/NEON/functions/NETopKV.cpp" + ], + "neon": { + "fp16": [ "src/cpu/kernels/topkv/generic/neon/fp16.cpp" ], + "fp32": [ "src/cpu/kernels/topkv/generic/neon/fp32.cpp" ], + "qasymm8": [ + "src/cpu/kernels/topkv/generic/neon/qasymm8.cpp" + ], + "qasymm8_signed": [ + "src/cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp" + ] + } + + } + }, "Transpose": { "files": { "common": [ diff --git a/src/BUILD.bazel b/src/BUILD.bazel index 3c0b83369a..6eb9570cd8 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -738,6 +738,7 @@ filegroup( "cpu/kernels/CpuScatterKernel.cpp", "cpu/kernels/CpuSoftmaxKernel.cpp", "cpu/kernels/CpuSubKernel.cpp", + "cpu/kernels/CpuTopKVKernel.cpp", "cpu/kernels/CpuTransposeKernel.cpp", "cpu/kernels/CpuWeightsReshapeKernel.cpp", "cpu/kernels/CpuWinogradConv2dKernel.cpp", @@ -848,6 +849,9 @@ filegroup( "cpu/kernels/sub/neon/qasymm8.cpp", "cpu/kernels/sub/neon/qasymm8_signed.cpp", "cpu/kernels/sub/neon/qsymm16.cpp", + "cpu/kernels/topkv/generic/neon/fp32.cpp", + "cpu/kernels/topkv/generic/neon/qasymm8.cpp", + "cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp", "cpu/operators/CpuActivation.cpp", "cpu/operators/CpuAdd.cpp", "cpu/operators/CpuAddMulAdd.cpp", @@ -886,6 +890,7 @@ filegroup( "cpu/operators/CpuScatter.cpp", "cpu/operators/CpuSoftmax.cpp", "cpu/operators/CpuSub.cpp", 
+ "cpu/operators/CpuTopKV.cpp", "cpu/operators/CpuTranspose.cpp", "cpu/operators/CpuWinogradConv2d.cpp", "cpu/operators/internal/CpuGemmAssemblyDispatch.cpp", @@ -994,6 +999,7 @@ filegroup( "runtime/NEON/functions/NEStackLayer.cpp", "runtime/NEON/functions/NEStridedSlice.cpp", "runtime/NEON/functions/NETile.cpp", + "runtime/NEON/functions/NETopKV.cpp", "runtime/NEON/functions/NETranspose.cpp", "runtime/NEON/functions/NEUnstack.cpp", "runtime/NEON/functions/NEWinogradConvolutionLayer.cpp", @@ -1109,7 +1115,8 @@ filegroup( "cpu/kernels/scatter/generic/neon/fp16.cpp", "cpu/kernels/select/generic/neon/fp16.cpp", "cpu/kernels/softmax/generic/neon/fp16.cpp", - "cpu/kernels/sub/neon/fp16.cpp"] + + "cpu/kernels/sub/neon/fp16.cpp", + "cpu/kernels/topkv/generic/neon/fp16.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 74847f6878..55e3c075e9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -732,6 +732,7 @@ target_sources( cpu/kernels/CpuScatterKernel.cpp cpu/kernels/CpuSoftmaxKernel.cpp cpu/kernels/CpuSubKernel.cpp + cpu/kernels/CpuTopKVKernel.cpp cpu/kernels/CpuTransposeKernel.cpp cpu/kernels/CpuWeightsReshapeKernel.cpp cpu/kernels/CpuWinogradConv2dKernel.cpp @@ -842,6 +843,9 @@ target_sources( cpu/kernels/sub/neon/qasymm8.cpp cpu/kernels/sub/neon/qasymm8_signed.cpp cpu/kernels/sub/neon/qsymm16.cpp + cpu/kernels/topkv/generic/neon/fp32.cpp + cpu/kernels/topkv/generic/neon/qasymm8.cpp + cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp cpu/operators/CpuActivation.cpp cpu/operators/CpuAdd.cpp cpu/operators/CpuAddMulAdd.cpp @@ -880,6 +884,7 @@ target_sources( cpu/operators/CpuScatter.cpp cpu/operators/CpuSoftmax.cpp cpu/operators/CpuSub.cpp + cpu/operators/CpuTopKV.cpp cpu/operators/CpuTranspose.cpp cpu/operators/CpuWinogradConv2d.cpp cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -988,6 +993,7 @@ target_sources( runtime/NEON/functions/NEStackLayer.cpp runtime/NEON/functions/NEStridedSlice.cpp 
runtime/NEON/functions/NETile.cpp + runtime/NEON/functions/NETopKV.cpp runtime/NEON/functions/NETranspose.cpp runtime/NEON/functions/NEUnstack.cpp runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -1109,4 +1115,5 @@ target_sources( cpu/kernels/select/generic/neon/fp16.cpp cpu/kernels/softmax/generic/neon/fp16.cpp cpu/kernels/sub/neon/fp16.cpp + cpu/kernels/topkv/generic/neon/fp16.cpp ) \ No newline at end of file diff --git a/src/cpu/kernels/CpuTopKVKernel.cpp b/src/cpu/kernels/CpuTopKVKernel.cpp new file mode 100644 index 0000000000..e0e17c3d32 --- /dev/null +++ b/src/cpu/kernels/CpuTopKVKernel.cpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2026 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuTopKVKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/profile/acl_profile.h" +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/topkv/list.h" + +#include +#include + +namespace arm_compute +{ +namespace cpu +{ + +namespace kernels +{ +namespace +{ + +static const std::vector available_kernels = { + {"neon_fp32_topkv", [](const CpuTopKVKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::topkv_fp32_neon)}, + {"neon_fp16_topkv", + [](const CpuTopKVKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::topkv_fp16_neon)}, + {"neon_qu8_topkv", [](const CpuTopKVKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::topkv_qasymm8_neon)}, + {"neon_qs8_topkv", + [](const CpuTopKVKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::topkv_qasymm8_signed_neon)}}; + +Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, uint32_t k) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); + + // src0: predictions + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + + // src1: targets (class indices) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U32); + + // Basic dimensionality expectations: + // predictions: [C, N] (2D), targets: [N] (1D) + ARM_COMPUTE_RETURN_ERROR_ON(src0.num_dimensions() < 2); + 
ARM_COMPUTE_RETURN_ERROR_ON(src1.num_dimensions() < 1); + + const unsigned int C = src0.tensor_shape()[0]; // classes + const unsigned int N = src0.tensor_shape()[1]; // batch + + // k constraints + ARM_COMPUTE_RETURN_ERROR_ON_MSG(k == 0, "k must be > 0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(C == 0, "predictions classes dimension must be > 0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(k > C, "k must be <= number of classes (C)"); + + // targets must match batch + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1.tensor_shape()[0] != N, + "targets dimension must match predictions batch dimension (N)"); + + // Output is one byte per batch element + const TensorShape out_shape(N); + + // If dst is already configured, validate it + if (dst.total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst.tensor_shape() != out_shape, "dst shape must be [N]"); + } + + const auto uk = CpuTopKVKernel::get_implementation( + CpuTopKVKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa()}); + + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + return Status{}; +} +} // namespace + +void CpuTopKVKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, uint32_t k) +{ + ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuTopKVKernel::configure"); + ARM_COMPUTE_UNUSED(src1); // compiler error on v7a + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, k)); + + const auto uk = CpuTopKVKernel::get_implementation( + CpuTopKVKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()}); + + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _run_method = uk->ukernel; + _name = std::string("CpuTopKVKernel").append("/").append(uk->name); + _k = k; + // Auto initialize dst if not initialized + auto_init_if_empty(*dst, TensorShape(src0->dimension(1)), 1U, DataType::U8); + // 
Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} +size_t CpuTopKVKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + ARM_COMPUTE_UNUSED(platform); + + return 1024u; +} + +Status CpuTopKVKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, uint32_t k) +{ + ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuTopKVKernel::validate"); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, k)); + + return Status{}; +} + +void CpuTopKVKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuTopKVKernel::run_op"); + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + _run_method(src0, src1, dst, _k, window); +} + +const char *CpuTopKVKernel::name() const +{ + return _name.c_str(); +} + +const std::vector &CpuTopKVKernel::get_available_kernels() +{ + return available_kernels; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuTopKVKernel.h b/src/cpu/kernels/CpuTopKVKernel.h new file mode 100644 index 0000000000..ebcddbcb36 --- /dev/null +++ b/src/cpu/kernels/CpuTopKVKernel.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2026 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_CPUTOPKVKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUTOPKVKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+using CpuTopKVKernelDataTypeISASelectorData    = DataTypeISASelectorData;
+using CpuTopKVKernelDataTypeISASelectorDataPtr = DataTypeISASelectorPtr;
+
+/** Interface for the kernel that computes, per batch element, whether the target class is within the top-k predictions */
+class CpuTopKVKernel : public ICpuKernel<CpuTopKVKernel>
+{
+private:
+    using TopKVKernelPtr =
+        std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, uint32_t, const Window &)>::type;
+
+public:
+    struct TopKVKernel
+    {
+        const char                                    *name;
+        const CpuTopKVKernelDataTypeISASelectorDataPtr is_selected;
+        TopKVKernelPtr                                 ukernel;
+    };
+
+    CpuTopKVKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuTopKVKernel);
+    /** Initialise the kernel's inputs and dst.
+     *
+     * @param[in]  src0 First input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in]  src1 Second input tensor info. Data types supported: U32
+     * @param[out] dst  The dst tensor info. Data types supported: U8
+     * @param[in]  k    Number of top elements to look at for computing precision.
+     */
+    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, uint32_t k);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to CpuTopKVKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, uint32_t k);
+
+    // Inherited methods overridden:
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+
+    /** Return minimum workload size of the relevant kernel
+     *
+     * @param[in] platform     The CPU platform used to create the context.
+     * @param[in] thread_count Number of threads in the execution.
+     *
+     * @return[out] small_network_mws Minimum workload size for requested configuration.
+     */
+    size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+    static const std::vector<TopKVKernel> &get_available_kernels();
+
+private:
+    uint32_t       _k{};
+    TopKVKernelPtr _run_method{nullptr};
+    std::string    _name{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_CPUTOPKVKERNEL_H
diff --git a/src/cpu/kernels/topkv/generic/neon/fp16.cpp b/src/cpu/kernels/topkv/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..14c5d9f4a3
--- /dev/null
+++ b/src/cpu/kernels/topkv/generic/neon/fp16.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2026 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/common/utils/profile/acl_profile.h" +#include "src/core/helpers/WindowHelpers.h" + +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +void topkv_fp16_neon(const ITensor *in1, const ITensor *in2, ITensor *out, uint32_t k, const Window &win) +{ + // predictions: [C, N] F16 + // targets: [N] U32 + // output: [N] U8 + + const auto &pred_info = *in1->info(); + const unsigned int C = pred_info.tensor_shape()[0]; + + // Match FP32 reference behaviour: threshold = target_val + epsilon (in float) + const float eps = std::numeric_limits::epsilon(); + + // Assume classes are contiguous for each sample so we can vector load. + ARM_COMPUTE_ERROR_ON(pred_info.strides_in_bytes()[0] != sizeof(float16_t)); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int n = id.x(); // scheduled over output [N] + + const uint32_t t = *reinterpret_cast(in2->ptr_to_element(Coordinates{n})); + + // Pointer to predictions[0, n] + const float16_t *base = reinterpret_cast(in1->ptr_to_element(Coordinates{0, n})); + + // Read target value in FP16, compute threshold in FP32, then convert to FP16 for compares + const float target_val_f32 = static_cast(base[t]); + const float threshold_f32 = target_val_f32 + eps; + + // Convert threshold to fp16 and broadcast + const float16x8_t thr = vdupq_n_f16(static_cast(threshold_f32)); + + unsigned int c = 0; + uint16x8_t acc = vdupq_n_u16(0); + + // Process 8 classes at a time (8 x fp16 lanes) + for (; c + 8 <= C; c += 8) + { + // Load 8 consecutive FP16 values + const float16x8_t v = vld1q_f16(reinterpret_cast(base + c)); + + // Compare v > threshold (lane-wise), returns 0xFFFF/0x0000 per lane + const uint16x8_t m = vcgtq_f16(v, thr); + + // Convert mask to 0/1 by 
shifting MSB into LSB, accumulate in u16 lanes + acc = vaddq_u16(acc, vshrq_n_u16(m, 15)); + } + + // Reduce accumulator to scalar + uint32_t rank = 0; + +#if defined(__aarch64__) + // Sum all 8 lanes + rank += static_cast(vaddvq_u16(acc)); +#else + // ARMv7: manual horizontal sum + uint16x4_t s0 = vadd_u16(vget_low_u16(acc), vget_high_u16(acc)); + s0 = vpadd_u16(s0, s0); + s0 = vpadd_u16(s0, s0); + rank += vget_lane_u16(s0, 0); +#endif + + // Tail + for (; c < C; ++c) + { + const float v = static_cast(base[c]); + rank += (v > threshold_f32) ? 1u : 0u; + } + + *reinterpret_cast(out->ptr_to_element(Coordinates{n})) = static_cast(rank < k); + }); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/topkv/generic/neon/fp32.cpp b/src/cpu/kernels/topkv/generic/neon/fp32.cpp new file mode 100644 index 0000000000..3d78e9fe9e --- /dev/null +++ b/src/cpu/kernels/topkv/generic/neon/fp32.cpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2026 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/common/utils/profile/acl_profile.h" +#include "src/core/helpers/WindowHelpers.h" + +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +void topkv_fp32_neon(const ITensor *in1, const ITensor *in2, ITensor *out, uint32_t k, const Window &win) +{ + ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "topkv_fp32_neon"); + + // predictions: [C, N] F32 + // targets: [N] U32 + // output: [N] U8 + + const auto &pred_info = *in1->info(); + const unsigned int C = pred_info.tensor_shape()[0]; + const float eps = std::numeric_limits::epsilon(); + + // We assume classes are stored contiguously for each sample: + // stride[0] must equal sizeof(float) so we can use vld1q_f32 safely. + ARM_COMPUTE_ERROR_ON(pred_info.strides_in_bytes()[0] != sizeof(float)); + + execute_window_loop(win, + [&](const Coordinates &id) + { + // Window is scheduled over output, so DimX corresponds to batch index (n) + const int n = id.x(); + + // Load target class index for sample n + const uint32_t t = *reinterpret_cast(in2->ptr_to_element(Coordinates{n})); + + // Pointer to predictions[0, n], i.e. 
the first class score for sample n + const float *base = reinterpret_cast(in1->ptr_to_element(Coordinates{0, n})); + + // Read prediction score for the target class + const float target_val = base[t]; + + // Threshold used for strict comparison: + // equivalent to (v - target_val) > epsilon in the reference + const float threshold = target_val + eps; + + // Broadcast threshold into a NEON vector + const float32x4_t thr = vdupq_n_f32(threshold); + + unsigned int c = 0; + + // Accumulator holding the count of values greater than threshold + // Each lane accumulates 0 or 1 per comparison + uint32x4_t acc = vdupq_n_u32(0); + + // Process 4 class scores at a time + for (; c + 4 <= C; c += 4) + { + // Load 4 consecutive prediction values: + // v = { base[c], base[c+1], base[c+2], base[c+3] } + const float32x4_t v = vld1q_f32(base + c); + + // Compare v > threshold elementwise + // Result is a mask: 0xFFFFFFFF where true, 0x00000000 where false + const uint32x4_t m = vcgtq_f32(v, thr); + + // Convert mask to {0,1} by shifting MSB into LSB position + // Then accumulate into acc + acc = vaddq_u32(acc, vshrq_n_u32(m, 31)); + } + + // Horizontal reduction of acc into scalar rank + uint32_t rank = 0; + +#if defined(__aarch64__) + // AArch64 has a single instruction to sum all lanes + rank += vaddvq_u32(acc); +#else + // ARMv7: pairwise add manually + uint32x2_t acc2 = vadd_u32(vget_low_u32(acc), vget_high_u32(acc)); + acc2 = vpadd_u32(acc2, acc2); + rank += vget_lane_u32(acc2, 0); +#endif + + // Handle remaining classes (tail) scalar + for (; c < C; ++c) + { + rank += (base[c] > threshold) ? 
1u : 0u; + } + + // Output 1 if target class is in top-k (rank < k), else 0 + *reinterpret_cast(out->ptr_to_element(Coordinates{n})) = + static_cast(rank < k); + }); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/topkv/generic/neon/qasymm8.cpp b/src/cpu/kernels/topkv/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..48b35e5901 --- /dev/null +++ b/src/cpu/kernels/topkv/generic/neon/qasymm8.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2026 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/common/utils/profile/acl_profile.h" +#include "src/core/helpers/WindowHelpers.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +void topkv_qasymm8_neon(const ITensor *in1, const ITensor *in2, ITensor *out, uint32_t k, const Window &win) +{ + // predictions: [C, N] QASYMM8 (U8) + // targets: [N] U32 + // output: [N] U8 + + const auto &pred_info = *in1->info(); + const unsigned int C = pred_info.tensor_shape()[0]; + + // Assume classes are contiguous for each sample + ARM_COMPUTE_ERROR_ON(pred_info.strides_in_bytes()[0] != sizeof(uint8_t)); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int n = id.x(); // scheduled over output [N] + + const uint32_t t = *reinterpret_cast(in2->ptr_to_element(Coordinates{n})); + + const uint8_t *base = reinterpret_cast(in1->ptr_to_element(Coordinates{0, n})); + + const uint8_t target_val = base[t]; + const uint8x16_t thr = vdupq_n_u8(target_val); + + unsigned int c = 0; + + // Accumulate counts in u16 lanes to avoid overflow + uint16x8_t acc0 = vdupq_n_u16(0); + uint16x8_t acc1 = vdupq_n_u16(0); + + for (; c + 16 <= C; c += 16) + { + // Load 16 u8 scores + const uint8x16_t v = vld1q_u8(base + c); + + // Compare v > target_val -> mask bytes are 0xFF (true) or 0x00 (false) + const uint8x16_t m = vcgtq_u8(v, thr); + + // Turn 0xFF/0x00 into 1/0 cheaply: mask & 1 + const uint8x16_t ones = vandq_u8(m, vdupq_n_u8(1)); + + // Widen low/high halves to u16 and accumulate + acc0 = vaddq_u16(acc0, vmovl_u8(vget_low_u8(ones))); + acc1 = vaddq_u16(acc1, vmovl_u8(vget_high_u8(ones))); + } + + uint32_t rank = 0; + +#if defined(__aarch64__) + rank += vaddvq_u16(acc0); + rank += vaddvq_u16(acc1); +#else + // ARMv7 horizontal sum for acc0 + { + uint16x4_t s = vadd_u16(vget_low_u16(acc0), vget_high_u16(acc0)); + s = vpadd_u16(s, s); + s 
= vpadd_u16(s, s); + rank += vget_lane_u16(s, 0); + } + // ARMv7 horizontal sum for acc1 + { + uint16x4_t s = vadd_u16(vget_low_u16(acc1), vget_high_u16(acc1)); + s = vpadd_u16(s, s); + s = vpadd_u16(s, s); + rank += vget_lane_u16(s, 0); + } +#endif + + // Tail + for (; c < C; ++c) + { + rank += (base[c] > target_val) ? 1u : 0u; + } + + *reinterpret_cast(out->ptr_to_element(Coordinates{n})) = static_cast(rank < k); + }); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..223094f4ba --- /dev/null +++ b/src/cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2026 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/common/utils/profile/acl_profile.h" +#include "src/core/helpers/WindowHelpers.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ + +void topkv_qasymm8_signed_neon(const ITensor *in1, const ITensor *in2, ITensor *out, uint32_t k, const Window &win) +{ + // predictions: [C, N] QASYMM8_SIGNED (S8) + // targets: [N] U32 + // output: [N] U8 + + const auto &pred_info = *in1->info(); + const unsigned int C = pred_info.tensor_shape()[0]; + + // Assume classes are contiguous for each sample + ARM_COMPUTE_ERROR_ON(pred_info.strides_in_bytes()[0] != sizeof(int8_t)); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int n = id.x(); // scheduled over output [N] + + const uint32_t t = *reinterpret_cast(in2->ptr_to_element(Coordinates{n})); + + const int8_t *base = reinterpret_cast(in1->ptr_to_element(Coordinates{0, n})); + + const int8_t target_val = base[t]; + const int8x16_t thr = vdupq_n_s8(target_val); + + unsigned int c = 0; + uint16x8_t acc0 = vdupq_n_u16(0); + uint16x8_t acc1 = vdupq_n_u16(0); + + for (; c + 16 <= C; c += 16) + { + const int8x16_t v = vld1q_s8(base + c); + + // Signed compare v > thr. + // NOTE: vcgtq_s8 returns a uint8x16_t mask (0xFF/0x00 per lane) on AArch64. 
+ const uint8x16_t m = vcgtq_s8(v, thr); + + // Convert mask to 1/0: mask & 1 + const uint8x16_t ones = vandq_u8(m, vdupq_n_u8(1)); + + // Widen and accumulate + acc0 = vaddq_u16(acc0, vmovl_u8(vget_low_u8(ones))); + acc1 = vaddq_u16(acc1, vmovl_u8(vget_high_u8(ones))); + } + + uint32_t rank = 0; + +#if defined(__aarch64__) + rank += vaddvq_u16(acc0); + rank += vaddvq_u16(acc1); +#else + { + uint16x4_t s = vadd_u16(vget_low_u16(acc0), vget_high_u16(acc0)); + s = vpadd_u16(s, s); + s = vpadd_u16(s, s); + rank += vget_lane_u16(s, 0); + } + { + uint16x4_t s = vadd_u16(vget_low_u16(acc1), vget_high_u16(acc1)); + s = vpadd_u16(s, s); + s = vpadd_u16(s, s); + rank += vget_lane_u16(s, 0); + } +#endif + + for (; c < C; ++c) + { + rank += (base[c] > target_val) ? 1u : 0u; + } + + *reinterpret_cast(out->ptr_to_element(Coordinates{n})) = static_cast(rank < k); + }); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/topkv/list.h b/src/cpu/kernels/topkv/list.h new file mode 100644 index 0000000000..b3488ae94c --- /dev/null +++ b/src/cpu/kernels/topkv/list.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2026 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_TOPKV_LIST_H +#define ACL_SRC_CPU_KERNELS_TOPKV_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_TOPKV_KERNEL(func_name) \ + void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, uint32_t k, const Window &win) + +DECLARE_TOPKV_KERNEL(topkv_qasymm8_neon); +DECLARE_TOPKV_KERNEL(topkv_qasymm8_signed_neon); +DECLARE_TOPKV_KERNEL(topkv_fp16_neon); +DECLARE_TOPKV_KERNEL(topkv_fp32_neon); + +#undef DECLARE_TOPKV_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_TOPKV_LIST_H diff --git a/src/cpu/operators/CpuTopKV.cpp b/src/cpu/operators/CpuTopKV.cpp new file mode 100644 index 0000000000..a509b29c34 --- /dev/null +++ b/src/cpu/operators/CpuTopKV.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2026 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuTopKV.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/IOperator.h" +#include "src/common/utils/LegacySupport.h" +#include "src/common/utils/Log.h" +#include "src/common/utils/profile/acl_profile.h" +#include "src/cpu/kernels/CpuTopKVKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuTopKV::configure(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) +{ + ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuTopKV::configure"); + ARM_COMPUTE_LOG_PARAMS(predictions, targets, output, k); + + auto kernel = std::make_unique(); + kernel->configure(predictions, targets, output, k); + _kernel = std::move(kernel); +} + +Status CpuTopKV::validate(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) +{ + ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuTopKV::validate"); + return kernels::CpuTopKVKernel::validate(predictions, targets, output, k); +} + +void CpuTopKV::run(ITensorPack &tensors) +{ + ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuTopKV::run"); + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + NEScheduler::get().schedule_op(_kernel.get(), Window::DimX, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuTopKV.h 
b/src/cpu/operators/CpuTopKV.h new file mode 100644 index 0000000000..859c36e8c9 --- /dev/null +++ b/src/cpu/operators/CpuTopKV.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2026 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUTOPKV_H +#define ACL_SRC_CPU_OPERATORS_CPUTOPKV_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuActivationKernel */ +class CpuTopKV : public ICpuOperator +{ +public: + /** Set the input and output of the kernel. + * + * @param[in] predictions A classes x batches tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED + * @param[in] targets A batch_size 1D tensor of class ids. Data types supported: U32 + * @param[out] output Computed precision at @p k as a bool 1D tensor. 
Data types supported: U8 + * @param[in] k Number of top elements to look at for computing precision. + */ + void + configure(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k); + + /** Static function to check if given info will lead to a valid configuration. + * + * Similar to @ref CpuTopKV::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUTOPKV_H diff --git a/src/runtime/NEON/functions/NETopKV.cpp b/src/runtime/NEON/functions/NETopKV.cpp new file mode 100644 index 0000000000..7a6eb1d692 --- /dev/null +++ b/src/runtime/NEON/functions/NETopKV.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2026 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NETopKV.h" + +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/profile/acl_profile.h" +#include "src/cpu/operators/CpuTopKV.h" + +namespace arm_compute +{ +struct NETopKV::Impl +{ + const ITensor *predictions{nullptr}; + const ITensor *targets{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; +}; + +NETopKV::NETopKV() : _impl(std::make_unique()) +{ +} +NETopKV::NETopKV(NETopKV &&) = default; +NETopKV &NETopKV::operator=(NETopKV &&) = default; +NETopKV::~NETopKV() = default; + +void NETopKV::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k) +{ + ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "NETopKV::configure"); + _impl->predictions = predictions; + _impl->targets = targets; + _impl->dst = output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->predictions, _impl->targets, _impl->dst); + + _impl->op = std::make_unique(); + _impl->op->configure(_impl->predictions->info(), _impl->targets->info(), _impl->dst->info(), k); +} + +Status +NETopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +{ + ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "NETopKV::validate"); + ARM_COMPUTE_RETURN_ERROR_ON_DYNAMIC_SHAPE(predictions, targets, output); + return cpu::CpuTopKV::validate(predictions, targets, output, k); +} + +void NETopKV::run() +{ + ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "NETopKV::run"); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->predictions); + pack.add_tensor(TensorType::ACL_SRC_1, 
_impl->targets); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index 83ef352a75..14a62cd65c 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2025 Arm Limited. + * Copyright (c) 2017-2026 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -1078,6 +1078,25 @@ class LogisticSMEStressShapesFp32 final : public ShapeDataset { } }; + +/** Data set containing small 2D tensor shapes for TopKV operator. */ +class SmallTopKV final : public ShapeDataset +{ +public: + SmallTopKV() : ShapeDataset("Shape", {TensorShape{8U, 7U}, TensorShape{15U, 13U}, TensorShape{32U, 64U}}) + { + } +}; + +/** Data set containing small 2D tensor shapes for TopKV operator. */ +class LargeTopKV final : public ShapeDataset +{ +public: + LargeTopKV() + : ShapeDataset("Shape", {TensorShape{1000U, 64U}, TensorShape{1500U, 128U}, TensorShape{1000U, 32000U}}) + { + } +}; } // namespace datasets } // namespace test } // namespace arm_compute diff --git a/tests/validation/NEON/TopKV.cpp b/tests/validation/NEON/TopKV.cpp new file mode 100644 index 0000000000..b794edb034 --- /dev/null +++ b/tests/validation/NEON/TopKV.cpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2026 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/Acl.hpp" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/runtime/CPP/functions/CPPTopKV.h" +#include "arm_compute/runtime/NEON/functions/NETopKV.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include "support/AclRequires.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/framework/Macros.h" +#include "tests/NEON/Accessor.h" +#include "tests/validation/fixtures/TopKVLayerFixture.h" +#include "tests/validation/Validation.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +using framework::dataset::make; + +const auto small_dataset_topkv = combine(datasets::SmallTopKV(), make("K", 3, 5)); +const auto large_dataset_topkv = combine(datasets::LargeTopKV(), make("K", 3, 5)); +const auto f32_small_dataset = combine(small_dataset_topkv, make("DataType", DataType::F32)); +const auto f32_large_dataset = combine(large_dataset_topkv, make("DataType", DataType::F32)); +const auto f16_small_dataset = combine(small_dataset_topkv, make("DataType", DataType::F16)); +const auto f16_large_dataset = combine(large_dataset_topkv, make("DataType", DataType::F16)); +const auto qu8_small_dataset = combine(small_dataset_topkv, make("DataType", DataType::QASYMM8)); +const auto qu8_large_dataset = combine(large_dataset_topkv, make("DataType", DataType::QASYMM8)); +const auto qs8_small_dataset = combine(small_dataset_topkv, make("DataType", DataType::QASYMM8_SIGNED)); +const auto qs8_large_dataset = combine(large_dataset_topkv, make("DataType", DataType::QASYMM8_SIGNED)); + +TEST_SUITE(NEON) +TEST_SUITE(TopKVLayer) +// clang-format on +// *INDENT-ON* +template +using NETopKVFixture = TopKVValidationFixture; + +TEST_SUITE(Quantized) +TEST_SUITE(QASYMM8) +FIXTURE_DATA_TEST_CASE(RunSmall, 
NETopKVFixture, framework::DatasetMode::ALL, qu8_small_dataset) +{ + // Validate output + validate(Accessor(_target), _reference, AbsoluteTolerance(0)); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NETopKVFixture, framework::DatasetMode::NIGHTLY, qu8_large_dataset) +{ + // Validate output + validate(Accessor(_target), _reference, AbsoluteTolerance(0)); +} + +TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, NETopKVFixture, framework::DatasetMode::ALL, qs8_small_dataset) +{ + // Validate output + validate(Accessor(_target), _reference, AbsoluteTolerance(0)); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NETopKVFixture, framework::DatasetMode::NIGHTLY, qs8_large_dataset) +{ + // Validate output + validate(Accessor(_target), _reference, AbsoluteTolerance(0)); +} + +TEST_SUITE_END() // QASYMM8_SIGNED +TEST_SUITE_END() // Quantized + +TEST_SUITE(Float) +#ifdef ARM_COMPUTE_ENABLE_FP16 +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall, NETopKVFixture, framework::DatasetMode::ALL, f16_small_dataset) +{ + if (CPUInfo::get().has_fp16()) + { + // Validate output + validate(Accessor(_target), _reference, AbsoluteTolerance(0)); + } + else + { + ARM_COMPUTE_TEST_WARNING("Device does not support fp16 vector operations. Test SKIPPED."); + framework::ARM_COMPUTE_PRINT_WARNING(); + } +} +FIXTURE_DATA_TEST_CASE(RunLarge, NETopKVFixture, framework::DatasetMode::NIGHTLY, f16_large_dataset) +{ + if (CPUInfo::get().has_fp16()) + { + // Validate output + validate(Accessor(_target), _reference, AbsoluteTolerance(0)); + } + else + { + ARM_COMPUTE_TEST_WARNING("Device does not support fp16 vector operations. 
Test SKIPPED."); + framework::ARM_COMPUTE_PRINT_WARNING(); + } +} + +TEST_SUITE_END() // FP16 +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + +TEST_SUITE(FP32) + +FIXTURE_DATA_TEST_CASE(RunSmall, NETopKVFixture, framework::DatasetMode::ALL, f32_small_dataset) +{ + // Validate output + validate(Accessor(_target), _reference, AbsoluteTolerance(0)); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, NETopKVFixture, framework::DatasetMode::NIGHTLY, f32_large_dataset) +{ + // Validate output + validate(Accessor(_target), _reference, AbsoluteTolerance(0)); +} + +TEST_SUITE_END() // FP32 +TEST_SUITE_END() // Float + +TEST_SUITE_END() // TopKVLayer +TEST_SUITE_END() // Neon + +TEST_SUITE(CPP) +TEST_SUITE(TopKVLayer) + +template +using CPPTopKVLayerFixture = TopKVValidationFixture; + +TEST_SUITE(Float) +TEST_SUITE(FP32) +// Used only for benchmarking +FIXTURE_DATA_TEST_CASE(RunLarge, CPPTopKVLayerFixture, framework::DatasetMode::DISABLED, f32_large_dataset) +{ + // Validate output + validate(Accessor(_target), _reference, AbsoluteTolerance(0)); +} +TEST_SUITE_END() // FP32 +TEST_SUITE_END() // Float + +TEST_SUITE_END() // TopKVLayer +TEST_SUITE_END() // Neon + +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/fixtures/TopKVLayerFixture.h b/tests/validation/fixtures/TopKVLayerFixture.h new file mode 100644 index 0000000000..c2d13efa37 --- /dev/null +++ b/tests/validation/fixtures/TopKVLayerFixture.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2026 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_TESTS_VALIDATION_FIXTURES_TOPKVLAYERFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_TOPKVLAYERFIXTURE_H + +#include "tests/AssetsLibrary.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/Globals.h" +#include "tests/validation/reference/TopKV.h" + +#include +#include +#include +#include + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +template +class TopKVValidationFixture : public framework::Fixture +{ +public: + void setup(TensorShape predictions_shape, uint32_t k, DataType input_data_type) + { + if (std::is_same::value && // Cpu + input_data_type == DataType::F16 && !CPUInfo::get().has_fp16()) + { + return; + } + const TensorShape targets_shape(predictions_shape[1]); + _data_type = input_data_type; + _target = compute_target(predictions_shape, targets_shape, k); + _reference = compute_reference(predictions_shape, targets_shape, k); + } + +protected: + // Fills prediction scores with small random noise, assigns a random target class to each sample, + // and slightly boosts the target score so some targets fall inside the top-K and others do not. 
+ template + void fill(U &&predictions, W &&targets, int32_t i, uint32_t C) + { + std::mt19937 gen(0); + + // 1) Fill targets with valid class indices in [0, C-1] + std::uniform_int_distribution class_dist(0, C - 1); + library->fill(targets, class_dist, i); + + const unsigned int N = targets.shape()[0]; + + // 2) Fill predictions with small noise + 3) add a per-sample boost to the target class + if (_data_type == DataType::F32 || _data_type == DataType::F16) + { + std::normal_distribution noise(0.f, 0.1f); + library->fill(predictions, noise, i); + + // Mixture of boosts so output is not always 1 + std::uniform_real_distribution u01(0.f, 1.f); + std::normal_distribution strong_boost(0.8f, 0.15f); + std::normal_distribution medium_boost(0.35f, 0.15f); + std::normal_distribution weak_boost(0.05f, 0.10f); + + for (unsigned int n = 0; n < N; ++n) + { + const uint32_t target = *reinterpret_cast(targets(Coordinates{static_cast(n)})); + + float boost = 0.f; + const float r = u01(gen); + if (r < 0.35f) + { + boost = strong_boost(gen); + } + else if (r < 0.80f) + { + boost = medium_boost(gen); + } + else + { + boost = weak_boost(gen); + } + boost = std::max(-0.2f, std::min(boost, 1.5f)); + + const Coordinates pc{static_cast(target), static_cast(n)}; + + if (_data_type == DataType::F32) + { + *reinterpret_cast(predictions(pc)) += boost; + } + else // F16 + { + auto *p = reinterpret_cast(predictions(pc)); + *p = static_cast(static_cast(*p) + boost); + } + } + return; + } + + if (_data_type == DataType::QASYMM8 || _data_type == DataType::QASYMM8_SIGNED) + { + const bool is_signed = (_data_type == DataType::QASYMM8_SIGNED); + + // Small integer noise + if (is_signed) + { + std::uniform_int_distribution noise(-6, 6); + library->fill(predictions, noise, i); + } + else + { + std::uniform_int_distribution noise(0, 12); + library->fill(predictions, noise, i); + } + + // Small, variable boost in quantized units + std::uniform_int_distribution boost_dist(0, 18); + + for (unsigned int n 
= 0; n < N; ++n) + { + const uint32_t target = *reinterpret_cast(targets(Coordinates{static_cast(n)})); + + const int boost = boost_dist(gen); + const Coordinates pc{static_cast(target), static_cast(n)}; + + if (is_signed) + { + auto *p = reinterpret_cast(predictions(pc)); + const int v = static_cast(*p) + boost; + *p = static_cast(std::max(-128, std::min(127, v))); + } + else + { + auto *p = reinterpret_cast(predictions(pc)); + const int v = static_cast(*p) + boost; + *p = static_cast(std::max(0, std::min(255, v))); + } + } + return; + } + } + + TensorType compute_target(const TensorShape &pred_shape, const TensorShape &targets_shape, uint32_t k) + { + // Create tensors + TensorType pred = create_tensor(pred_shape, _data_type, 1, QuantizationInfo()); + TensorType targets = create_tensor(targets_shape, DataType::U32, 1, QuantizationInfo()); + TensorType output; + + // Create and configure function + FunctionType topkv_layer; + topkv_layer.configure(&pred, &targets, &output, k); + + ARM_COMPUTE_ASSERT(pred.info()->is_resizable()); + ARM_COMPUTE_ASSERT(targets.info()->is_resizable()); + + // Allocate tensors + pred.allocator()->allocate(); + targets.allocator()->allocate(); + output.allocator()->allocate(); + + ARM_COMPUTE_ASSERT(!pred.info()->is_resizable()); + + // Fill tensors + const auto C{pred_shape[0]}; + fill(AccessorType(pred), AccessorType(targets), 0, C); + + topkv_layer.run(); + + return output; + } + + SimpleTensor compute_reference(const TensorShape &pred_shape, const TensorShape &targets_shape, uint32_t k) + { + // Create reference + SimpleTensor pred{pred_shape, _data_type, 1, QuantizationInfo()}; + SimpleTensor targets{targets_shape, DataType::U32, 1, QuantizationInfo()}; + + const auto C{pred_shape[0]}; + fill(pred, targets, 0, C); + + return reference::topkv(pred, targets, k); + } + +protected: + TensorType _target{}; + SimpleTensor _reference{}; + DataType _data_type{}; +}; + +} // namespace validation +} // namespace test +} // namespace 
arm_compute +#endif // ACL_TESTS_VALIDATION_FIXTURES_TOPKVLAYERFIXTURE_H diff --git a/tests/validation/reference/TopKV.cpp b/tests/validation/reference/TopKV.cpp new file mode 100644 index 0000000000..7cb3a0fb93 --- /dev/null +++ b/tests/validation/reference/TopKV.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2026 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "TopKV.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/CoreTypes.h" +#include "arm_compute/core/TensorShape.h" + +#include "tests/SimpleTensor.h" + +#include +#include + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace reference +{ + +template +SimpleTensor topkv(SimpleTensor &predictions, SimpleTensor &targets, uint32_t k) +{ + const TensorShape &ps = predictions.shape(); + const unsigned int C = ps[0]; // classes + const unsigned int N = ps[1]; // batch + + SimpleTensor expected(TensorShape(N), DataType::U8); + + const float eps = std::numeric_limits::epsilon(); + + for (unsigned int i = 0; i < N; ++i) + { + // targets[i] (U32) + const uint32_t target_class = *reinterpret_cast(targets(Coordinates{static_cast(i)})); + + // Read predictions[target_class, i] as T, then promote to float + const T target_t = + *reinterpret_cast(predictions(Coordinates{static_cast(target_class), static_cast(i)})); + const float target_val = static_cast(target_t); + + unsigned int rank = 0; + for (unsigned int c = 0; c < C; ++c) + { + const T vt = + *reinterpret_cast(predictions(Coordinates{static_cast(c), static_cast(i)})); + const float v = static_cast(vt); + + if ((v - target_val) > eps) + { + ++rank; + } + } + + expected[i] = static_cast(rank < k); + } + + return expected; +} + +template SimpleTensor topkv(SimpleTensor &, SimpleTensor &, uint32_t); +template SimpleTensor topkv(SimpleTensor &, SimpleTensor &, uint32_t); +template SimpleTensor topkv(SimpleTensor &, SimpleTensor &, uint32_t); +template SimpleTensor topkv(SimpleTensor &, SimpleTensor &, uint32_t); + +} // namespace reference +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/reference/TopKV.h b/tests/validation/reference/TopKV.h new file mode 100644 index 0000000000..5cb1772c12 --- /dev/null +++ b/tests/validation/reference/TopKV.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2026 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_TESTS_VALIDATION_REFERENCE_TOPKV_H +#define ACL_TESTS_VALIDATION_REFERENCE_TOPKV_H + +#include "tests/SimpleTensor.h" +#include "tests/validation/Helpers.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace reference +{ +template +SimpleTensor topkv(SimpleTensor &predictions, SimpleTensor &targets, uint32_t k); +} // namespace reference +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif // ACL_TESTS_VALIDATION_REFERENCE_TOPKV_H