Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 62 additions & 5 deletions test/npu_validation/scripts/generate_testcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -1861,6 +1861,8 @@ def generate_testcase(
kernel_has_tscatter = "TSCATTER" in raw_kernel
kernel_has_tgather = "TGATHER" in raw_kernel
kernel_has_tgatherb = "TGATHERB" in raw_kernel
kernel_has_mscatter = "MSCATTER" in raw_kernel
kernel_has_mgather = "MGATHER" in raw_kernel
# Some kernels use an integer tensor as "indices". The safe in-range domain
# depends on the op semantics:
# - TSCATTER: use a deterministic, collision-free permutation so NPU-vs-NPU
Expand All @@ -1872,6 +1874,27 @@ def generate_testcase(
index_mod = max(elem_count, 1)
elif kernel_has_tgather and not kernel_has_tgatherb:
index_mod = max(elem_count, 1)
mgather_table_input = None
if kernel_has_mgather:
for p in init_ptrs:
if p.get("role") == "input":
mgather_table_input = p
break
mscatter_indices_input = None
mscatter_output = output_ptrs[0] if kernel_has_mscatter and output_ptrs else None
if kernel_has_mscatter:
for p in reversed(init_ptrs):
p_dtype = _np_dtype_for_cpp(p["cpp_type"])
if p.get("role") == "input" and (
p_dtype.startswith("np.int") or p_dtype.startswith("np.uint")
):
mscatter_indices_input = p
break
Comment on lines +1886 to +1892
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The heuristic for identifying mscatter_indices_input uses reversed(init_ptrs), which selects the last integer input as the indices. This is inconsistent with the tscatter logic (which selects the first integer input) and may be incorrect depending on the operand order of the MSCATTER operation. In mscatter.py, arg0 appears to be the indices and arg1 the data; if so, this heuristic will misidentify arg1 as the indices, potentially breaking the compare_bin_at_indices logic later. Consider whether iterating init_ptrs in forward order (picking the first integer input) would be more appropriate, or whether a more robust identification method — e.g. matching on the operand's position in the kernel signature — is needed.

if mscatter_output is not None:
index_mod = max(
int(ptr_elem_counts.get(mscatter_output["name"], logical_elem_count)),
1,
)
mrgsort_packed = "TMRGSORT" in raw_kernel
for p in init_ptrs:
np_dtype = _np_dtype_for_cpp(p["cpp_type"])
Expand All @@ -1880,6 +1903,18 @@ def generate_testcase(
is_output = p.get("role") == "output"
is_integer = np_dtype.startswith("np.int") or np_dtype.startswith("np.uint")
is_tscatter_indices = kernel_has_tscatter and p.get("role") == "input" and is_integer and size == elem_count
is_mscatter_indices = (
kernel_has_mscatter
and mscatter_indices_input is not None
and name == mscatter_indices_input["name"]
)
is_mgather_indices = (
kernel_has_mgather
and mgather_table_input is not None
and p.get("role") == "input"
and is_integer
and name != mgather_table_input["name"]
)
is_tgatherb_offset = kernel_has_tgatherb and p.get("role") == "input" and is_integer and size < elem_count
is_tgatherb_src = kernel_has_tgatherb and p.get("role") == "input" and not is_tgatherb_offset
# If the kernel has both inputs and outputs, default to zero-init for
Expand Down Expand Up @@ -1954,6 +1989,26 @@ def generate_testcase(
f" {name} = ({name}__row_perm * {cols} + {name}__cols).astype({np_dtype}).reshape(-1)"
)
input_generate.append(f" {name}.tofile(\"{name}.bin\")")
elif is_mscatter_indices:
out_count = (
int(ptr_elem_counts.get(mscatter_output["name"], logical_elem_count))
if mscatter_output is not None
else max(size, 1)
)
input_generate.append(
f" {name} = (np.arange({size}, dtype=np.int64) % {out_count}).astype({np_dtype}, copy=False)"
)
input_generate.append(f" {name}.tofile(\"{name}.bin\")")
elif is_mgather_indices:
table_count = (
int(ptr_elem_counts.get(mgather_table_input['name'], logical_elem_count))
if mgather_table_input is not None
else max(size, 1)
)
input_generate.append(
f" {name} = (np.arange({size}, dtype=np.int64) % {table_count}).astype({np_dtype}, copy=False)"
)
input_generate.append(f" {name}.tofile(\"{name}.bin\")")
elif is_tgatherb_offset:
input_generate.append(f" {name} = (np.arange({size}, dtype=np.uint32) * 32).astype({np_dtype})")
input_generate.append(f" {name}.tofile(\"{name}.bin\")")
Expand Down Expand Up @@ -2205,13 +2260,15 @@ def generate_testcase(
compare_template = (templates_root / "compare_template.py").read_text(encoding="utf-8")
compare_lines = [" ok = True"]
compare_prefix_counts = {}
tscatter_indices_input = None
scatter_indices_input = None
if kernel_has_tscatter:
for p in init_ptrs:
p_dtype = _np_dtype_for_cpp(p["cpp_type"])
if p.get("role") == "input" and (p_dtype.startswith("np.int") or p_dtype.startswith("np.uint")):
tscatter_indices_input = p
scatter_indices_input = p
break
elif kernel_has_mscatter and mscatter_indices_input is not None:
scatter_indices_input = mscatter_indices_input
for p in output_ptrs:
name = p["name"]
req = inferred_counts.get(name)
Expand Down Expand Up @@ -2242,16 +2299,16 @@ def generate_testcase(
eps = _default_eps_for_cpp_type(p["cpp_type"])
is_bf16_output = _is_bf16_cpp_type(p["cpp_type"])
bf16_max_ulp = _default_bf16_max_ulp_for_cpp_type(p["cpp_type"])
if kernel_has_tscatter and tscatter_indices_input is not None:
if (kernel_has_tscatter or kernel_has_mscatter) and scatter_indices_input is not None:
if is_bf16_output:
compare_lines.append(
f" ok = compare_bf16_bin_at_indices(\"golden_{name}.bin\", \"{name}.bin\", {bf16_max_ulp}, "
f"\"{tscatter_indices_input['name']}.bin\", {_np_dtype_for_cpp(tscatter_indices_input['cpp_type'])}) and ok"
f"\"{scatter_indices_input['name']}.bin\", {_np_dtype_for_cpp(scatter_indices_input['cpp_type'])}) and ok"
)
else:
compare_lines.append(
f" ok = compare_bin_at_indices(\"golden_{name}.bin\", \"{name}.bin\", {np_dtype}, {eps}, "
f"\"{tscatter_indices_input['name']}.bin\", {_np_dtype_for_cpp(tscatter_indices_input['cpp_type'])}) and ok"
f"\"{scatter_indices_input['name']}.bin\", {_np_dtype_for_cpp(scatter_indices_input['cpp_type'])}) and ok"
)
elif has_packed_pred_mask and p["cpp_type"] in {"uint8_t", "int8_t"}:
compare_lines.append(
Expand Down
15 changes: 15 additions & 0 deletions test/npu_validation/scripts/run_remote_npu_validation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,15 @@ else
fi
fi

# Return 0 iff the given symbol string occurs (as a fixed string) in any
# header/source file under ${PTO_ISA_ROOT}/include or ${PTO_ISA_ROOT}/tests.
#
# Notes on the implementation:
#  - We collect matching file names with `grep -F -l` and test for non-empty
#    output instead of relying on the pipeline's exit status: when `find`
#    produces more files than fit in one xargs batch, a later batch with no
#    match would make `xargs` exit 123 even though an earlier batch matched,
#    yielding a false "symbol missing" result.
#  - `xargs -r` avoids running grep at all when `find` emits no files.
#  - `--` guards against symbols that start with a dash being parsed as
#    grep options.
pto_isa_has_symbol() {
    local symbol="$1"
    [[ -n "${symbol}" ]] || return 1
    local matches
    matches="$(find "${PTO_ISA_ROOT}/include" "${PTO_ISA_ROOT}/tests" \
        -type f \( -name '*.h' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.cc' \) \
        -print0 2>/dev/null \
        | xargs -0 -r grep -F -l -- "${symbol}" 2>/dev/null)"
    [[ -n "${matches}" ]]
}

status=0
ok_count=0
fail_count=0
Expand Down Expand Up @@ -267,6 +276,12 @@ while IFS= read -r -d '' cpp; do
log "SKIP: ${testcase} (SKIP_CASES)"
continue
fi
if [[ "${testcase}" == "partarg" ]] && ! pto_isa_has_symbol "TPARTARGMAX("; then
skip_count=$((skip_count + 1))
printf "%s\tSKIP\t%s\tpto-isa missing TPARTARGMAX/TPARTARGMIN\n" "${testcase}" "${STAGE}" >> "${RESULTS_TSV}"
log "SKIP: ${testcase} (pto-isa missing TPARTARG intrinsics)"
continue
fi
if [[ "${testcase}" == "gemvmx" ]]; then
soc_lc="$(printf '%s' "${SOC_VERSION:-}" | tr '[:upper:]' '[:lower:]')"
if [[ "$soc_lc" != *"a5"* && "$soc_lc" != *"950"* ]]; then
Expand Down
5 changes: 1 addition & 4 deletions test/samples/Abs/abs.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,7 @@ def build():
tv0 = pto.MakeTensorViewOp(tv2_f32, arg0, [c32, c32], [c32, c1]).result
tv1 = pto.MakeTensorViewOp(tv2_f32, arg1, [c32, c32], [c32, c1]).result

# Test pto.get_tensor_view_dim: get dim sizes from tensor_view and use as partition sizes
dim0 = pto.GetTensorViewDimOp(tv0, c0).result
dim1 = pto.GetTensorViewDimOp(tv0, c1).result
sv0 = pto.PartitionViewOp(tile_view_32, tv0, offsets=[c0, c0], sizes=[dim0, dim1]).result
sv0 = pto.PartitionViewOp(tile_view_32, tv0, offsets=[c0, c0], sizes=[c32, c32]).result

tb0 = pto.AllocTileOp(tile_buf_32).result
tb1 = pto.AllocTileOp(tile_buf_32).result
Expand Down
9 changes: 1 addition & 8 deletions test/samples/Mgather/mgather.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def build():

i32 = IntegerType.get_signless(32, ctx)
ptr_i32 = pto.PtrType.get(i32, ctx)

tv2_i32 = pto.TensorViewType.get(2, i32, ctx)
tile_view_32 = pto.PartitionTensorViewType.get([32, 32], i32, ctx)
vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx)
Expand All @@ -46,7 +45,6 @@ def build():

arg0, arg1, arg2 = entry.arguments

# %0/%1/%2 = pto.make_tensor_view %arg?, shape=[%c32,%c32] strides=[%c32,%c1]
tv0 = pto.MakeTensorViewOp(tv2_i32, arg0, [c32, c32], [c32, c1]).result
tv1 = pto.MakeTensorViewOp(tv2_i32, arg1, [c32, c32], [c32, c1]).result
tv2 = pto.MakeTensorViewOp(tv2_i32, arg2, [c32, c32], [c32, c1]).result
Expand All @@ -57,15 +55,10 @@ def build():
tb1 = pto.AllocTileOp(tile_buf_i32).result
tb2 = pto.AllocTileOp(tile_buf_i32).result

# pto.load_dps_tb ins(%sv) outs(%tb)
pto.TLoadOp(None, sv1, tb1) # result=None

pto.TLoadOp(None, sv1, tb1)
pto.MGatherOp(sv0, tb1, tb2)

# %8 = subview on output tensor_view
sv2 = pto.PartitionViewOp(tile_view_32, tv2, offsets=[c0, c0], sizes=[c32, c32]).result

# pto.store_dps_tb ins(%tb2) outs(%sv2)
pto.TStoreOp(None, tb2, sv2)

func.ReturnOp([])
Expand Down
5 changes: 1 addition & 4 deletions test/samples/Mscatter/mscatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def build():

i32 = IntegerType.get_signless(32, ctx)
ptr_i32 = pto.PtrType.get(i32, ctx)

tv2_i32 = pto.TensorViewType.get(2, i32, ctx)
tile_view_32 = pto.PartitionTensorViewType.get([32, 32], i32, ctx)
vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx)
Expand All @@ -46,7 +45,6 @@ def build():

arg0, arg1, arg2 = entry.arguments

# %0/%1/%2 = pto.make_tensor_view %arg?, shape=[%c32,%c32] strides=[%c32,%c1]
tv0 = pto.MakeTensorViewOp(tv2_i32, arg0, [c32, c32], [c32, c1]).result
tv1 = pto.MakeTensorViewOp(tv2_i32, arg1, [c32, c32], [c32, c1]).result
tv2 = pto.MakeTensorViewOp(tv2_i32, arg2, [c32, c32], [c32, c1]).result
Expand All @@ -58,9 +56,8 @@ def build():
tb0 = pto.AllocTileOp(tile_buf_i32).result
tb1 = pto.AllocTileOp(tile_buf_i32).result

# pto.load_dps_tb ins(%sv) outs(%tb)
pto.TLoadOp(None, sv0, tb0)
pto.TLoadOp(None, sv1, tb1) # result=None
pto.TLoadOp(None, sv1, tb1)

pto.MScatterOp(tb0, tb1, sv2)

Expand Down
40 changes: 28 additions & 12 deletions test/samples/Quant/quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@

"""TQuant INT8_SYM kernel sample.

tquant(src_f32, fp_f32) -> dst_i8
tquant(src_f32, scale_f32[row]) -> dst_i8

Loads a 32x32 f32 tile (src) and a 32x32 f32 scaling-factor tile (fp),
Loads a 32x32 f32 tile (src) and a 32x1 per-row scaling tile (scale),
performs symmetric INT8 quantization, and stores the int8 result tile.

Note: int8 tiles require Cols*sizeof(T) to be a multiple of 32 bytes
Expand Down Expand Up @@ -49,18 +49,27 @@ def _make_common_types(ctx):
tv2_i8 = pto.TensorViewType.get(2, i8, ctx)

ptv_f32 = pto.PartitionTensorViewType.get(_SHAPE, f32, ctx)
ptv_scale = pto.PartitionTensorViewType.get([_SHAPE[0], 1], f32, ctx)
ptv_i8 = pto.PartitionTensorViewType.get(_SHAPE, i8, ctx)

vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx)
bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx)
bl_col = pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx)
sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx)
pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx)
cfg = pto.TileBufConfigAttr.get(bl, sl, pto.TileConfig.fractalABSize, pd, ctx)
cfg_col = pto.TileBufConfigAttr.get(
bl_col, sl, pto.TileConfig.fractalABSize, pd, ctx
)

tb_f32 = pto.TileBufType.get(_SHAPE, f32, vec, _SHAPE, cfg, ctx)
tb_scale = pto.TileBufType.get(
[_SHAPE[0], 1], f32, vec, [_SHAPE[0], 1], cfg_col, ctx
)
tb_i8 = pto.TileBufType.get(_SHAPE, i8, vec, _SHAPE, cfg, ctx)

quant_sym = pto.QuantTypeAttr.get(pto.QuantType.INT8_SYM, ctx)
layout_dn = pto.LayoutAttr.get(pto.Layout.DN, ctx)

class NS:
pass
Expand All @@ -74,10 +83,13 @@ class NS:
ns.tv2_f32 = tv2_f32
ns.tv2_i8 = tv2_i8
ns.ptv_f32 = ptv_f32
ns.ptv_scale = ptv_scale
ns.ptv_i8 = ptv_i8
ns.tb_f32 = tb_f32
ns.tb_scale = tb_scale
ns.tb_i8 = tb_i8
ns.quant_sym = quant_sym
ns.layout_dn = layout_dn
return ns


Expand All @@ -91,7 +103,7 @@ def build():

# ------------------------------------------------------------------
# @tquant_sym_kernel(src_ptr: !pto.ptr<f32>,
# fp_ptr: !pto.ptr<f32>,
# scale_ptr: !pto.ptr<f32>,
# dst_ptr: !pto.ptr<i8>)
# ------------------------------------------------------------------
fn_sym_ty = func.FunctionType.get([t.ptr_f32, t.ptr_f32, t.ptr_i8], [])
Expand All @@ -109,14 +121,18 @@ def build():
c1 = arith.ConstantOp(idx, 1).result
c32 = arith.ConstantOp(idx, 32).result

src_ptr, fp_ptr, dst_ptr = entry_sym.arguments
src_ptr, scale_ptr, dst_ptr = entry_sym.arguments

# Make tensor views over the flat global-memory pointers.
tv_src = pto.MakeTensorViewOp(
t.tv2_f32, src_ptr, [c32, c32], [c32, c1]
).result
tv_fp = pto.MakeTensorViewOp(
t.tv2_f32, fp_ptr, [c32, c32], [c32, c1]
tv_scale = pto.MakeTensorViewOp(
t.tv2_f32,
scale_ptr,
[c32, c1],
[c1, c1],
layout=t.layout_dn,
).result
tv_dst = pto.MakeTensorViewOp(
t.tv2_i8, dst_ptr, [c32, c32], [c32, c1]
Expand All @@ -126,24 +142,24 @@ def build():
sv_src = pto.PartitionViewOp(
t.ptv_f32, tv_src, offsets=[c0, c0], sizes=[c32, c32]
).result
sv_fp = pto.PartitionViewOp(
t.ptv_f32, tv_fp, offsets=[c0, c0], sizes=[c32, c32]
sv_scale = pto.PartitionViewOp(
t.ptv_scale, tv_scale, offsets=[c0, c0], sizes=[c32, c1]
).result
sv_dst = pto.PartitionViewOp(
t.ptv_i8, tv_dst, offsets=[c0, c0], sizes=[c32, c32]
).result

# Allocate on-chip tile buffers.
tb_src = pto.AllocTileOp(t.tb_f32).result
tb_fp = pto.AllocTileOp(t.tb_f32).result
tb_scale = pto.AllocTileOp(t.tb_scale).result
tb_dst = pto.AllocTileOp(t.tb_i8).result

# Load src and fp tiles from global memory.
# Load src and per-row scale tiles from global memory.
pto.TLoadOp(None, sv_src, tb_src)
pto.TLoadOp(None, sv_fp, tb_fp)
pto.TLoadOp(None, sv_scale, tb_scale)

# INT8_SYM quantization (no offset operand).
pto.TQuantOp(tb_src, tb_fp, tb_dst, quant_type=t.quant_sym)
pto.TQuantOp(tb_src, tb_scale, tb_dst, quant_type=t.quant_sym)

# Store result back to global memory.
pto.TStoreOp(None, tb_dst, sv_dst)
Expand Down
Loading
Loading