Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 62 additions & 5 deletions test/npu_validation/scripts/generate_testcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -1861,6 +1861,8 @@ def generate_testcase(
kernel_has_tscatter = "TSCATTER" in raw_kernel
kernel_has_tgather = "TGATHER" in raw_kernel
kernel_has_tgatherb = "TGATHERB" in raw_kernel
kernel_has_mscatter = "MSCATTER" in raw_kernel
kernel_has_mgather = "MGATHER" in raw_kernel
# Some kernels use an integer tensor as "indices". The safe in-range domain
# depends on the op semantics:
# - TSCATTER: use a deterministic, collision-free permutation so NPU-vs-NPU
Expand All @@ -1872,6 +1874,27 @@ def generate_testcase(
index_mod = max(elem_count, 1)
elif kernel_has_tgather and not kernel_has_tgatherb:
index_mod = max(elem_count, 1)
mgather_table_input = None
if kernel_has_mgather:
for p in init_ptrs:
if p.get("role") == "input":
mgather_table_input = p
break
mscatter_indices_input = None
mscatter_output = output_ptrs[0] if kernel_has_mscatter and output_ptrs else None
if kernel_has_mscatter:
for p in reversed(init_ptrs):
p_dtype = _np_dtype_for_cpp(p["cpp_type"])
if p.get("role") == "input" and (
p_dtype.startswith("np.int") or p_dtype.startswith("np.uint")
):
mscatter_indices_input = p
break
Comment on lines +1886 to +1892
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The heuristic for identifying mscatter_indices_input uses reversed(init_ptrs), which selects the last integer input as the indices. This is inconsistent with the tscatter logic (which selects the first integer input) and may be incorrect depending on the operand order of the MSCATTER operation. In mscatter.py, arg0 appears to be the indices and arg1 the data; if so, this heuristic will misidentify arg1 as the indices, potentially breaking the compare_bin_at_indices logic later. Consider whether iterating init_ptrs in forward order (picking the first integer input) would be more appropriate, or whether a more robust identification method — e.g. matching on the operand's position in the kernel signature — is needed.

if mscatter_output is not None:
index_mod = max(
int(ptr_elem_counts.get(mscatter_output["name"], logical_elem_count)),
1,
)
mrgsort_packed = "TMRGSORT" in raw_kernel
for p in init_ptrs:
np_dtype = _np_dtype_for_cpp(p["cpp_type"])
Expand All @@ -1880,6 +1903,18 @@ def generate_testcase(
is_output = p.get("role") == "output"
is_integer = np_dtype.startswith("np.int") or np_dtype.startswith("np.uint")
is_tscatter_indices = kernel_has_tscatter and p.get("role") == "input" and is_integer and size == elem_count
is_mscatter_indices = (
kernel_has_mscatter
and mscatter_indices_input is not None
and name == mscatter_indices_input["name"]
)
is_mgather_indices = (
kernel_has_mgather
and mgather_table_input is not None
and p.get("role") == "input"
and is_integer
and name != mgather_table_input["name"]
)
is_tgatherb_offset = kernel_has_tgatherb and p.get("role") == "input" and is_integer and size < elem_count
is_tgatherb_src = kernel_has_tgatherb and p.get("role") == "input" and not is_tgatherb_offset
# If the kernel has both inputs and outputs, default to zero-init for
Expand Down Expand Up @@ -1954,6 +1989,26 @@ def generate_testcase(
f" {name} = ({name}__row_perm * {cols} + {name}__cols).astype({np_dtype}).reshape(-1)"
)
input_generate.append(f" {name}.tofile(\"{name}.bin\")")
elif is_mscatter_indices:
out_count = (
int(ptr_elem_counts.get(mscatter_output["name"], logical_elem_count))
if mscatter_output is not None
else max(size, 1)
)
input_generate.append(
f" {name} = (np.arange({size}, dtype=np.int64) % {out_count}).astype({np_dtype}, copy=False)"
)
input_generate.append(f" {name}.tofile(\"{name}.bin\")")
elif is_mgather_indices:
table_count = (
int(ptr_elem_counts.get(mgather_table_input['name'], logical_elem_count))
if mgather_table_input is not None
else max(size, 1)
)
input_generate.append(
f" {name} = (np.arange({size}, dtype=np.int64) % {table_count}).astype({np_dtype}, copy=False)"
)
input_generate.append(f" {name}.tofile(\"{name}.bin\")")
elif is_tgatherb_offset:
input_generate.append(f" {name} = (np.arange({size}, dtype=np.uint32) * 32).astype({np_dtype})")
input_generate.append(f" {name}.tofile(\"{name}.bin\")")
Expand Down Expand Up @@ -2205,13 +2260,15 @@ def generate_testcase(
compare_template = (templates_root / "compare_template.py").read_text(encoding="utf-8")
compare_lines = [" ok = True"]
compare_prefix_counts = {}
tscatter_indices_input = None
scatter_indices_input = None
if kernel_has_tscatter:
for p in init_ptrs:
p_dtype = _np_dtype_for_cpp(p["cpp_type"])
if p.get("role") == "input" and (p_dtype.startswith("np.int") or p_dtype.startswith("np.uint")):
tscatter_indices_input = p
scatter_indices_input = p
break
elif kernel_has_mscatter and mscatter_indices_input is not None:
scatter_indices_input = mscatter_indices_input
for p in output_ptrs:
name = p["name"]
req = inferred_counts.get(name)
Expand Down Expand Up @@ -2242,16 +2299,16 @@ def generate_testcase(
eps = _default_eps_for_cpp_type(p["cpp_type"])
is_bf16_output = _is_bf16_cpp_type(p["cpp_type"])
bf16_max_ulp = _default_bf16_max_ulp_for_cpp_type(p["cpp_type"])
if kernel_has_tscatter and tscatter_indices_input is not None:
if (kernel_has_tscatter or kernel_has_mscatter) and scatter_indices_input is not None:
if is_bf16_output:
compare_lines.append(
f" ok = compare_bf16_bin_at_indices(\"golden_{name}.bin\", \"{name}.bin\", {bf16_max_ulp}, "
f"\"{tscatter_indices_input['name']}.bin\", {_np_dtype_for_cpp(tscatter_indices_input['cpp_type'])}) and ok"
f"\"{scatter_indices_input['name']}.bin\", {_np_dtype_for_cpp(scatter_indices_input['cpp_type'])}) and ok"
)
else:
compare_lines.append(
f" ok = compare_bin_at_indices(\"golden_{name}.bin\", \"{name}.bin\", {np_dtype}, {eps}, "
f"\"{tscatter_indices_input['name']}.bin\", {_np_dtype_for_cpp(tscatter_indices_input['cpp_type'])}) and ok"
f"\"{scatter_indices_input['name']}.bin\", {_np_dtype_for_cpp(scatter_indices_input['cpp_type'])}) and ok"
)
elif has_packed_pred_mask and p["cpp_type"] in {"uint8_t", "int8_t"}:
compare_lines.append(
Expand Down
15 changes: 15 additions & 0 deletions test/npu_validation/scripts/run_remote_npu_validation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,15 @@ else
fi
fi

# Return 0 iff the given symbol string occurs (as a fixed string) in any
# header/source file under ${PTO_ISA_ROOT}/include or ${PTO_ISA_ROOT}/tests.
#
# Notes on the implementation:
#  - We collect matching file names with `grep -F -l` and test for non-empty
#    output instead of relying on the pipeline's exit status: when `find`
#    produces more files than fit in one xargs batch, a later batch with no
#    match would make `xargs` exit 123 even though an earlier batch matched,
#    yielding a false "symbol missing" result.
#  - `xargs -r` avoids running grep at all when `find` emits no files.
#  - `--` guards against symbols that start with a dash being parsed as
#    grep options.
pto_isa_has_symbol() {
    local symbol="$1"
    [[ -n "${symbol}" ]] || return 1
    local matches
    matches="$(find "${PTO_ISA_ROOT}/include" "${PTO_ISA_ROOT}/tests" \
        -type f \( -name '*.h' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.cc' \) \
        -print0 2>/dev/null \
        | xargs -0 -r grep -F -l -- "${symbol}" 2>/dev/null)"
    [[ -n "${matches}" ]]
}

status=0
ok_count=0
fail_count=0
Expand Down Expand Up @@ -267,6 +276,12 @@ while IFS= read -r -d '' cpp; do
log "SKIP: ${testcase} (SKIP_CASES)"
continue
fi
if [[ "${testcase}" == "partarg" ]] && ! pto_isa_has_symbol "TPARTARGMAX("; then
skip_count=$((skip_count + 1))
printf "%s\tSKIP\t%s\tpto-isa missing TPARTARGMAX/TPARTARGMIN\n" "${testcase}" "${STAGE}" >> "${RESULTS_TSV}"
log "SKIP: ${testcase} (pto-isa missing TPARTARG intrinsics)"
continue
fi
if [[ "${testcase}" == "gemvmx" ]]; then
soc_lc="$(printf '%s' "${SOC_VERSION:-}" | tr '[:upper:]' '[:lower:]')"
if [[ "$soc_lc" != *"a5"* && "$soc_lc" != *"950"* ]]; then
Expand Down
5 changes: 1 addition & 4 deletions test/samples/Abs/abs.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,7 @@ def build():
tv0 = pto.MakeTensorViewOp(tv2_f32, arg0, [c32, c32], [c32, c1]).result
tv1 = pto.MakeTensorViewOp(tv2_f32, arg1, [c32, c32], [c32, c1]).result

# Test pto.get_tensor_view_dim: get dim sizes from tensor_view and use as partition sizes
dim0 = pto.GetTensorViewDimOp(tv0, c0).result
dim1 = pto.GetTensorViewDimOp(tv0, c1).result
sv0 = pto.PartitionViewOp(tile_view_32, tv0, offsets=[c0, c0], sizes=[dim0, dim1]).result
sv0 = pto.PartitionViewOp(tile_view_32, tv0, offsets=[c0, c0], sizes=[c32, c32]).result

tb0 = pto.AllocTileOp(tile_buf_32).result
tb1 = pto.AllocTileOp(tile_buf_32).result
Expand Down
9 changes: 1 addition & 8 deletions test/samples/Mgather/mgather.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def build():

i32 = IntegerType.get_signless(32, ctx)
ptr_i32 = pto.PtrType.get(i32, ctx)

tv2_i32 = pto.TensorViewType.get(2, i32, ctx)
tile_view_32 = pto.PartitionTensorViewType.get([32, 32], i32, ctx)
vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx)
Expand All @@ -46,7 +45,6 @@ def build():

arg0, arg1, arg2 = entry.arguments

# %0/%1/%2 = pto.make_tensor_view %arg?, shape=[%c32,%c32] strides=[%c32,%c1]
tv0 = pto.MakeTensorViewOp(tv2_i32, arg0, [c32, c32], [c32, c1]).result
tv1 = pto.MakeTensorViewOp(tv2_i32, arg1, [c32, c32], [c32, c1]).result
tv2 = pto.MakeTensorViewOp(tv2_i32, arg2, [c32, c32], [c32, c1]).result
Expand All @@ -57,15 +55,10 @@ def build():
tb1 = pto.AllocTileOp(tile_buf_i32).result
tb2 = pto.AllocTileOp(tile_buf_i32).result

# pto.load_dps_tb ins(%sv) outs(%tb)
pto.TLoadOp(None, sv1, tb1) # result=None

pto.TLoadOp(None, sv1, tb1)
pto.MGatherOp(sv0, tb1, tb2)

# %8 = subview on output tensor_view
sv2 = pto.PartitionViewOp(tile_view_32, tv2, offsets=[c0, c0], sizes=[c32, c32]).result

# pto.store_dps_tb ins(%tb2) outs(%sv2)
pto.TStoreOp(None, tb2, sv2)

func.ReturnOp([])
Expand Down
5 changes: 1 addition & 4 deletions test/samples/Mscatter/mscatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def build():

i32 = IntegerType.get_signless(32, ctx)
ptr_i32 = pto.PtrType.get(i32, ctx)

tv2_i32 = pto.TensorViewType.get(2, i32, ctx)
tile_view_32 = pto.PartitionTensorViewType.get([32, 32], i32, ctx)
vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx)
Expand All @@ -46,7 +45,6 @@ def build():

arg0, arg1, arg2 = entry.arguments

# %0/%1/%2 = pto.make_tensor_view %arg?, shape=[%c32,%c32] strides=[%c32,%c1]
tv0 = pto.MakeTensorViewOp(tv2_i32, arg0, [c32, c32], [c32, c1]).result
tv1 = pto.MakeTensorViewOp(tv2_i32, arg1, [c32, c32], [c32, c1]).result
tv2 = pto.MakeTensorViewOp(tv2_i32, arg2, [c32, c32], [c32, c1]).result
Expand All @@ -58,9 +56,8 @@ def build():
tb0 = pto.AllocTileOp(tile_buf_i32).result
tb1 = pto.AllocTileOp(tile_buf_i32).result

# pto.load_dps_tb ins(%sv) outs(%tb)
pto.TLoadOp(None, sv0, tb0)
pto.TLoadOp(None, sv1, tb1) # result=None
pto.TLoadOp(None, sv1, tb1)

pto.MScatterOp(tb0, tb1, sv2)

Expand Down
40 changes: 28 additions & 12 deletions test/samples/Quant/quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@

"""TQuant INT8_SYM kernel sample.

tquant(src_f32, fp_f32) -> dst_i8
tquant(src_f32, scale_f32[row]) -> dst_i8

Loads a 32x32 f32 tile (src) and a 32x32 f32 scaling-factor tile (fp),
Loads a 32x32 f32 tile (src) and a 32x1 per-row scaling tile (scale),
performs symmetric INT8 quantization, and stores the int8 result tile.

Note: int8 tiles require Cols*sizeof(T) to be a multiple of 32 bytes
Expand Down Expand Up @@ -49,18 +49,27 @@ def _make_common_types(ctx):
tv2_i8 = pto.TensorViewType.get(2, i8, ctx)

ptv_f32 = pto.PartitionTensorViewType.get(_SHAPE, f32, ctx)
ptv_scale = pto.PartitionTensorViewType.get([_SHAPE[0], 1], f32, ctx)
ptv_i8 = pto.PartitionTensorViewType.get(_SHAPE, i8, ctx)

vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx)
bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx)
bl_col = pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx)
sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx)
pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx)
cfg = pto.TileBufConfigAttr.get(bl, sl, pto.TileConfig.fractalABSize, pd, ctx)
cfg_col = pto.TileBufConfigAttr.get(
bl_col, sl, pto.TileConfig.fractalABSize, pd, ctx
)

tb_f32 = pto.TileBufType.get(_SHAPE, f32, vec, _SHAPE, cfg, ctx)
tb_scale = pto.TileBufType.get(
[_SHAPE[0], 1], f32, vec, [_SHAPE[0], 1], cfg_col, ctx
)
tb_i8 = pto.TileBufType.get(_SHAPE, i8, vec, _SHAPE, cfg, ctx)

quant_sym = pto.QuantTypeAttr.get(pto.QuantType.INT8_SYM, ctx)
layout_dn = pto.LayoutAttr.get(pto.Layout.DN, ctx)

class NS:
pass
Expand All @@ -74,10 +83,13 @@ class NS:
ns.tv2_f32 = tv2_f32
ns.tv2_i8 = tv2_i8
ns.ptv_f32 = ptv_f32
ns.ptv_scale = ptv_scale
ns.ptv_i8 = ptv_i8
ns.tb_f32 = tb_f32
ns.tb_scale = tb_scale
ns.tb_i8 = tb_i8
ns.quant_sym = quant_sym
ns.layout_dn = layout_dn
return ns


Expand All @@ -91,7 +103,7 @@ def build():

# ------------------------------------------------------------------
# @tquant_sym_kernel(src_ptr: !pto.ptr<f32>,
# fp_ptr: !pto.ptr<f32>,
# scale_ptr: !pto.ptr<f32>,
# dst_ptr: !pto.ptr<i8>)
# ------------------------------------------------------------------
fn_sym_ty = func.FunctionType.get([t.ptr_f32, t.ptr_f32, t.ptr_i8], [])
Expand All @@ -109,14 +121,18 @@ def build():
c1 = arith.ConstantOp(idx, 1).result
c32 = arith.ConstantOp(idx, 32).result

src_ptr, fp_ptr, dst_ptr = entry_sym.arguments
src_ptr, scale_ptr, dst_ptr = entry_sym.arguments

# Make tensor views over the flat global-memory pointers.
tv_src = pto.MakeTensorViewOp(
t.tv2_f32, src_ptr, [c32, c32], [c32, c1]
).result
tv_fp = pto.MakeTensorViewOp(
t.tv2_f32, fp_ptr, [c32, c32], [c32, c1]
tv_scale = pto.MakeTensorViewOp(
t.tv2_f32,
scale_ptr,
[c32, c1],
[c1, c1],
layout=t.layout_dn,
).result
tv_dst = pto.MakeTensorViewOp(
t.tv2_i8, dst_ptr, [c32, c32], [c32, c1]
Expand All @@ -126,24 +142,24 @@ def build():
sv_src = pto.PartitionViewOp(
t.ptv_f32, tv_src, offsets=[c0, c0], sizes=[c32, c32]
).result
sv_fp = pto.PartitionViewOp(
t.ptv_f32, tv_fp, offsets=[c0, c0], sizes=[c32, c32]
sv_scale = pto.PartitionViewOp(
t.ptv_scale, tv_scale, offsets=[c0, c0], sizes=[c32, c1]
).result
sv_dst = pto.PartitionViewOp(
t.ptv_i8, tv_dst, offsets=[c0, c0], sizes=[c32, c32]
).result

# Allocate on-chip tile buffers.
tb_src = pto.AllocTileOp(t.tb_f32).result
tb_fp = pto.AllocTileOp(t.tb_f32).result
tb_scale = pto.AllocTileOp(t.tb_scale).result
tb_dst = pto.AllocTileOp(t.tb_i8).result

# Load src and fp tiles from global memory.
# Load src and per-row scale tiles from global memory.
pto.TLoadOp(None, sv_src, tb_src)
pto.TLoadOp(None, sv_fp, tb_fp)
pto.TLoadOp(None, sv_scale, tb_scale)

# INT8_SYM quantization (no offset operand).
pto.TQuantOp(tb_src, tb_fp, tb_dst, quant_type=t.quant_sym)
pto.TQuantOp(tb_src, tb_scale, tb_dst, quant_type=t.quant_sym)

# Store result back to global memory.
pto.TStoreOp(None, tb_dst, sv_dst)
Expand Down
Loading
Loading