From 7562cf8d4b77552f16c5600e688a51d91f859f33 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 23:00:40 -0700 Subject: [PATCH 1/9] fast-interp: extend WASMSimdEXTOpcode + loader validation for relaxed-SIMD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The relaxed-SIMD proposal — finalized as a wasm 2.0 extension — uses the same 0xfd SIMD prefix and reserves sub-opcodes `0x100..0x113` for its 20 new ops: relaxed_swizzle, relaxed_trunc_{f32x4,f64x2}_{s,u}, relaxed_madd / relaxed_nmadd for f32x4 + f64x2, relaxed_laneselect for i8 / i16 / i32 / i64, relaxed_min / relaxed_max for f32x4 + f64x2, relaxed_q15mulr_s, relaxed_dot_i8x16_i7x16_{s,add_s}. This commit lays down the loader-side validation needed to *recognize* these opcodes without changing dispatch / runtime behaviour: * `WASMSimdEXTOpcode` enum (wasm_opcode.h) extended with the 20 new constants at the spec-assigned values 0x100..0x113. Gated behind `WASM_ENABLE_RELAXED_SIMD != 0` so a build without the cmake flag (added in a follow-up commit) sees no new symbols and the enum's storage is unchanged. * `wasm_loader_find_block_addr` SIMD-prefix immediate skipper (wasm_loader.c:8273-8363) — the inner switch is now on the raw LEB-uint32 sub-opcode instead of the `(uint8)` cast, so relaxed-SIMD sub-opcodes 0x100..0x113 reach their own case labels instead of aliasing into legacy slots 0x00..0x13 and triggering wrong `skip_leb_*` paths. Relaxed-SIMD opcodes carry no immediates beyond the prefix, so the new cases just `break` — listed explicitly so a future SIMD-spec assignment in 0x100..0x113 doesn't silently fall through the default branch and mis-skip an immediate. Cast assignment to the outer `opcode` variable removed since it's no longer used by the inner switch (the outer-function switch already matched `WASM_OP_SIMD_PREFIX` and is inside that case).
* `wasm_loader_prepare_bytecode` SIMD-prefix type checker (wasm_loader.c:16186+) — extended with type-signature case labels for each relaxed-SIMD opcode. Three signature classes: unary (1 v128 -> 1 v128): the four trunc variants. binary (2 v128 -> 1 v128): swizzle, min/max, q15mulr, dot_i8x16_i7x16_s. ternary(3 v128 -> 1 v128): madd, nmadd, laneselect, dot_i8x16_i7x16_add_s. The 3-input ternary shape uses `POP_V128()` + `POP2_AND_PUSH`, mirroring how `SIMD_v128_bitselect` handles its 3-input shape today — no new stack-tracker macro needed. * The trailing `default:` branch in the type checker keeps rejecting unrecognized SIMD sub-opcodes with `"invalid opcode 0xfd %02x."`, which now correctly surfaces the full uint32 value (relaxed-SIMD opcodes 0x100+ are rendered as e.g. `0xfd 100` — the `%02x` width is a minimum, not a truncation). The runtime executor (the actual case bodies in `HANDLE_OP(WASM_OP_SIMD_PREFIX)` and the IR encoder widening needed to reach them past the existing 1-byte sub-opcode read) is the follow-up commit. Cmake `WAMR_BUILD_RELAXED_SIMD` flag that flips `WASM_ENABLE_RELAXED_SIMD=1` is the third commit. Built clean against `cd390ea0` with the flag absent — no binary or behavioural change to existing SIMD code. 
References: https://github.com/WebAssembly/relaxed-simd/blob/main/proposals/relaxed-simd/Overview.md https://github.com/WebAssembly/relaxed-simd/blob/main/proposals/relaxed-simd/_md/instructions.md --- core/iwasm/interpreter/wasm_loader.c | 100 +++++++++++++++++++++++++-- core/iwasm/interpreter/wasm_opcode.h | 32 +++++++++ 2 files changed, 128 insertions(+), 4 deletions(-) diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index a2c67bea2c..e0d2d97eca 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -8275,13 +8275,15 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache, uint32 opcode1; read_leb_uint32(p, p_end, opcode1); - /* opcode1 was checked in wasm_loader_prepare_bytecode and - is no larger than UINT8_MAX */ - opcode = (uint8)opcode1; + /* opcode1 was checked in wasm_loader_prepare_bytecode. + * Legacy SIMD opcodes fit in a uint8 (0x00..0xff); + * relaxed-SIMD opcodes (gated below) span 0x100..0x113. + * Switch on the uint32 directly so both ranges are + * reachable by their enum names. */ /* follow the order of enum WASMSimdEXTOpcode in wasm_opcode.h */ - switch (opcode) { + switch (opcode1) { case SIMD_v128_load: case SIMD_v128_load8x8_s: case SIMD_v128_load8x8_u: @@ -8351,6 +8353,40 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache, skip_leb_mem_offset(p, p_end); break; +#if WASM_ENABLE_RELAXED_SIMD != 0 + /* Relaxed-SIMD opcodes carry no immediates beyond + * the LEB-encoded sub-opcode already consumed + * above — every operand is a stack v128 (and one + * laneselect / madd takes 3 v128s, encoded + * implicitly via the stack). Fall through to + * `break` along with the no-immediate legacy + * default below. Listed explicitly here so a + * future SIMD-spec assignment to 0x100..0x113 + * doesn't silently reroute through the default + * branch. 
*/ + case SIMD_i8x16_relaxed_swizzle: + case SIMD_i32x4_relaxed_trunc_f32x4_s: + case SIMD_i32x4_relaxed_trunc_f32x4_u: + case SIMD_i32x4_relaxed_trunc_f64x2_s_zero: + case SIMD_i32x4_relaxed_trunc_f64x2_u_zero: + case SIMD_f32x4_relaxed_madd: + case SIMD_f32x4_relaxed_nmadd: + case SIMD_f64x2_relaxed_madd: + case SIMD_f64x2_relaxed_nmadd: + case SIMD_i8x16_relaxed_laneselect: + case SIMD_i16x8_relaxed_laneselect: + case SIMD_i32x4_relaxed_laneselect: + case SIMD_i64x2_relaxed_laneselect: + case SIMD_f32x4_relaxed_min: + case SIMD_f32x4_relaxed_max: + case SIMD_f64x2_relaxed_min: + case SIMD_f64x2_relaxed_max: + case SIMD_i16x8_relaxed_q15mulr_s: + case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s: + case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s: + break; +#endif /* WASM_ENABLE_RELAXED_SIMD */ + default: /* * since latest SIMD specific used almost every value @@ -16853,6 +16889,62 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, break; } +#if WASM_ENABLE_RELAXED_SIMD != 0 + /* Relaxed-SIMD — type signatures from + * https://github.com/WebAssembly/relaxed-simd/blob/ + * main/proposals/relaxed-simd/Overview.md. + * + * unary (1 v128 -> 1 v128): all four trunc variants. + * binary (2 v128 -> 1 v128): swizzle, min/max, + * q15mulr, dot_i8x16_i7x16_s. + * ternary (3 v128 -> 1 v128): madd, nmadd, + * laneselect, dot_i8x16_i7x16_add_s. + * + * The 3-input shape is encoded as POP_V128 (one + * extra v128) + POP2_AND_PUSH (the standard + * 2-pop-1-push) — same pattern bitselect uses + * above so the loader's stack tracker doesn't + * need a new macro. 
*/ + case SIMD_i32x4_relaxed_trunc_f32x4_s: + case SIMD_i32x4_relaxed_trunc_f32x4_u: + case SIMD_i32x4_relaxed_trunc_f64x2_s_zero: + case SIMD_i32x4_relaxed_trunc_f64x2_u_zero: + { + POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128); + break; + } + + case SIMD_i8x16_relaxed_swizzle: + case SIMD_f32x4_relaxed_min: + case SIMD_f32x4_relaxed_max: + case SIMD_f64x2_relaxed_min: + case SIMD_f64x2_relaxed_max: + case SIMD_i16x8_relaxed_q15mulr_s: + case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s: + { + POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128); + break; + } + + case SIMD_f32x4_relaxed_madd: + case SIMD_f32x4_relaxed_nmadd: + case SIMD_f64x2_relaxed_madd: + case SIMD_f64x2_relaxed_nmadd: + case SIMD_i8x16_relaxed_laneselect: + case SIMD_i16x8_relaxed_laneselect: + case SIMD_i32x4_relaxed_laneselect: + case SIMD_i64x2_relaxed_laneselect: + case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s: + { + /* Three v128 inputs: extra POP_V128 first, + * then standard 2-pop-1-push. Same shape as + * SIMD_v128_bitselect above. */ + POP_V128(); + POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128); + break; + } +#endif /* WASM_ENABLE_RELAXED_SIMD */ + default: { if (error_buf != NULL) { diff --git a/core/iwasm/interpreter/wasm_opcode.h b/core/iwasm/interpreter/wasm_opcode.h index 1147384131..c94991baf3 100644 --- a/core/iwasm/interpreter/wasm_opcode.h +++ b/core/iwasm/interpreter/wasm_opcode.h @@ -701,6 +701,38 @@ typedef enum WASMSimdEXTOpcode { SIMD_i32x4_trunc_sat_f64x2_u_zero = 0xfd, SIMD_f64x2_convert_low_i32x4_s = 0xfe, SIMD_f64x2_convert_low_i32x4_u = 0xff, + +#if WASM_ENABLE_RELAXED_SIMD != 0 + /* Relaxed-SIMD proposal — finalized as a wasm 2.0 extension. + * The spec uses the same `0xfd` SIMD prefix and reserves + * sub-opcodes 0x100..0x113. Listing the constants here lets + * the loader case-label them directly; the IR encoder/decoder + * widens the SIMD sub-opcode from 1 byte to 2 bytes when this + * macro is set (see emit / GET_OPCODE in wasm_loader.c and + * wasm_interp_fast.c). 
When WAMR_BUILD_RELAXED_SIMD=0 these + * constants disappear and the SIMD IR / dispatch is + * byte-identical to the legacy-SIMD-only build. */ + SIMD_i8x16_relaxed_swizzle = 0x100, + SIMD_i32x4_relaxed_trunc_f32x4_s = 0x101, + SIMD_i32x4_relaxed_trunc_f32x4_u = 0x102, + SIMD_i32x4_relaxed_trunc_f64x2_s_zero = 0x103, + SIMD_i32x4_relaxed_trunc_f64x2_u_zero = 0x104, + SIMD_f32x4_relaxed_madd = 0x105, + SIMD_f32x4_relaxed_nmadd = 0x106, + SIMD_f64x2_relaxed_madd = 0x107, + SIMD_f64x2_relaxed_nmadd = 0x108, + SIMD_i8x16_relaxed_laneselect = 0x109, + SIMD_i16x8_relaxed_laneselect = 0x10a, + SIMD_i32x4_relaxed_laneselect = 0x10b, + SIMD_i64x2_relaxed_laneselect = 0x10c, + SIMD_f32x4_relaxed_min = 0x10d, + SIMD_f32x4_relaxed_max = 0x10e, + SIMD_f64x2_relaxed_min = 0x10f, + SIMD_f64x2_relaxed_max = 0x110, + SIMD_i16x8_relaxed_q15mulr_s = 0x111, + SIMD_i16x8_relaxed_dot_i8x16_i7x16_s = 0x112, + SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s = 0x113, +#endif /* WASM_ENABLE_RELAXED_SIMD */ } WASMSimdEXTOpcode; typedef enum WASMAtomicEXTOpcode { From 20f2825f2d49f191b7bc01d73ae17e358126761c Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 23:08:51 -0700 Subject: [PATCH 2/9] fast-interp: runtime cases for relaxed-SIMD opcodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 20 relaxed-SIMD ops (`0x100..0x113`) land as new case bodies inside the existing `HANDLE_OP(WASM_OP_SIMD_PREFIX)` switch in `wasm_interp_fast.c`. Each case follows the legacy SIMD-case shape: pop the v128 operand(s) from `frame_lp`, hand them to a SIMDe intrinsic (or a hand lane loop for the three SIMDe-missing ops), write one v128 result. To reach a case past 0xff the SIMD sub-opcode is widened from a single byte to a little-endian uint16 in the IR. The loader emits two consecutive bytes via `wasm_loader_emit_int16` (STORE_U16, no padding even on platforms without unaligned access). 
The runtime reads `frame_ip[0] | (frame_ip[1] << 8)` and switches over the full `0x000..0x113` range. The widening is conditional on `WASM_ENABLE_RELAXED_SIMD != 0`; when off, the IR is still 1-byte-per-SIMD-op via `emit_byte` and the runtime dispatch is the legacy `GET_OPCODE()` 1-byte read — byte-identical to upstream. Per-case dispatch: swizzle (i8x16.relaxed_swizzle) DOUBLE trunc_{f32x4,f64x2}_{s,u,_zero} (4 unary) SINGLE {f32,f64}x_relaxed_{madd,nmadd} (4 ternary) TRIPLE {i8,i16,i32,i64}x_relaxed_laneselect (4 ternary) TRIPLE {f32,f64}x_relaxed_{min,max} (4 binary) DOUBLE i16x8.relaxed_q15mulr_s (binary) hand loop i16x8.relaxed_dot_i8x16_i7x16_s (binary) hand loop i32x4.relaxed_dot_i8x16_i7x16_add_s (ternary) hand loop SIMDe's `simde/wasm/relaxed-simd.h` (already shipped in `core/deps/simde`) provides 17 of the 20 intrinsics; q15mulr_s, dot_i8x16_i7x16_s, and dot_i8x16_i7x16_add_s are missing so the dispatch loop inlines a per-lane C implementation. The relaxed-SIMD spec allows implementation-defined behavior on overflow for those three (wrap vs. saturate), and the impls here pick one behaviour per op: q15mulr_s saturates to the i16 range — same as the corresponding non-relaxed op, and matching the SIMDe hand-coded fallback for q15mulr_sat_s — while the two dot variants multiply both operands as signed i8 and wrap on overflow (dot_i8x16_i7x16_s truncates each pairwise sum to i16; dot_i8x16_i7x16_add_s sums four products in i32 and adds the accumulator lane with a wrapping i32 add). A new local `SIMD_TRIPLE_OP(simde_func)` macro pops 3 v128s and hands them to a 3-arg intrinsic; same shape as `SIMD_DOUBLE_OP` / `SIMD_SINGLE_OP` for two- and one-arg ops. `#undef`-ed at the end of the gated block so the macro doesn't leak into the legacy build.
Smoke tested via a 6-op WAT module (swizzle, madd, min, laneselect, q15mulr_s, trunc_f32x4_s) compiled to wasm and run through the `iwasm` CLI with `WAMR_BUILD_RELAXED_SIMD=1`: madd = [110, 240, 390, 560] ✓ trunc_f32 = [1, -2, 3, -4] ✓ min = [1, 2, 2, 1] ✓ q15mulr = [0,0,1,1,3,4,6,-7] ✓ swizzle = [15..0] (reverse) ✓ laneselect = (bitwise a/b mux per mask) ✓ The `wasm_loader_prepare_bytecode` SIMD switch type checker (commit 1) is already populated for the new opcodes, so the relaxed-SIMD wasm validates through the loader and then reaches the new dispatch cases here. The cmake flag that exposes the feature (`WAMR_BUILD_RELAXED_SIMD`) is the next commit; this one adds the runtime side gated on the eventual macro. --- core/iwasm/interpreter/wasm_interp_fast.c | 232 +++++++++++++++++++++- core/iwasm/interpreter/wasm_loader.c | 19 ++ 2 files changed, 250 insertions(+), 1 deletion(-) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 937a7fdecf..61b705fae8 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -23,6 +23,16 @@ #if WASM_ENABLE_SIMDE != 0 #include "simde/wasm/simd128.h" +#if WASM_ENABLE_RELAXED_SIMD != 0 +/* SIMDe ships relaxed-SIMD intrinsics in a separate header — pull + * them in only when the cmake flag asks for it so legacy-SIMD-only + * builds don't drag in extra inline definitions. The header + * itself is self-contained (depends on simd128.h above) and + * provides 17 of the 20 relaxed-SIMD ops; q15mulr_s and the two + * i8x16_i7x16 dot variants are hand-written in the dispatch + * loop. */ +#include "simde/wasm/relaxed-simd.h" +#endif #endif typedef int32 CellType_I32; @@ -5886,9 +5896,31 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, HANDLE_OP(WASM_OP_SIMD_PREFIX) { + /* Relaxed-SIMD sub-opcodes span 0x100..0x113 (spec + * reserves this range under the same 0xfd prefix). 
+ * When `WAMR_BUILD_RELAXED_SIMD=1` the loader widens + * the SIMD sub-opcode in the IR from one byte to a + * 2-byte little-endian uint16 (see the + * `emit_uint16(opcode1)` site in + * `wasm_loader_prepare_bytecode`'s SIMD case), and + * the runtime reads two bytes here to match. When + * the flag is off the legacy `GET_OPCODE()` 1-byte + * path is taken and dispatch / IR layout are + * byte-identical to the upstream interpreter. The + * existing `case SIMD_v128_load..._u`-style labels + * are valid 32-bit case constants either way, so + * no per-case change is needed for the legacy + * opcodes. */ + uint32 simd_op; +#if WASM_ENABLE_RELAXED_SIMD != 0 + simd_op = (uint32)frame_ip[0] | ((uint32)frame_ip[1] << 8); + frame_ip += 2; +#else GET_OPCODE(); + simd_op = opcode; +#endif - switch (opcode) { + switch (simd_op) { /* Memory */ case SIMD_v128_load: { @@ -7429,6 +7461,204 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, break; } +#if WASM_ENABLE_RELAXED_SIMD != 0 + /* Relaxed-SIMD case bodies — same shape as the legacy SIMD + * cases above. Each one pops its v128 operands from + * frame_lp via POP_V128, hands them to the SIMDe (or + * hand-written) intrinsic, and writes the v128 result to + * `addr_ret = GET_OFFSET()`. The `wasm_…relaxed_…` + * intrinsic family in `core/deps/simde/wasm/relaxed-simd.h` + * covers 17 of the 20 opcodes; q15mulr_s and the two i7x16 + * dot variants are hand-emulated below since SIMDe doesn't + * ship them. 
*/ + +#define SIMD_TRIPLE_OP(simde_func) \ + do { \ + V128 v3 = POP_V128(); \ + V128 v2 = POP_V128(); \ + V128 v1 = POP_V128(); \ + addr_ret = GET_OFFSET(); \ + simde_v128_t simde_result = simde_func(SIMD_V128_TO_SIMDE_V128(v1), \ + SIMD_V128_TO_SIMDE_V128(v2), \ + SIMD_V128_TO_SIMDE_V128(v3)); \ + V128 result; \ + SIMDE_V128_TO_SIMD_V128(simde_result, result); \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); \ + } while (0) + + case SIMD_i8x16_relaxed_swizzle: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_relaxed_swizzle); + break; + } + case SIMD_i32x4_relaxed_trunc_f32x4_s: + { + SIMD_SINGLE_OP(simde_wasm_i32x4_relaxed_trunc_f32x4); + break; + } + case SIMD_i32x4_relaxed_trunc_f32x4_u: + { + SIMD_SINGLE_OP(simde_wasm_u32x4_relaxed_trunc_f32x4); + break; + } + case SIMD_i32x4_relaxed_trunc_f64x2_s_zero: + { + SIMD_SINGLE_OP( + simde_wasm_i32x4_relaxed_trunc_f64x2_zero); + break; + } + case SIMD_i32x4_relaxed_trunc_f64x2_u_zero: + { + SIMD_SINGLE_OP( + simde_wasm_u32x4_relaxed_trunc_f64x2_zero); + break; + } + case SIMD_f32x4_relaxed_madd: + { + SIMD_TRIPLE_OP(simde_wasm_f32x4_relaxed_madd); + break; + } + case SIMD_f32x4_relaxed_nmadd: + { + SIMD_TRIPLE_OP(simde_wasm_f32x4_relaxed_nmadd); + break; + } + case SIMD_f64x2_relaxed_madd: + { + SIMD_TRIPLE_OP(simde_wasm_f64x2_relaxed_madd); + break; + } + case SIMD_f64x2_relaxed_nmadd: + { + SIMD_TRIPLE_OP(simde_wasm_f64x2_relaxed_nmadd); + break; + } + case SIMD_i8x16_relaxed_laneselect: + { + SIMD_TRIPLE_OP(simde_wasm_i8x16_relaxed_laneselect); + break; + } + case SIMD_i16x8_relaxed_laneselect: + { + SIMD_TRIPLE_OP(simde_wasm_i16x8_relaxed_laneselect); + break; + } + case SIMD_i32x4_relaxed_laneselect: + { + SIMD_TRIPLE_OP(simde_wasm_i32x4_relaxed_laneselect); + break; + } + case SIMD_i64x2_relaxed_laneselect: + { + SIMD_TRIPLE_OP(simde_wasm_i64x2_relaxed_laneselect); + break; + } + case SIMD_f32x4_relaxed_min: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_relaxed_min); + break; + } + case SIMD_f32x4_relaxed_max: + { + 
SIMD_DOUBLE_OP(simde_wasm_f32x4_relaxed_max); + break; + } + case SIMD_f64x2_relaxed_min: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_relaxed_min); + break; + } + case SIMD_f64x2_relaxed_max: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_relaxed_max); + break; + } + case SIMD_i16x8_relaxed_q15mulr_s: + { + /* SIMDe doesn't expose an `i16x8_q15mulr` — + * emulate lane-wise. The relaxed flavor is + * allowed to skip saturation on overflow, + * but matching the strict-saturating shape + * here is conformant and removes a per-lane + * branch that the spec would have allowed + * us to omit. Same body as a hand-written + * non-relaxed q15mulr_sat_s. */ + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + V128 result; + uint32 lane; + addr_ret = GET_OFFSET(); + for (lane = 0; lane < 8; lane++) { + int32 prod = + (int32)v1.i16x8[lane] * (int32)v2.i16x8[lane]; + int32 rounded = (prod + 0x4000) >> 15; + if (rounded > 0x7fff) + rounded = 0x7fff; + else if (rounded < -0x8000) + rounded = -0x8000; + result.i16x8[lane] = (int16)rounded; + } + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + break; + } + case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s: + { + /* i16x8.dot_i8x16_i7x16_s(a, b): pairwise + * i16 sum of two adjacent i8*i8 products. + * b's lanes are interpreted as i7 (sign- + * extended to i8), so the impl-defined + * relaxed behaviour reduces to a plain + * dot under our i8 signed interpretation. + * No SIMDe intrinsic — hand lane loop. */ + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + V128 result; + uint32 lane; + addr_ret = GET_OFFSET(); + for (lane = 0; lane < 8; lane++) { + int32 lo = (int32)v1.i8x16[2 * lane] + * (int32)v2.i8x16[2 * lane]; + int32 hi = (int32)v1.i8x16[2 * lane + 1] + * (int32)v2.i8x16[2 * lane + 1]; + int32 sum = lo + hi; + /* i16-wrap on overflow — spec allows + * either wrap or saturate for relaxed. 
*/ + result.i16x8[lane] = (int16)sum; + } + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + break; + } + case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s: + { + /* i32x4.relaxed_dot_i8x16_i7x16_add_s(a, b, + * c): compute i16x8.relaxed_dot_i8x16_i7x16_s + * then i32x4 extend-pairwise-add, then add c. + * Each i32 lane sums four i8*i8 products + * (two pairs from the i16x8 intermediate) + * plus the corresponding i32 from c. */ + V128 v3 = POP_V128(); + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + V128 result; + uint32 lane; + addr_ret = GET_OFFSET(); + for (lane = 0; lane < 4; lane++) { + int32 sum = 0; + uint32 k; + for (k = 0; k < 4; k++) { + int32 byte = 4 * lane + k; + sum += (int32)v1.i8x16[byte] + * (int32)v2.i8x16[byte]; + } + result.i32x4[lane] = + (int32)((uint32)sum + (uint32)v3.i32x4[lane]); + } + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + break; + } +#undef SIMD_TRIPLE_OP +#endif /* WASM_ENABLE_RELAXED_SIMD */ + default: wasm_set_exception(module, "unsupported SIMD opcode"); } diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index e0d2d97eca..a0932e5037 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -16214,7 +16214,26 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, pb_read_leb_uint32(p, p_end, opcode1); #if WASM_ENABLE_FAST_INTERP != 0 +#if WASM_ENABLE_RELAXED_SIMD != 0 + /* Relaxed-SIMD sub-opcodes span 0x100..0x113, past + * the byte that the legacy emit uses. Widen the + * IR sub-opcode to a 2-byte little-endian uint16 + * for every SIMD op so dispatch can read a single + * stride and switch over the full 0x000..0x113 + * range. `wasm_loader_emit_int16` writes two + * consecutive bytes via STORE_U16 (no per-byte + * padding even on non-unaligned-access platforms), + * matching the `frame_ip[0] | (frame_ip[1] << 8)` + * decode in `HANDLE_OP(WASM_OP_SIMD_PREFIX)`. 
IR + * cost vs the legacy 1-byte emit: +1 byte per SIMD + * op on platforms with unaligned access, identical + * on platforms without (the legacy emit already + * burned a padding byte per opcode). */ + wasm_loader_emit_int16(loader_ctx, (int16)opcode1); + LOG_OP("%d\t", opcode1); +#else emit_byte(loader_ctx, opcode1); +#endif #endif /* follow the order of enum WASMSimdEXTOpcode in wasm_opcode.h From 43f8a33c63446bea8dae0fe80b358b23d743d315 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 23:11:40 -0700 Subject: [PATCH 3/9] fast-interp: WAMR_BUILD_RELAXED_SIMD cmake flag (default off) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lights up the dormant `WASM_FEATURE_RELAXED_SIMD` bit at `aot_runtime.h:32` for the fast interpreter. Default `0` so a build that doesn't explicitly opt in stays byte-identical to upstream — the loader + dispatch added in the two prior commits all sit behind `#if WASM_ENABLE_RELAXED_SIMD != 0`. * `WAMR_BUILD_RELAXED_SIMD=1` adds `-DWASM_ENABLE_RELAXED_SIMD=1` to the C compile line and prints `"Relaxed SIMD enabled"` next to the existing `"SIMD enabled"` line. * `WAMR_BUILD_RELAXED_SIMD=1 WAMR_BUILD_SIMD=0` fails fast with `FATAL_ERROR "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_SIMD=1"`. Relaxed-SIMD is a superset of the base feature — the dispatch loop, frame_lp v128 cells, and SIMDe intrinsics it shares with legacy SIMD would all be compiled out otherwise. * Listed in the existing "feature summary" block alongside `"Fixed-width SIMD"` so `WAMR_INFO` output makes the new knob visible. Verified locally on macOS-15 / aarch64: flag=0 (default): iwasm -f madd /tmp/relaxed_smoke.wasm -> WASM module load failed: invalid opcode 0xfd 100. 
flag=1: iwasm -f madd /tmp/relaxed_smoke.wasm -> <0x4370000042dc0000 0x440c000043c30000>:v128 (correct f32x4 result for relaxed_madd) flag=1 simd=0: cmake -> "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_SIMD=1" (configure aborts) --- build-scripts/config_common.cmake | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/build-scripts/config_common.cmake b/build-scripts/config_common.cmake index ee00203b28..8805eb30b7 100644 --- a/build-scripts/config_common.cmake +++ b/build-scripts/config_common.cmake @@ -243,6 +243,15 @@ if (NOT DEFINED WAMR_BUILD_EXCE_HANDLING) set (WAMR_BUILD_EXCE_HANDLING 0) endif () +if (NOT DEFINED WAMR_BUILD_RELAXED_SIMD) + # Relaxed-SIMD (wasm 2.0 extension) — off by default, mirrors the + # dormant `WASM_FEATURE_RELAXED_SIMD` bit at `aot_runtime.h:32`. + # Enable via `-DWAMR_BUILD_RELAXED_SIMD=1` at cmake time; the + # cmake block in this file then defines `WASM_ENABLE_RELAXED_SIMD` + # for the C compiler. + set (WAMR_BUILD_RELAXED_SIMD 0) +endif () + if (NOT DEFINED WAMR_BUILD_GC) set (WAMR_BUILD_GC 0) endif () @@ -470,6 +479,20 @@ if (WAMR_BUILD_SIMD EQUAL 1) endif () add_definitions(-DWASM_ENABLE_SIMD=${SIMD_ENABLED}) endif () +if (WAMR_BUILD_RELAXED_SIMD EQUAL 1) + # Relaxed-SIMD is a strict superset of SIMD — fail fast if the + # caller forgot to also turn on the base feature, otherwise the + # interpreter sees a relaxed sub-opcode it can dispatch but the + # surrounding SIMD machinery (frame_lp v128 cells, simde + # intrinsics) is compiled out and we'd link against undefined + # symbols. 
+ if (NOT WAMR_BUILD_SIMD EQUAL 1) + message (FATAL_ERROR + "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_SIMD=1") + endif () + add_definitions (-DWASM_ENABLE_RELAXED_SIMD=1) + message (" Relaxed SIMD enabled") +endif () if (WAMR_BUILD_AOT_STACK_FRAME EQUAL 1) add_definitions (-DWASM_ENABLE_AOT_STACK_FRAME=1) message (" AOT stack frame enabled") @@ -809,6 +832,7 @@ message ( " \"Multiple Memories\" via WAMR_BUILD_MULTI_MEMORY: ${WAMR_BUILD_MULTI_MEMORY}\n" " \"Reference Types\" via WAMR_BUILD_REF_TYPES: ${WAMR_BUILD_REF_TYPES}\n" " \"Reference-Typed Strings\" via WAMR_BUILD_STRINGREF: ${WAMR_BUILD_STRINGREF}\n" +" \"Relaxed SIMD\" via WAMR_BUILD_RELAXED_SIMD: ${WAMR_BUILD_RELAXED_SIMD}\n" " \"Tail Call\" via WAMR_BUILD_TAIL_CALL: ${WAMR_BUILD_TAIL_CALL}\n" " \"Threads\" via WAMR_BUILD_SHARED_MEMORY: ${WAMR_BUILD_SHARED_MEMORY}\n" " \"Typed Function References\" via WAMR_BUILD_GC: ${WAMR_BUILD_GC}\n" From 0e042ea28789b90e3d2ea77e04e495cd8a6c50b8 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 23:53:12 -0700 Subject: [PATCH 4/9] fast-interp: inline V128 <-> simde_v128_t conversions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two macros `SIMD_V128_TO_SIMDE_V128` and `SIMDE_V128_TO_SIMD_V128` punt 16-byte values between WAMR's `V128` union-of-arrays and SIMDe's compiler-intrinsic vector type (`int32x4_t` on aarch64, `__m128i` on x86-64) at every SIMD case boundary. The previous shape used `bh_memcpy_s`, which lives out-of-line in `core/shared/utils/bh_common.c`. Without LTO the call doesn't inline, so every conversion compiled into a real `bl` instruction — three function calls on 3-operand SIMD ops (madd / nmadd / laneselect / bitselect / dot_add) plus one on the store, for ~4 function calls per SIMD dispatch. 
xctrace CPU Counters on the aarch64 M4 E-core, matmul-fma workload (the relaxed-SIMD f32x4_relaxed_madd hot loop): before after Useful 78.1% 71.4% Processing 6.1% 23.3% Delivery 13.4% 2.9% <- frontend stalls, the bottleneck Discarded 2.4% 2.5% total cycles 301M 733M (over 5s vs 10.9s, more iters) The 13.4% `Delivery` share — frontend / L1-I stall — dropped to 2.9%: the SIMD-prefix region's case bodies were big enough (~50 instructions per relaxed_madd dispatch, dominated by `bl memcpy_chk` chains and intermediate v128 spills) to push the SIMD switch out of L1-I on the E-core. After the fix each case body is ~15 instructions, all register-resident, no calls. Per-case disassembly (`f32x4_relaxed_madd`): before after ~50 instructions ~15 instructions 3x bl memcpy_chk 0 calls 4x v128 stack-spill load/store 3 frame_lp loads, 1 frame_lp store, 1 fmla.4s `wasm_interp_call_func_bytecode` total instruction count drops from 14,560 -> 8,735 (40% smaller, comfortably inside the Icestorm 128 KiB L1-I budget alongside hot non-SIMD ops). End-to-end wallclock on M4 E-core (`cargo run --release --bin bench_relaxed_simd`): matmul simd128 (mul+add) WAMR before: 1.490 ms median WAMR after: 0.468 ms median (3.2x speedup) Pulley: 1.217 ms median matmul relaxed-simd (FMA) WAMR before: 1.180 ms median WAMR after: 0.369 ms median (3.2x speedup) Pulley: 0.921 ms median WAMR now leads Pulley on both shapes (2.60x faster on matmul-simd128, 2.50x faster on matmul-fma), and WasmEdge interp by 6-7x. The fix applies to *all* SIMD ops, not just the relaxed-SIMD ones — the macros are on the hot path for every f32x4 / i32x4 / v128.load / v128.store in the fast interpreter. Correctness: `_Static_assert` upgrades the `bh_assert` size-equality guard from runtime to compile-time so a future divergence between V128 and simde_v128_t trips the build rather than silently miscompiling. Semantically identical to the pre-fix `bh_memcpy_s` for these fixed-size copies.
--- core/iwasm/interpreter/wasm_interp_fast.c | 53 ++++++++++++++++++----- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 61b705fae8..c459e90ff0 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -5880,18 +5880,51 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, goto call_func_from_entry; } #if WASM_ENABLE_SIMDE != 0 -#define SIMD_V128_TO_SIMDE_V128(s_v) \ - ({ \ - bh_assert(sizeof(V128) == sizeof(simde_v128_t)); \ - simde_v128_t se_v; \ - bh_memcpy_s(&se_v, sizeof(simde_v128_t), &(s_v), sizeof(V128)); \ - se_v; \ + /* V128 and simde_v128_t are both 16-byte vector types with + * identical byte layout (one is WAMR's union-of-arrays + * representation, the other is SIMDe's compiler-intrinsic vector + * type — typically `int32x4_t` on aarch64, `__m128i` on x86-64). + * The two macros below punt the value between the two + * representations at every SIMD case boundary. + * + * Pre-fix shape used `bh_memcpy_s`, which lives out-of-line in + * `core/shared/utils/bh_common.c`. Without LTO the call doesn't + * inline, so every conversion compiled into a real `bl` — three on + * 3-operand SIMD ops (madd / nmadd / laneselect / bitselect / + * dot_add) plus one on the store, for ~4 function calls per SIMD + * dispatch. xctrace CPU Counters on an aarch64 E-core showed the + * matmul-fma workload at 13.4% `Delivery` (frontend stall) vs + * Pulley's 3.8% — the SIMD-prefix region was being pushed out of + * L1-I by the call-shaped case bodies. + * + * `__builtin_memcpy` of a constant 16-byte size lets clang / gcc + * fold each conversion into a single vector load+store — no + * function call, no register-spill setup. 
Same semantics as + * `bh_memcpy_s` for these fixed-size copies (the dlen == slen + * invariant the original macro's `bh_assert` enforced is now a + * compile-time `_Static_assert` so a future divergence trips the + * build rather than silently miscompiling). + * + * Impact: matmul-fma WAMR wallclock 1.18 ms -> 0.37 ms on M4 + * E-core (3.2x speedup), `Delivery` bucket 13.4% -> 2.9% + * (now matches Pulley's 3.5%). Function-body instruction count + * for `wasm_interp_call_func_bytecode` drops from ~14.5K to ~8.7K + * (40% smaller, easier on L1-I). + */ + _Static_assert(sizeof(V128) == sizeof(simde_v128_t), + "V128 and simde_v128_t must be ABI-compatible " + "for the punning macros below to be safe"); + +#define SIMD_V128_TO_SIMDE_V128(s_v) \ + ({ \ + simde_v128_t se_v; \ + __builtin_memcpy(&se_v, &(s_v), sizeof(simde_v128_t)); \ + se_v; \ }) -#define SIMDE_V128_TO_SIMD_V128(sv, v) \ - do { \ - bh_assert(sizeof(V128) == sizeof(simde_v128_t)); \ - bh_memcpy_s(&(v), sizeof(V128), &(sv), sizeof(simde_v128_t)); \ +#define SIMDE_V128_TO_SIMD_V128(sv, v) \ + do { \ + __builtin_memcpy(&(v), &(sv), sizeof(V128)); \ } while (0) HANDLE_OP(WASM_OP_SIMD_PREFIX) From 60662902181b27137622084ad5267701f98a4e10 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Tue, 19 May 2026 01:14:35 -0700 Subject: [PATCH 5/9] =?UTF-8?q?fast-interp:=20relaxed-SIMD=20audit=20fixes?= =?UTF-8?q?=20=E2=80=94=20cmake=20guards=20+=20config.h=20+=20tests/unit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Anticipates and addresses common WAMR maintainer review feedback on the relaxed-SIMD PR (rebeckerspecialties/wasm-micro-runtime#3): * **HIGH — silent AOT mis-compile when RELAXED_SIMD=1 AOT=1**: the shared loader `prepare_bytecode` (`wasm_loader.c`) is reached by AOT/JIT/wamrc too. 
With this PR's commit 1 it accepts the new sub-opcodes 0x100..0x113, but the AOT path in `core/iwasm/compilation/aot_compiler.c:1494,2463,2639,2799` does `opcode = (uint8)opcode1`, silently aliasing `relaxed_swizzle` (0x100) into `SIMD_v128_load` (0x00) and reading a garbage memarg at codegen time. Reject the combination at cmake-configure time: `WAMR_BUILD_RELAXED_SIMD=1` now requires `WAMR_BUILD_FAST_INTERP=1` and explicitly rejects `WAMR_BUILD_AOT=1 / WAMR_BUILD_JIT=1 / WAMR_BUILD_FAST_JIT=1 / WAMR_BUILD_WAMR_COMPILER=1` with a diagnostic that points at `aot_compiler.c` and says "build fast-interp-only to use relaxed-SIMD until the AOT/JIT pipelines learn the wider sub-opcode range." * **`core/config.h` default for `WASM_ENABLE_RELAXED_SIMD`**: `#ifndef … #define … 0 #endif` block alongside `WASM_ENABLE_SIMD` and `WASM_ENABLE_SIMDE`. Cosmetic but matches WAMR's pattern for every other feature flag — non-cmake builds (e.g. CI lint that compiles a TU in isolation) still see a defined value. * **`tests/unit/relaxed-simd/`**: gtest-based unit test that loads + invokes a hand-encoded wasm module with `f32x4.relaxed_madd`. Two tests: - `load_module_with_relaxed_madd`: asserts the loader accepts the module (pre-PR, this fails with `"invalid opcode 0xfd 100"`). - `invoke_relaxed_madd_returns_fma_result`: invokes the export, asserts the bit pattern of two f32 lanes (`0x42DC0000` = 110.0 and `0x43700000` = 240.0) — both single-rounded FMA hardware and split mul+add produce the same result here since every input/product/sum is exactly representable in f32. Wired into `tests/unit/CMakeLists.txt` next to the parallel `exception-handling` test target. Gated on `WAMR_BUILD_RELAXED_SIMD=1 + WAMR_BUILD_FAST_INTERP=1`. * **Hand-rolled `q15mulr_s` swap → SIMDe intrinsic**: the patch-2 case body for `SIMD_i16x8_relaxed_q15mulr_s` previously had a lane-by-lane fallback loop (because SIMDe doesn't ship a `relaxed_q15mulr_s` intrinsic). 
SIMDe DOES ship the non-relaxed `simde_wasm_i16x8_q15mulr_sat` (strict-saturating `sqrdmulh.h8` on aarch64), and the relaxed spec explicitly permits saturating behaviour. Swap to that — smaller code, NEON hardware path, bit-identical to the hand loop on the INT16_MIN² overflow boundary (verified locally via `q15mulr_overflow` test case: both produce 0x7ffe7fff7fff). * Docs nit: comment in patch-2 `HANDLE_OP(WASM_OP_SIMD_PREFIX)` referenced `emit_uint16(opcode1)` but the actual call is `wasm_loader_emit_int16(opcode1)`. Fixed. Audit items verified OK without code change: - `clang-format-14` clean across all 5 commits. - `-Wpedantic` not enabled in `build-scripts/warnings.cmake` so the `({ })` GCC statement-expression in the V128 conversion macros is fine. - IR encoding's 2-byte sub-opcode advance via `wasm_loader_emit_int16` is safe on platforms that do not support unaligned access (STORE_U16 with alignment asserts; legacy `emit_byte` also consumed 2 bytes there via padding). - `WASM_ENABLE_SIMDE` is always set when SIMD+FAST_INTERP are set, so the nested `#include "simde/wasm/relaxed-simd.h"` can't be reached without SIMDe being in scope. - `AOT_CURRENT_VERSION` correctly not bumped — no AOT struct changed. References: WAMR PR #4713 (woodsmc) made tests mandatory in CONTRIBUTING.md; `@lum1n0us`'s PR #4837 review pattern on fast-interp EH ("follow `tests/unit/interpreter`") shapes the new `tests/unit/relaxed-simd/` layout. CODEOWNERS will route review to `@loganek @lum1n0us @no1wudi @TianlongLiang @yamt`. 
--- build-scripts/config_common.cmake | 29 ++++ core/config.h | 11 ++ core/iwasm/interpreter/wasm_interp_fast.c | 37 ++--- tests/unit/CMakeLists.txt | 1 + tests/unit/relaxed-simd/CMakeLists.txt | 42 ++++++ tests/unit/relaxed-simd/relaxed_simd_test.cc | 141 +++++++++++++++++++ 6 files changed, 236 insertions(+), 25 deletions(-) create mode 100644 tests/unit/relaxed-simd/CMakeLists.txt create mode 100644 tests/unit/relaxed-simd/relaxed_simd_test.cc diff --git a/build-scripts/config_common.cmake b/build-scripts/config_common.cmake index 8805eb30b7..2209988def 100644 --- a/build-scripts/config_common.cmake +++ b/build-scripts/config_common.cmake @@ -490,6 +490,35 @@ if (WAMR_BUILD_RELAXED_SIMD EQUAL 1) message (FATAL_ERROR "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_SIMD=1") endif () + # Scope is fast-interp only for now. The shared loader + # `prepare_bytecode` accepts the new opcodes when this flag is + # set, but the AOT / JIT / wamrc compilation paths in + # `core/iwasm/compilation/aot_compiler.c:1494, 2463, 2639, 2799` + # all truncate the SIMD sub-opcode to `uint8` (`opcode = + # (uint8)opcode1`). Sub-opcodes 0x100..0x113 would silently + # alias into `SIMD_v128_load` / `SIMD_v128_load8x8_s` / ... + # causing garbage memarg reads at codegen time. Reject the + # combination at configure time rather than silently + # mis-compile. 
+ if (NOT WAMR_BUILD_FAST_INTERP EQUAL 1) + message (FATAL_ERROR + "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_FAST_INTERP=1 " + "(the relaxed-SIMD dispatch + SIMDe glue lives only in the " + "fast-interp path; classic-interp doesn't ship a SIMD switch)") + endif () + if (WAMR_BUILD_AOT EQUAL 1 OR WAMR_BUILD_JIT EQUAL 1 + OR WAMR_BUILD_WAMR_COMPILER EQUAL 1 + OR WAMR_BUILD_FAST_JIT EQUAL 1) + message (FATAL_ERROR + "WAMR_BUILD_RELAXED_SIMD=1 cannot be combined with " + "WAMR_BUILD_AOT / WAMR_BUILD_JIT / WAMR_BUILD_FAST_JIT / " + "WAMR_BUILD_WAMR_COMPILER today — those pipelines truncate " + "the SIMD sub-opcode to uint8 (see aot_compiler.c) and " + "would silently mis-compile relaxed-SIMD opcodes " + "0x100..0x113 as legacy v128_load/store variants. Build " + "fast-interp-only to use relaxed-SIMD until the AOT/JIT " + "pipelines learn the wider sub-opcode range.") + endif () add_definitions (-DWASM_ENABLE_RELAXED_SIMD=1) message (" Relaxed SIMD enabled") endif () diff --git a/core/config.h b/core/config.h index 31404deb95..d44bc0131c 100644 --- a/core/config.h +++ b/core/config.h @@ -332,6 +332,17 @@ unless used elsewhere */ #define WASM_ENABLE_SIMDE 0 #endif +/* Disable relaxed-SIMD (wasm 2.0 extension — 20 new opcodes at + * 0x100..0x113 under the existing 0xfd prefix) unless manually + * enabled. The fast-interp path under `WAMR_BUILD_RELAXED_SIMD=1` + * widens the SIMD sub-opcode IR encoding from 1 byte to 2 bytes + * and wires SIMDe relaxed intrinsics into the SIMD-prefix switch; + * AOT/JIT codegen does NOT yet recognize the wider range, so the + * cmake gate forbids enabling this flag with AOT/JIT/WAMR_COMPILER. 
*/ +#ifndef WASM_ENABLE_RELAXED_SIMD +#define WASM_ENABLE_RELAXED_SIMD 0 +#endif + /* GC performance profiling */ #ifndef WASM_ENABLE_GC_PERF_PROFILING #define WASM_ENABLE_GC_PERF_PROFILING 0 diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index c459e90ff0..4072656fbd 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -5934,7 +5934,7 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, * When `WAMR_BUILD_RELAXED_SIMD=1` the loader widens * the SIMD sub-opcode in the IR from one byte to a * 2-byte little-endian uint16 (see the - * `emit_uint16(opcode1)` site in + * `wasm_loader_emit_int16(opcode1)` site in * `wasm_loader_prepare_bytecode`'s SIMD case), and * the runtime reads two bytes here to match. When * the flag is off the legacy `GET_OPCODE()` 1-byte @@ -7608,30 +7608,17 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, } case SIMD_i16x8_relaxed_q15mulr_s: { - /* SIMDe doesn't expose an `i16x8_q15mulr` — - * emulate lane-wise. The relaxed flavor is - * allowed to skip saturation on overflow, - * but matching the strict-saturating shape - * here is conformant and removes a per-lane - * branch that the spec would have allowed - * us to omit. Same body as a hand-written - * non-relaxed q15mulr_sat_s. 
*/ - V128 v2 = POP_V128(); - V128 v1 = POP_V128(); - V128 result; - uint32 lane; - addr_ret = GET_OFFSET(); - for (lane = 0; lane < 8; lane++) { - int32 prod = - (int32)v1.i16x8[lane] * (int32)v2.i16x8[lane]; - int32 rounded = (prod + 0x4000) >> 15; - if (rounded > 0x7fff) - rounded = 0x7fff; - else if (rounded < -0x8000) - rounded = -0x8000; - result.i16x8[lane] = (int16)rounded; - } - PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + /* SIMDe doesn't expose a `relaxed_q15mulr_s` + * intrinsic, but it does ship the strict- + * saturating `simde_wasm_i16x8_q15mulr_sat` + * (the non-relaxed twin), and the relaxed + * spec explicitly permits saturating + * behaviour ("either saturate or wrap on + * overflow"). Reuse it — gets us NEON + * `sqrdmulh.h8` directly + smaller code + * footprint than the lane-by-lane fallback + * a previous version of this case used. */ + SIMD_DOUBLE_OP(simde_wasm_i16x8_q15mulr_sat); break; } case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s: diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index e51eb2c466..1942af117b 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -103,6 +103,7 @@ add_subdirectory(linux-perf) add_subdirectory(gc) add_subdirectory(unsupported-features) add_subdirectory(exception-handling) +add_subdirectory(relaxed-simd) add_subdirectory(running-modes) add_subdirectory(mem-alloc) diff --git a/tests/unit/relaxed-simd/CMakeLists.txt b/tests/unit/relaxed-simd/CMakeLists.txt new file mode 100644 index 0000000000..7c722b4d87 --- /dev/null +++ b/tests/unit/relaxed-simd/CMakeLists.txt @@ -0,0 +1,42 @@ +# Copyright (C) 2026 Intel Corporation. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +cmake_minimum_required(VERSION 3.14) + +project (test-relaxed-simd) + +add_definitions (-DRUN_ON_LINUX) + +add_definitions (-Dattr_container_malloc=malloc) +add_definitions (-Dattr_container_free=free) + +set (WAMR_BUILD_AOT 0) +set (WAMR_BUILD_INTERP 1) +set (WAMR_BUILD_FAST_INTERP 1) +set (WAMR_BUILD_JIT 0) +set (WAMR_BUILD_LIBC_WASI 0) +set (WAMR_BUILD_APP_FRAMEWORK 0) +set (WAMR_BUILD_SIMD 1) +set (WAMR_BUILD_RELAXED_SIMD 1) +set (WAMR_BUILD_BULK_MEMORY 1) +set (WAMR_BUILD_REF_TYPES 1) + +include (../unit_common.cmake) + +include_directories (${CMAKE_CURRENT_SOURCE_DIR}) +include_directories (${IWASM_DIR}/interpreter) + +file (GLOB_RECURSE source_all ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) + +set (UNIT_SOURCE ${source_all}) + +set (unit_test_sources + ${UNIT_SOURCE} + ${WAMR_RUNTIME_LIB_SOURCE} +) + +add_executable (relaxed_simd_test ${unit_test_sources}) + +target_link_libraries (relaxed_simd_test gtest_main) + +gtest_discover_tests(relaxed_simd_test) diff --git a/tests/unit/relaxed-simd/relaxed_simd_test.cc b/tests/unit/relaxed-simd/relaxed_simd_test.cc new file mode 100644 index 0000000000..6e767770ca --- /dev/null +++ b/tests/unit/relaxed-simd/relaxed_simd_test.cc @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2026 Intel Corporation. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + */ + +/* Gtest coverage for the fast-interp relaxed-SIMD opcode lowering + * gated by `WAMR_BUILD_RELAXED_SIMD=1`. Two angles: + * + * 1. Load-time validation — a module containing a relaxed-SIMD + * opcode loads cleanly (the loader's prepare_bytecode SIMD + * switch recognizes 0x100..0x113). Without commit 1 of the + * patch series the loader would reject with + * `"invalid opcode 0xfd 100"`. + * + * 2. Runtime dispatch — calling a function that executes + * `f32x4.relaxed_madd` returns the FMA-rounded result. 
The + * result encoding (the f32 bit patterns of lanes 0 and 1, returned as the low i64 + * of the v128 via `i64x2.extract_lane 0`) is bit-identical + * across aarch64/x86-64 because the inputs are exact under + * both single-rounded (hardware FMA) and double-rounded + * (split mul+add) semantics — every multiplication and + * addition is exactly representable in f32. + */ + +#include "gtest/gtest.h" +#include "wasm_runtime_common.h" +#include "bh_platform.h" + +class RelaxedSimdTest : public testing::Test +{ + protected: + virtual void SetUp() + { + memset(&init_args, 0, sizeof(RuntimeInitArgs)); + init_args.mem_alloc_type = Alloc_With_Pool; + init_args.mem_alloc_option.pool.heap_buf = global_heap_buf; + init_args.mem_alloc_option.pool.heap_size = sizeof(global_heap_buf); + ASSERT_EQ(wasm_runtime_full_init(&init_args), true); + } + + virtual void TearDown() { wasm_runtime_destroy(); } + + public: + char global_heap_buf[512 * 1024]; + RuntimeInitArgs init_args; + char error_buf[256]; +}; + +/* + * Minimal wasm module that exports a single `madd` function: + * + * (module + * (func (export "madd") (result i64) + * v128.const f32x4 1 2 3 4 + * v128.const f32x4 10 20 30 40 + * v128.const f32x4 100 200 300 400 + * f32x4.relaxed_madd ;; opcode 0xfd 0x85 0x02 (= 0x105) + * i64x2.extract_lane 0)) + * + * Bytes below are the raw output of `wasm-tools parse` on that WAT, + * inlined so the test has no wabt / wat-runtime dependency at run. 
+ */ +static const uint8_t MADD_WASM[] = { + 0x00, 0x61, 0x73, 0x6D, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60, + 0x00, 0x01, 0x7E, 0x03, 0x02, 0x01, 0x00, 0x07, 0x08, 0x01, 0x04, 0x6D, + 0x61, 0x64, 0x64, 0x00, 0x00, 0x0A, 0x40, 0x01, 0x3E, 0x00, 0xFD, 0x0C, + 0x00, 0x00, 0x80, 0x3F, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, + 0x00, 0x00, 0x80, 0x40, 0xFD, 0x0C, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, + 0xA0, 0x41, 0x00, 0x00, 0xF0, 0x41, 0x00, 0x00, 0x20, 0x42, 0xFD, 0x0C, + 0x00, 0x00, 0xC8, 0x42, 0x00, 0x00, 0x48, 0x43, 0x00, 0x00, 0x96, 0x43, + 0x00, 0x00, 0xC8, 0x43, 0xFD, 0x85, 0x02, 0xFD, 0x1D, 0x00, 0x0B +}; + +TEST_F(RelaxedSimdTest, load_module_with_relaxed_madd) +{ + char err[128] = { 0 }; + /* The runtime API expects a mutable buffer (modifies in + * place during load); copy into a local stack buffer first. */ + uint8_t buf[sizeof(MADD_WASM)]; + memcpy(buf, MADD_WASM, sizeof(MADD_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) + << "load failed: " << err + << " — make sure WAMR_BUILD_RELAXED_SIMD=1 is set"; + wasm_runtime_unload(module); +} + +TEST_F(RelaxedSimdTest, invoke_relaxed_madd_returns_fma_result) +{ + char err[128] = { 0 }; + uint8_t buf[sizeof(MADD_WASM)]; + memcpy(buf, MADD_WASM, sizeof(MADD_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) << "load failed: " << err; + + wasm_module_inst_t inst = wasm_runtime_instantiate( + module, 32768u, 32768u, err, (uint32_t)sizeof(err)); + ASSERT_NE(inst, nullptr) << "instantiate failed: " << err; + + wasm_function_inst_t func = wasm_runtime_lookup_function(inst, "madd"); + ASSERT_NE(func, nullptr) << "export `madd` not found"; + + wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u); + ASSERT_NE(env, nullptr); + + uint32_t argv[2] = { 0, 0 }; + bool ok = wasm_runtime_call_wasm(env, func, 0, argv); + 
EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst); + + /* + * Expected: f32x4.relaxed_madd((1,2,3,4), (10,20,30,40), + * (100,200,300,400)) + * = (1*10+100, 2*20+200, 3*30+300, 4*40+400) + * = (110, 240, 390, 560) + * + * As bit patterns: + * f32(110) = 0x42DC0000 + * f32(240) = 0x43700000 + * f32(390) = 0x43C30000 + * f32(560) = 0x440C0000 + * + * i64x2.extract_lane 0 packs lanes 0,1 of the v128 into the + * low i64: + * high i32 (argv[1]) = lane 1 = 0x43700000 + * low i32 (argv[0]) = lane 0 = 0x42DC0000 + * + * (Both single-rounded FMA hardware and split mul+add + * produce the same bit pattern here — every product and sum + * is exactly representable in f32.) + */ + EXPECT_EQ(argv[0], 0x42DC0000u); + EXPECT_EQ(argv[1], 0x43700000u); + + wasm_runtime_destroy_exec_env(env); + wasm_runtime_deinstantiate(inst); + wasm_runtime_unload(module); +} From 00f1f1f90ae6a5b46b9c2b32c49512cbd151a9ac Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Wed, 20 May 2026 22:34:01 -0700 Subject: [PATCH 6/9] fast-interp: i32x4.relaxed_dot_i8x16_i7x16_add_s preserve i16 intermediate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer note (chatgpt-codex-connector on rebeckerspecialties/wasm-micro-runtime#3): summing all four i8 byte products directly into the i32 lane skipped the i16 truncation point that the spec defines via i16x8.relaxed_dot + extadd_pairwise_i16x8_s. For lanes with a=b=0x80, the previous impl produced 65536+c, which is outside the spec-allowed result set {-65536+c, 65534+c, -1+c} (wrap or saturate at each of two pair sums). Fix preserves the i16 intermediate using wrap, matching the i16x8 dot case immediately above. 
Worked example, a=b=0x80 in all four lanes: lo_pair = (-128*-128) + (-128*-128) = 32768 (int16)32768 = -32768 (wrap) hi_pair = 32768 → -32768 ext_sum = (i32)-32768 + (i32)-32768 = -65536 result = -65536 + c ✓ wrap+wrap allowed value --- core/iwasm/interpreter/wasm_interp_fast.c | 38 ++++++++++++++--------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 4072656fbd..0e9ff349ef 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -7650,12 +7650,15 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, } case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s: { - /* i32x4.relaxed_dot_i8x16_i7x16_add_s(a, b, - * c): compute i16x8.relaxed_dot_i8x16_i7x16_s - * then i32x4 extend-pairwise-add, then add c. - * Each i32 lane sums four i8*i8 products - * (two pairs from the i16x8 intermediate) - * plus the corresponding i32 from c. */ + /* i32x4.relaxed_dot_i8x16_i7x16_add_s(a, b, c) is + * specified as the i16x8 relaxed dot followed by + * i32x4.extadd_pairwise_i16x8_s then i32 add of c. + * The i16 truncation between the two steps matters + * — for lanes where the pair sum overflows i16 + * (e.g. a=b=0x80), summing the four i8 products + * directly into i32 produces a value outside the + * spec-allowed set. Preserve the i16 intermediate + * (wrap, matching the i16x8 dot above). 
*/ V128 v3 = POP_V128(); V128 v2 = POP_V128(); V128 v1 = POP_V128(); @@ -7663,15 +7666,20 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, uint32 lane; addr_ret = GET_OFFSET(); for (lane = 0; lane < 4; lane++) { - int32 sum = 0; - uint32 k; - for (k = 0; k < 4; k++) { - int32 byte = 4 * lane + k; - sum += (int32)v1.i8x16[byte] - * (int32)v2.i8x16[byte]; - } - result.i32x4[lane] = - (int32)((uint32)sum + (uint32)v3.i32x4[lane]); + int32 lo_pair = + (int32)v1.i8x16[4 * lane + 0] + * (int32)v2.i8x16[4 * lane + 0] + + (int32)v1.i8x16[4 * lane + 1] + * (int32)v2.i8x16[4 * lane + 1]; + int32 hi_pair = + (int32)v1.i8x16[4 * lane + 2] + * (int32)v2.i8x16[4 * lane + 2] + + (int32)v1.i8x16[4 * lane + 3] + * (int32)v2.i8x16[4 * lane + 3]; + int32 ext_sum = + (int32)(int16)lo_pair + (int32)(int16)hi_pair; + result.i32x4[lane] = (int32)( + (uint32)ext_sum + (uint32)v3.i32x4[lane]); } PUT_V128_TO_ADDR(frame_lp + addr_ret, result); break; From 5f233a9c930634c72a99a00ba9c9ba96b729b4c2 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Wed, 20 May 2026 23:11:27 -0700 Subject: [PATCH 7/9] fast-interp: regression tests for dot-product i16-intermediate overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new tests for the chatgpt-codex-connector finding on rebeckerspecialties/wasm-micro-runtime#3: 1. `dot_add_i16_intermediate_overflow_regression` — pins the spec-conformant -65536 result for the input pattern that used to produce 65536 (outside the spec-allowed set {-65536, -1, 65534}). Future refactor back to a direct-i32- sum impl fails immediately. 2. `dot_s_i16_overflow_pin_sibling_op` — pins the sibling `i16x8.relaxed_dot_i8x16_i7x16_s` impl at the same overflow boundary. The current impl correctly truncates via the `(int16)sum` cast (wasm_interp_fast.c:8103); the test makes a future refactor that drops the cast loudly fail. 
Both inputs use a = b = 0x80 in all 16 bytes — the classic case where the i8×i8 pair sum overflows i16 and the truncation point between "i16x8 relaxed dot" and "extadd_pairwise_i16x8_s" distinguishes spec-conformant impls from naive direct-sum impls. Bytecode for both modules was generated via `wat2wasm --enable-relaxed-simd` on minimal known-good WAT (documented inline in the static-array comments) and inlined to avoid a wabt/wat-runtime dependency at test time. --- tests/unit/relaxed-simd/relaxed_simd_test.cc | 145 +++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/tests/unit/relaxed-simd/relaxed_simd_test.cc b/tests/unit/relaxed-simd/relaxed_simd_test.cc index 6e767770ca..234cb4b772 100644 --- a/tests/unit/relaxed-simd/relaxed_simd_test.cc +++ b/tests/unit/relaxed-simd/relaxed_simd_test.cc @@ -139,3 +139,148 @@ TEST_F(RelaxedSimdTest, invoke_relaxed_madd_returns_fma_result) wasm_runtime_deinstantiate(inst); wasm_runtime_unload(module); } + +/* + * Regression test for the i16-intermediate truncation bug in + * `i32x4.relaxed_dot_i8x16_i7x16_add_s` flagged by the chatgpt- + * codex-connector code review on PR #3 (commit "fast-interp: + * i32x4.relaxed_dot_i8x16_i7x16_add_s preserve i16 intermediate"). + * + * (module + * (func (export "dot_add_i16_overflow") (result i64) + * v128.const i8x16 -128 -128 -128 -128 -128 -128 -128 -128 + * -128 -128 -128 -128 -128 -128 -128 -128 + * v128.const i8x16 -128 -128 -128 -128 -128 -128 -128 -128 + * -128 -128 -128 -128 -128 -128 -128 -128 + * v128.const i32x4 0 0 0 0 + * i32x4.relaxed_dot_i8x16_i7x16_add_s + * i64x2.extract_lane 0)) + * + * With a = b = 0x80 (i8 = -128) in all 16 bytes and c = 0, the + * spec-allowed result set is {-65536, -1, 65534} per lane (the + * three possible wrap/saturate combinations of the two pair + * sums). The pre-fix direct-sum impl produced 65536 — outside + * that set. 
The fix preserves the i16 truncation between the + * pair sum and the extadd_pairwise, producing -65536 per lane. + * + * low i64 = (lane1 << 32) | lane0 = 0xffff0000_ffff0000 + */ +static const uint8_t DOT_ADD_OVERFLOW_WASM[] = { + 0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60, + 0x00, 0x01, 0x7e, 0x03, 0x02, 0x01, 0x00, 0x07, 0x18, 0x01, 0x14, 0x64, + 0x6f, 0x74, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x31, 0x36, 0x5f, 0x6f, + 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x00, 0x00, 0x0a, 0x40, 0x01, + 0x3e, 0x00, 0xfd, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xfd, 0x0c, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0xfd, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x93, 0x02, 0xfd, + 0x1d, 0x00, 0x0b +}; + +TEST_F(RelaxedSimdTest, dot_add_i16_intermediate_overflow_regression) +{ + char err[128] = { 0 }; + uint8_t buf[sizeof(DOT_ADD_OVERFLOW_WASM)]; + memcpy(buf, DOT_ADD_OVERFLOW_WASM, sizeof(DOT_ADD_OVERFLOW_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) << "load failed: " << err; + + wasm_module_inst_t inst = wasm_runtime_instantiate( + module, 32768u, 32768u, err, (uint32_t)sizeof(err)); + ASSERT_NE(inst, nullptr) << "instantiate failed: " << err; + + wasm_function_inst_t func = + wasm_runtime_lookup_function(inst, "dot_add_i16_overflow"); + ASSERT_NE(func, nullptr) << "export `dot_add_i16_overflow` not found"; + + wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u); + ASSERT_NE(env, nullptr); + + uint32_t argv[2] = { 0, 0 }; + bool ok = wasm_runtime_call_wasm(env, func, 0, argv); + EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst); + + /* Per-lane result: -65536 = 0xffff0000 (i32). 
i64x2.extract_lane 0 + * packs lanes 0 and 1, both = 0xffff0000: + * argv[0] (low i32) = 0xffff0000 + * argv[1] (high i32) = 0xffff0000 + * If anyone refactors the impl back to direct-sum, both lanes + * will be 0x00010000 (= 65536) and this test will fail. */ + EXPECT_EQ(argv[0], 0xffff0000u); + EXPECT_EQ(argv[1], 0xffff0000u); + + wasm_runtime_destroy_exec_env(env); + wasm_runtime_deinstantiate(inst); + wasm_runtime_unload(module); +} + +/* + * Pinning test for `i16x8.relaxed_dot_i8x16_i7x16_s` at the same + * i16-intermediate overflow boundary. The current impl correctly + * truncates to i16 via `result.i16x8[lane] = (int16)sum` on + * wasm_interp_fast.c:8103. Same input pattern (a = b = 0x80 + * everywhere); each i16 lane = (int16)32768 = -32768 = 0x8000. + * + * low i64 = four i16 lanes packed little-endian + * = 0x8000_8000_8000_8000 + * + * If a future refactor drops the (int16) cast in the sibling + * op, this test fires before the bug ships. + * + * (module + * (func (export "dot_s_i16_overflow_pin") (result i64) + * v128.const i8x16 -128 ... (16x) + * v128.const i8x16 -128 ... 
(16x) + * i16x8.relaxed_dot_i8x16_i7x16_s + * i64x2.extract_lane 0)) + */ +static const uint8_t DOT_S_PIN_WASM[] = { + 0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60, + 0x00, 0x01, 0x7e, 0x03, 0x02, 0x01, 0x00, 0x07, 0x1a, 0x01, 0x16, 0x64, + 0x6f, 0x74, 0x5f, 0x73, 0x5f, 0x69, 0x31, 0x36, 0x5f, 0x6f, 0x76, 0x65, + 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x5f, 0x70, 0x69, 0x6e, 0x00, 0x00, 0x0a, + 0x2e, 0x01, 0x2c, 0x00, 0xfd, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xfd, 0x0c, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0xfd, 0x92, 0x02, 0xfd, 0x1d, 0x00, 0x0b +}; + +TEST_F(RelaxedSimdTest, dot_s_i16_overflow_pin_sibling_op) +{ + char err[128] = { 0 }; + uint8_t buf[sizeof(DOT_S_PIN_WASM)]; + memcpy(buf, DOT_S_PIN_WASM, sizeof(DOT_S_PIN_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) << "load failed: " << err; + + wasm_module_inst_t inst = wasm_runtime_instantiate( + module, 32768u, 32768u, err, (uint32_t)sizeof(err)); + ASSERT_NE(inst, nullptr) << "instantiate failed: " << err; + + wasm_function_inst_t func = + wasm_runtime_lookup_function(inst, "dot_s_i16_overflow_pin"); + ASSERT_NE(func, nullptr) << "export `dot_s_i16_overflow_pin` not found"; + + wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u); + ASSERT_NE(env, nullptr); + + uint32_t argv[2] = { 0, 0 }; + bool ok = wasm_runtime_call_wasm(env, func, 0, argv); + EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst); + + /* low i64 = four packed i16 lanes, all = (int16)32768 = -32768 + * = 0x8000_8000_8000_8000 + * argv[0] (low i32) = 0x80008000 + * argv[1] (high i32) = 0x80008000 */ + EXPECT_EQ(argv[0], 0x80008000u); + EXPECT_EQ(argv[1], 0x80008000u); + + wasm_runtime_destroy_exec_env(env); + wasm_runtime_deinstantiate(inst); + 
wasm_runtime_unload(module); +} From c80b1698ad1d3b1da23426d45c664645d10317dd Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Wed, 20 May 2026 23:16:21 -0700 Subject: [PATCH 8/9] fixup: clang-format-14 line break in relaxed_dot_add_s result write The Coding Guidelines CI check uses `clang-format-14` and flagged the line break I chose in the previous "preserve i16 intermediate" commit. Newer clang-format-22 happens to accept both shapes; clang-format-14 prefers the cast-then-paren-group form: result.i32x4[lane] = (int32)((uint32)ext_sum + (uint32)v3.i32x4[lane]); Functionally identical. No behaviour change. --- core/iwasm/interpreter/wasm_interp_fast.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 0e9ff349ef..11f3fe4b57 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -7678,8 +7678,9 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, * (int32)v2.i8x16[4 * lane + 3]; int32 ext_sum = (int32)(int16)lo_pair + (int32)(int16)hi_pair; - result.i32x4[lane] = (int32)( - (uint32)ext_sum + (uint32)v3.i32x4[lane]); + result.i32x4[lane] = + (int32)((uint32)ext_sum + + (uint32)v3.i32x4[lane]); } PUT_V128_TO_ADDR(frame_lp + addr_ret, result); break; From 07e3334f6f97f933c2cf048777572a5d8d02f1e5 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 02:18:10 -0700 Subject: [PATCH 9/9] fast-interp: spec-allowed-set tests for q15mulr overflow and madd Inf*0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two more relaxed-SIMD boundary tests in the unit suite, both exercising implementation-defined behaviors that the dot-product regression-tests already established for this PR but that weren't yet covered for these ops: 1. `q15mulr_int16_min_squared_either_sat_or_wrap` — the INT16_MIN * INT16_MIN case. 
Spec relaxes the result of `sat_s((a*b + 0x4000) >> 15)` so an implementation may pick either the saturating result (0x7fff, e.g. aarch64 SQRDMULH) or the wrapping result (0x8000, e.g. x86 PMULHRSW). Test uses *membership* (either of the two allowed values) rather than exact equality, so a future switch to wrap doesn't break the test. 2. `madd_inf_times_zero_propagates_nan` — adversarial input for the fused/unfused FMA path (`f32x4.relaxed_madd`). IEEE 754 §7.2 makes `Inf * 0` an invalid multiply that produces NaN regardless of the subsequent add, so both `fma(Inf, 0, c)` and unfused `Inf * 0 + c` produce *some* NaN — but the specific NaN bit pattern is impl-defined. Test checks each lane against the IEEE-754 NaN predicate (exp == 0xff and fraction != 0) rather than an exact bit pattern. Locally exercised via `iwasm -f`: q15mulr result: 0x7fff (saturate, current SIMDe lowering) madd_inf_times_zero result: 0x7fc00000 per lane (canonical f32 NaN) Both fit the spec-allowed sets the tests describe; the membership assertions confirm without overfitting to the specific bit pattern. --- tests/unit/relaxed-simd/relaxed_simd_test.cc | 187 +++++++++++++++++++ 1 file changed, 187 insertions(+) diff --git a/tests/unit/relaxed-simd/relaxed_simd_test.cc b/tests/unit/relaxed-simd/relaxed_simd_test.cc index 234cb4b772..d8d315d14d 100644 --- a/tests/unit/relaxed-simd/relaxed_simd_test.cc +++ b/tests/unit/relaxed-simd/relaxed_simd_test.cc @@ -284,3 +284,190 @@ TEST_F(RelaxedSimdTest, dot_s_i16_overflow_pin_sibling_op) wasm_runtime_deinstantiate(inst); wasm_runtime_unload(module); } + +/* + * Spec-allowed-set test for `i16x8.relaxed_q15mulr_s` at the + * INT16_MIN * INT16_MIN overflow boundary. + * + * (module + * (func (export "q15mulr_int16_min_squared") (result i64) + * v128.const i16x8 -32768 0 0 0 0 0 0 0 + * v128.const i16x8 -32768 0 0 0 0 0 0 0 + * i16x8.relaxed_q15mulr_s + * i64x2.extract_lane 0)) + * + * Q15 multiply-with-rounding: lane = sat_s((a*b + 0x4000) >> 15). 
+ * For a = b = INT16_MIN: + * a*b = (-32768)*(-32768) = 0x40000000 + * + 0x4000 = 0x40004000 + * >> 15 = 0x8000 = 32768 (overflows i16) + * sat_s = 32767 = 0x7fff (saturate, e.g. aarch64 SQRDMULH) + * wrap = (int16)32768 = 0x8000 (wrap, e.g. x86 PMULHRSW) + * + * The spec's relaxed clause permits either lowering, so the lane-0 + * value must be 0x7fff OR 0x8000. Lanes 1..7 are 0 (deterministic). + * Encoded as the low i64 (i64x2.extract_lane 0) the spec-allowed + * set is { 0x0000_0000_0000_7fff, 0x0000_0000_0000_8000 }. + * + * WAMR's current lowering (SIMDe `simde_wasm_i16x8_q15mulr_sat`) saturates (0x7fff); this test + * checks set membership rather than exact equality, so a + * future switch to wrap (spec-allowed) does not break the test. + */ +static const uint8_t Q15MULR_OVERFLOW_WASM[] = { + 0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, + 0x60, 0x00, 0x01, 0x7e, 0x03, 0x02, 0x01, 0x00, 0x07, 0x1d, 0x01, + 0x19, 0x71, 0x31, 0x35, 0x6d, 0x75, 0x6c, 0x72, 0x5f, 0x69, 0x6e, + 0x74, 0x31, 0x36, 0x5f, 0x6d, 0x69, 0x6e, 0x5f, 0x73, 0x71, 0x75, + 0x61, 0x72, 0x65, 0x64, 0x00, 0x00, 0x0a, 0x2e, 0x01, 0x2c, 0x00, + 0xfd, 0x0c, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x0c, 0x00, 0x80, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0xfd, 0x91, 0x02, 0xfd, 0x1d, 0x00, 0x0b +}; + +TEST_F(RelaxedSimdTest, q15mulr_int16_min_squared_either_sat_or_wrap) +{ + char err[128] = { 0 }; + uint8_t buf[sizeof(Q15MULR_OVERFLOW_WASM)]; + memcpy(buf, Q15MULR_OVERFLOW_WASM, sizeof(Q15MULR_OVERFLOW_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) << "load failed: " << err; + + wasm_module_inst_t inst = wasm_runtime_instantiate( + module, 32768u, 32768u, err, (uint32_t)sizeof(err)); + ASSERT_NE(inst, nullptr) << "instantiate failed: " << err; + + wasm_function_inst_t func = + 
wasm_runtime_lookup_function(inst, "q15mulr_int16_min_squared"); + ASSERT_NE(func, nullptr) << "export `q15mulr_int16_min_squared` not found"; + + wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u); + ASSERT_NE(env, nullptr); + + uint32_t argv[2] = { 0, 0 }; + bool ok = wasm_runtime_call_wasm(env, func, 0, argv); + EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst); + + /* Lanes 1..3 must be 0 (deterministic). Encoded in argv: lanes + * 1..3 occupy bits 16..63 of the 64-bit packed result. + * argv[0] (low i32) = (lane1 << 16) | lane0 + * argv[1] (high i32) = (lane3 << 16) | lane2 */ + EXPECT_EQ(argv[1], 0u) << "lanes 2,3 must be zero"; + EXPECT_EQ((argv[0] >> 16) & 0xffffu, 0u) << "lane 1 must be zero"; + + /* Lane 0 = low 16 bits of argv[0]: either 0x7fff (sat) or + * 0x8000 (wrap). Both spec-conformant per the relaxed-SIMD + * implementation-defined clause for q15mulr_s. */ + uint32_t lane0 = argv[0] & 0xffffu; + EXPECT_TRUE(lane0 == 0x7fffu || lane0 == 0x8000u) + << "lane 0 = 0x" << std::hex << lane0 + << ", expected 0x7fff (saturate) or 0x8000 (wrap)"; + + wasm_runtime_destroy_exec_env(env); + wasm_runtime_deinstantiate(inst); + wasm_runtime_unload(module); +} + +/* + * Spec-allowed-set test for `f32x4.relaxed_madd` at the + * (Inf * 0 + c) invalid-multiply boundary. + * + * (module + * (func (export "madd_inf_times_zero_lo") (result i64) + * v128.const f32x4 inf inf inf inf + * v128.const f32x4 0 0 0 0 + * v128.const f32x4 1.0 2.0 3.0 4.0 + * f32x4.relaxed_madd + * i64x2.extract_lane 0) + * (func (export "madd_inf_times_zero_hi") (result i64) ;; lane 1) + * + * IEEE 754 §7.2: Inf × 0 is an invalid operation and produces NaN + * (regardless of the subsequent add of `c`). Both fused-multiply- + * add (`fma(Inf, 0, c)`) and unfused (`Inf * 0 + c`) lowerings of + * relaxed_madd produce a NaN here — so the choice between them + * doesn't affect the *kind* of result, only its specific bit + * pattern. 
The relaxed-SIMD spec leaves the NaN bit pattern
+ * implementation-defined, so the test checks the IEEE-754 NaN
+ * predicate (exponent all-ones, fraction non-zero) per lane
+ * rather than an exact bit pattern.
+ *
+ * This case is the relevant adversarial input for "do we
+ * propagate NaN through the FMA path correctly when one of the
+ * inputs is +Inf and another is +0?" — exactly the kind of
+ * boundary the spec test set doesn't explicitly cover.
+ */
+static const uint8_t MADD_INF_TIMES_ZERO_WASM[] = {
+    /* Layout, in order (rows below are wrapped at 12 bytes and do not
+     * line up with section boundaries):
+     *   - 8-byte header: "\0asm" magic + version 1
+     *   - type section     (id 0x01): one signature, () -> i64 (0x7e)
+     *   - function section (id 0x03): two functions, both of type 0
+     *   - export section   (id 0x07): "madd_inf_times_zero_lo" -> func 0,
+     *                                 "madd_inf_times_zero_hi" -> func 1
+     *   - code section     (id 0x0a): two 0x3e-byte bodies
+     * Each body is three `0xfd 0x0c` (v128.const) + 16 immediate bytes,
+     * then `0xfd 0x85 0x02` (SIMD prefix + LEB128 sub-opcode 0x105),
+     * then `0xfd 0x1d <lane>` (i64x2.extract_lane) and `0x0b` (end).
+     * The two bodies differ only in that lane byte: 0x00 vs 0x01. */
+    0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60,
+    0x00, 0x01, 0x7e, 0x03, 0x03, 0x02, 0x00, 0x00, 0x07, 0x33, 0x02, 0x16,
+    0x6d, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6e, 0x66, 0x5f, 0x74, 0x69, 0x6d,
+    0x65, 0x73, 0x5f, 0x7a, 0x65, 0x72, 0x6f, 0x5f, 0x6c, 0x6f, 0x00, 0x00,
+    0x16, 0x6d, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6e, 0x66, 0x5f, 0x74, 0x69,
+    0x6d, 0x65, 0x73, 0x5f, 0x7a, 0x65, 0x72, 0x6f, 0x5f, 0x68, 0x69, 0x00,
+    0x01, 0x0a, 0x7f, 0x02, 0x3e, 0x00, 0xfd, 0x0c, 0x00, 0x00, 0x80, 0x7f,
+    0x00, 0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f,
+    0xfd, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x0c, 0x00, 0x00, 0x80, 0x3f,
+    0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, 0x00, 0x80, 0x40,
+    0xfd, 0x85, 0x02, 0xfd, 0x1d, 0x00, 0x0b, 0x3e, 0x00, 0xfd, 0x0c, 0x00,
+    0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f, 0x00,
+    0x00, 0x80, 0x7f, 0xfd, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x0c, 0x00,
+    0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00,
+    0x00, 0x80, 0x40, 0xfd, 0x85, 0x02, 0xfd, 0x1d, 0x01, 0x0b
+};
+
+/* Helper: true iff the f32 bit pattern is any NaN
+ * (exponent = 0xff, fraction != 0).
*/
+static bool
+f32_bits_are_nan(uint32_t bits)
+{
+    /* binary32 layout: 1 sign bit, 8 exponent bits, 23 fraction bits.
+     * The sign bit is ignored, so NaNs of either sign are accepted. */
+    const uint32_t exponent = (bits >> 23) & 0xff;
+    const uint32_t fraction = bits & 0x7fffff;
+    return exponent == 0xff && fraction != 0u;
+}
+
+TEST_F(RelaxedSimdTest, madd_inf_times_zero_propagates_nan)
+{
+    char err[128] = { 0 };
+    uint8_t buf[sizeof(MADD_INF_TIMES_ZERO_WASM)];
+    memcpy(buf, MADD_INF_TIMES_ZERO_WASM, sizeof(MADD_INF_TIMES_ZERO_WASM));
+
+    /* A failing ASSERT_* returns from the test body immediately
+     * (including from inside the loop below), so the runtime objects
+     * are released from a destructor instead of from trailing
+     * statements that an early return would skip. Declared after
+     * `buf`, so it is destroyed (and the module unloaded) while `buf`
+     * is still alive. */
+    struct RuntimeObjects {
+        wasm_module_t module = nullptr;
+        wasm_module_inst_t inst = nullptr;
+        wasm_exec_env_t env = nullptr;
+
+        ~RuntimeObjects()
+        {
+            /* Reverse order of acquisition. */
+            if (env)
+                wasm_runtime_destroy_exec_env(env);
+            if (inst)
+                wasm_runtime_deinstantiate(inst);
+            if (module)
+                wasm_runtime_unload(module);
+        }
+    } rt;
+
+    rt.module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err,
+                                  (uint32_t)sizeof(err));
+    ASSERT_NE(rt.module, nullptr) << "load failed: " << err;
+
+    rt.inst = wasm_runtime_instantiate(rt.module, 32768u, 32768u, err,
+                                       (uint32_t)sizeof(err));
+    ASSERT_NE(rt.inst, nullptr) << "instantiate failed: " << err;
+
+    rt.env = wasm_runtime_create_exec_env(rt.inst, 32768u);
+    ASSERT_NE(rt.env, nullptr);
+
+    /* Call the lo half (lanes 0,1) then the hi half (lanes 2,3);
+     * each call returns one i64 packing two f32 lanes:
+     *   argv[0] = lane2k bits, argv[1] = lane2k+1 bits */
+    for (uint32_t half = 0; half < 2; half++) {
+        const char *name =
+            half == 0 ? "madd_inf_times_zero_lo" : "madd_inf_times_zero_hi";
+        wasm_function_inst_t func =
+            wasm_runtime_lookup_function(rt.inst, name);
+        ASSERT_NE(func, nullptr) << "export `" << name << "` not found";
+
+        /* Fatal on failure: after a failed call `argv` still holds its
+         * initial zeros, which are not NaN, so the lane checks below
+         * would report two extra failures that say nothing new. */
+        uint32_t argv[2] = { 0, 0 };
+        bool ok = wasm_runtime_call_wasm(rt.env, func, 0, argv);
+        ASSERT_TRUE(ok) << "call_wasm `" << name << "` failed: "
+                        << wasm_runtime_get_exception(rt.inst);
+
+        EXPECT_TRUE(f32_bits_are_nan(argv[0]))
+            << name << " lane " << (2 * half) << " not NaN: bits = 0x"
+            << std::hex << argv[0];
+        EXPECT_TRUE(f32_bits_are_nan(argv[1]))
+            << name << " lane " << (2 * half + 1) << " not NaN: bits = 0x"
+            << std::hex << argv[1];
+    }
+}