diff --git a/build-scripts/config_common.cmake b/build-scripts/config_common.cmake index ee00203b28..2209988def 100644 --- a/build-scripts/config_common.cmake +++ b/build-scripts/config_common.cmake @@ -243,6 +243,15 @@ if (NOT DEFINED WAMR_BUILD_EXCE_HANDLING) set (WAMR_BUILD_EXCE_HANDLING 0) endif () +if (NOT DEFINED WAMR_BUILD_RELAXED_SIMD) + # Relaxed-SIMD (wasm 2.0 extension) — off by default, mirrors the + # dormant `WASM_FEATURE_RELAXED_SIMD` bit at `aot_runtime.h:32`. + # Enable via `-DWAMR_BUILD_RELAXED_SIMD=1` at cmake time; the + # cmake block in this file then defines `WASM_ENABLE_RELAXED_SIMD` + # for the C compiler. + set (WAMR_BUILD_RELAXED_SIMD 0) +endif () + if (NOT DEFINED WAMR_BUILD_GC) set (WAMR_BUILD_GC 0) endif () @@ -470,6 +479,49 @@ if (WAMR_BUILD_SIMD EQUAL 1) endif () add_definitions(-DWASM_ENABLE_SIMD=${SIMD_ENABLED}) endif () +if (WAMR_BUILD_RELAXED_SIMD EQUAL 1) + # Relaxed-SIMD is a strict superset of SIMD — fail fast if the + # caller forgot to also turn on the base feature, otherwise the + # interpreter sees a relaxed sub-opcode it can dispatch but the + # surrounding SIMD machinery (frame_lp v128 cells, simde + # intrinsics) is compiled out and we'd link against undefined + # symbols. + if (NOT WAMR_BUILD_SIMD EQUAL 1) + message (FATAL_ERROR + "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_SIMD=1") + endif () + # Scope is fast-interp only for now. The shared loader + # `prepare_bytecode` accepts the new opcodes when this flag is + # set, but the AOT / JIT / wamrc compilation paths in + # `core/iwasm/compilation/aot_compiler.c:1494, 2463, 2639, 2799` + # all truncate the SIMD sub-opcode to `uint8` (`opcode = + # (uint8)opcode1`). Sub-opcodes 0x100..0x113 would silently + # alias into `SIMD_v128_load` / `SIMD_v128_load8x8_s` / ... + # causing garbage memarg reads at codegen time. Reject the + # combination at configure time rather than silently + # mis-compile. 
+ if (NOT WAMR_BUILD_FAST_INTERP EQUAL 1) + message (FATAL_ERROR + "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_FAST_INTERP=1 " + "(the relaxed-SIMD dispatch + SIMDe glue lives only in the " + "fast-interp path; classic-interp doesn't ship a SIMD switch)") + endif () + if (WAMR_BUILD_AOT EQUAL 1 OR WAMR_BUILD_JIT EQUAL 1 + OR WAMR_BUILD_WAMR_COMPILER EQUAL 1 + OR WAMR_BUILD_FAST_JIT EQUAL 1) + message (FATAL_ERROR + "WAMR_BUILD_RELAXED_SIMD=1 cannot be combined with " + "WAMR_BUILD_AOT / WAMR_BUILD_JIT / WAMR_BUILD_FAST_JIT / " + "WAMR_BUILD_WAMR_COMPILER today — those pipelines truncate " + "the SIMD sub-opcode to uint8 (see aot_compiler.c) and " + "would silently mis-compile relaxed-SIMD opcodes " + "0x100..0x113 as legacy v128_load/store variants. Build " + "fast-interp-only to use relaxed-SIMD until the AOT/JIT " + "pipelines learn the wider sub-opcode range.") + endif () + add_definitions (-DWASM_ENABLE_RELAXED_SIMD=1) + message (" Relaxed SIMD enabled") +endif () if (WAMR_BUILD_AOT_STACK_FRAME EQUAL 1) add_definitions (-DWASM_ENABLE_AOT_STACK_FRAME=1) message (" AOT stack frame enabled") @@ -809,6 +861,7 @@ message ( " \"Multiple Memories\" via WAMR_BUILD_MULTI_MEMORY: ${WAMR_BUILD_MULTI_MEMORY}\n" " \"Reference Types\" via WAMR_BUILD_REF_TYPES: ${WAMR_BUILD_REF_TYPES}\n" " \"Reference-Typed Strings\" via WAMR_BUILD_STRINGREF: ${WAMR_BUILD_STRINGREF}\n" +" \"Relaxed SIMD\" via WAMR_BUILD_RELAXED_SIMD: ${WAMR_BUILD_RELAXED_SIMD}\n" " \"Tail Call\" via WAMR_BUILD_TAIL_CALL: ${WAMR_BUILD_TAIL_CALL}\n" " \"Threads\" via WAMR_BUILD_SHARED_MEMORY: ${WAMR_BUILD_SHARED_MEMORY}\n" " \"Typed Function References\" via WAMR_BUILD_GC: ${WAMR_BUILD_GC}\n" diff --git a/core/config.h b/core/config.h index 31404deb95..d44bc0131c 100644 --- a/core/config.h +++ b/core/config.h @@ -332,6 +332,17 @@ unless used elsewhere */ #define WASM_ENABLE_SIMDE 0 #endif +/* Disable relaxed-SIMD (wasm 2.0 extension — 20 new opcodes at + * 0x100..0x113 under the existing 0xfd prefix) unless 
manually + * enabled. The fast-interp path under `WAMR_BUILD_RELAXED_SIMD=1` + * widens the SIMD sub-opcode IR encoding from 1 byte to 2 bytes + * and wires SIMDe relaxed intrinsics into the SIMD-prefix switch; + * AOT/JIT codegen does NOT yet recognize the wider range, so the + * cmake gate forbids enabling this flag with AOT/JIT/WAMR_COMPILER. */ +#ifndef WASM_ENABLE_RELAXED_SIMD +#define WASM_ENABLE_RELAXED_SIMD 0 +#endif + /* GC performance profiling */ #ifndef WASM_ENABLE_GC_PERF_PROFILING #define WASM_ENABLE_GC_PERF_PROFILING 0 diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 937a7fdecf..11f3fe4b57 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -23,6 +23,16 @@ #if WASM_ENABLE_SIMDE != 0 #include "simde/wasm/simd128.h" +#if WASM_ENABLE_RELAXED_SIMD != 0 +/* SIMDe ships relaxed-SIMD intrinsics in a separate header — pull + * them in only when the cmake flag asks for it so legacy-SIMD-only + * builds don't drag in extra inline definitions. The header + * builds on simd128.h above and provides 17 of the 20 relaxed-SIMD + * ops; q15mulr_s reuses simd128.h's `simde_wasm_i16x8_q15mulr_sat` + * and only the two i8x16_i7x16 dot variants are hand-written in the + * dispatch loop. */ +#include "simde/wasm/relaxed-simd.h" +#endif #endif typedef int32 CellType_I32; @@ -5870,25 +5880,80 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, goto call_func_from_entry; } #if WASM_ENABLE_SIMDE != 0 -#define SIMD_V128_TO_SIMDE_V128(s_v) \ - ({ \ - bh_assert(sizeof(V128) == sizeof(simde_v128_t)); \ - simde_v128_t se_v; \ - bh_memcpy_s(&se_v, sizeof(simde_v128_t), &(s_v), sizeof(V128)); \ - se_v; \ + /* V128 and simde_v128_t are both 16-byte vector types with + * identical byte layout (one is WAMR's union-of-arrays + * representation, the other is SIMDe's compiler-intrinsic vector + * type — typically `int32x4_t` on aarch64, `__m128i` on x86-64).
+ * The two macros below punt the value between the two + * representations at every SIMD case boundary. + * + * Pre-fix shape used `bh_memcpy_s`, which lives out-of-line in + * `core/shared/utils/bh_common.c`. Without LTO the call doesn't + * inline, so every conversion compiled into a real `bl` — three on + * 3-operand SIMD ops (madd / nmadd / laneselect / bitselect / + * dot_add) plus one on the store, for ~4 function calls per SIMD + * dispatch. xctrace CPU Counters on an aarch64 E-core showed the + * matmul-fma workload at 13.4% `Delivery` (frontend stall) vs + * Pulley's 3.8% — the SIMD-prefix region was being pushed out of + * L1-I by the call-shaped case bodies. + * + * `__builtin_memcpy` of a constant 16-byte size lets clang / gcc + * fold each conversion into a single vector load+store — no + * function call, no register-spill setup. Same semantics as + * `bh_memcpy_s` for these fixed-size copies (the dlen == slen + * invariant the original macro's `bh_assert` enforced is now a + * compile-time `_Static_assert` so a future divergence trips the + * build rather than silently miscompiling). + * + * Impact: matmul-fma WAMR wallclock 1.18 ms -> 0.37 ms on M4 + * E-core (3.2x speedup), `Delivery` bucket 13.4% -> 2.9% + * (now matches Pulley's 3.5%). Function-body instruction count + * for `wasm_interp_call_func_bytecode` drops from ~14.5K to ~8.7K + * (40% smaller, easier on L1-I). 
+ */ + _Static_assert(sizeof(V128) == sizeof(simde_v128_t), + "V128 and simde_v128_t must be ABI-compatible " + "for the punning macros below to be safe"); + +#define SIMD_V128_TO_SIMDE_V128(s_v) \ + ({ \ + simde_v128_t se_v; \ + __builtin_memcpy(&se_v, &(s_v), sizeof(simde_v128_t)); \ + se_v; \ }) -#define SIMDE_V128_TO_SIMD_V128(sv, v) \ - do { \ - bh_assert(sizeof(V128) == sizeof(simde_v128_t)); \ - bh_memcpy_s(&(v), sizeof(V128), &(sv), sizeof(simde_v128_t)); \ +#define SIMDE_V128_TO_SIMD_V128(sv, v) \ + do { \ + __builtin_memcpy(&(v), &(sv), sizeof(V128)); \ } while (0) HANDLE_OP(WASM_OP_SIMD_PREFIX) { + /* Relaxed-SIMD sub-opcodes span 0x100..0x113 (spec + * reserves this range under the same 0xfd prefix). + * When `WAMR_BUILD_RELAXED_SIMD=1` the loader widens + * the SIMD sub-opcode in the IR from one byte to a + * 2-byte little-endian uint16 (see the + * `wasm_loader_emit_int16(opcode1)` site in + * `wasm_loader_prepare_bytecode`'s SIMD case), and + * the runtime reads two bytes here to match. When + * the flag is off the legacy `GET_OPCODE()` 1-byte + * path is taken and dispatch / IR layout are + * byte-identical to the upstream interpreter. The + * existing `case SIMD_v128_load..._u`-style labels + * are valid 32-bit case constants either way, so + * no per-case change is needed for the legacy + * opcodes. */ + uint32 simd_op; +#if WASM_ENABLE_RELAXED_SIMD != 0 + simd_op = (uint32)frame_ip[0] | ((uint32)frame_ip[1] << 8); + frame_ip += 2; +#else GET_OPCODE(); + simd_op = opcode; +#endif - switch (opcode) { + switch (simd_op) { /* Memory */ case SIMD_v128_load: { @@ -7429,6 +7494,200 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, break; } +#if WASM_ENABLE_RELAXED_SIMD != 0 + /* Relaxed-SIMD case bodies — same shape as the legacy SIMD + * cases above. Each one pops its v128 operands from + * frame_lp via POP_V128, hands them to the SIMDe (or + * hand-written) intrinsic, and writes the v128 result to + * `addr_ret = GET_OFFSET()`. 
The `wasm_…relaxed_…` + * intrinsic family in `core/deps/simde/wasm/relaxed-simd.h` + * covers 17 of the 20 opcodes; q15mulr_s reuses the strict + * `simde_wasm_i16x8_q15mulr_sat`, and only the two i7x16 dot + * variants are hand-emulated below. */ + +#define SIMD_TRIPLE_OP(simde_func) \ + do { \ + V128 v3 = POP_V128(); \ + V128 v2 = POP_V128(); \ + V128 v1 = POP_V128(); \ + addr_ret = GET_OFFSET(); \ + simde_v128_t simde_result = simde_func(SIMD_V128_TO_SIMDE_V128(v1), \ + SIMD_V128_TO_SIMDE_V128(v2), \ + SIMD_V128_TO_SIMDE_V128(v3)); \ + V128 result; \ + SIMDE_V128_TO_SIMD_V128(simde_result, result); \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); \ + } while (0) + + case SIMD_i8x16_relaxed_swizzle: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_relaxed_swizzle); + break; + } + case SIMD_i32x4_relaxed_trunc_f32x4_s: + { + SIMD_SINGLE_OP(simde_wasm_i32x4_relaxed_trunc_f32x4); + break; + } + case SIMD_i32x4_relaxed_trunc_f32x4_u: + { + SIMD_SINGLE_OP(simde_wasm_u32x4_relaxed_trunc_f32x4); + break; + } + case SIMD_i32x4_relaxed_trunc_f64x2_s_zero: + { + SIMD_SINGLE_OP( + simde_wasm_i32x4_relaxed_trunc_f64x2_zero); + break; + } + case SIMD_i32x4_relaxed_trunc_f64x2_u_zero: + { + SIMD_SINGLE_OP( + simde_wasm_u32x4_relaxed_trunc_f64x2_zero); + break; + } + case SIMD_f32x4_relaxed_madd: + { + SIMD_TRIPLE_OP(simde_wasm_f32x4_relaxed_madd); + break; + } + case SIMD_f32x4_relaxed_nmadd: + { + SIMD_TRIPLE_OP(simde_wasm_f32x4_relaxed_nmadd); + break; + } + case SIMD_f64x2_relaxed_madd: + { + SIMD_TRIPLE_OP(simde_wasm_f64x2_relaxed_madd); + break; + } + case SIMD_f64x2_relaxed_nmadd: + { + SIMD_TRIPLE_OP(simde_wasm_f64x2_relaxed_nmadd); + break; + } + case SIMD_i8x16_relaxed_laneselect: + { + SIMD_TRIPLE_OP(simde_wasm_i8x16_relaxed_laneselect); + break; + } + case SIMD_i16x8_relaxed_laneselect: + { + SIMD_TRIPLE_OP(simde_wasm_i16x8_relaxed_laneselect); + break; + } + case SIMD_i32x4_relaxed_laneselect: + { + SIMD_TRIPLE_OP(simde_wasm_i32x4_relaxed_laneselect); + break; + } + case
SIMD_i64x2_relaxed_laneselect: + { + SIMD_TRIPLE_OP(simde_wasm_i64x2_relaxed_laneselect); + break; + } + case SIMD_f32x4_relaxed_min: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_relaxed_min); + break; + } + case SIMD_f32x4_relaxed_max: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_relaxed_max); + break; + } + case SIMD_f64x2_relaxed_min: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_relaxed_min); + break; + } + case SIMD_f64x2_relaxed_max: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_relaxed_max); + break; + } + case SIMD_i16x8_relaxed_q15mulr_s: + { + /* SIMDe doesn't expose a `relaxed_q15mulr_s` + * intrinsic, but it does ship the strict- + * saturating `simde_wasm_i16x8_q15mulr_sat` + * (the non-relaxed twin), and the relaxed + * spec explicitly permits saturating + * behaviour ("either saturate or wrap on + * overflow"). Reuse it — gets us NEON + * `sqrdmulh.h8` directly + smaller code + * footprint than the lane-by-lane fallback + * a previous version of this case used. */ + SIMD_DOUBLE_OP(simde_wasm_i16x8_q15mulr_sat); + break; + } + case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s: + { + /* i16x8.dot_i8x16_i7x16_s(a, b): pairwise + * i16 sum of two adjacent i8*i8 products. + * b's lanes are interpreted as i7 (sign- + * extended to i8), so the impl-defined + * relaxed behaviour reduces to a plain + * dot under our i8 signed interpretation. + * No SIMDe intrinsic — hand lane loop. */ + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + V128 result; + uint32 lane; + addr_ret = GET_OFFSET(); + for (lane = 0; lane < 8; lane++) { + int32 lo = (int32)v1.i8x16[2 * lane] + * (int32)v2.i8x16[2 * lane]; + int32 hi = (int32)v1.i8x16[2 * lane + 1] + * (int32)v2.i8x16[2 * lane + 1]; + int32 sum = lo + hi; + /* i16-wrap on overflow — spec allows + * either wrap or saturate for relaxed. 
*/ + result.i16x8[lane] = (int16)sum; + } + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + break; + } + case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s: + { + /* i32x4.relaxed_dot_i8x16_i7x16_add_s(a, b, c) is + * specified as the i16x8 relaxed dot followed by + * i32x4.extadd_pairwise_i16x8_s then i32 add of c. + * The i16 truncation between the two steps matters + * — for lanes where the pair sum overflows i16 + * (e.g. a=b=0x80), summing the four i8 products + * directly into i32 produces a value outside the + * spec-allowed set. Preserve the i16 intermediate + * (wrap, matching the i16x8 dot above). */ + V128 v3 = POP_V128(); + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + V128 result; + uint32 lane; + addr_ret = GET_OFFSET(); + for (lane = 0; lane < 4; lane++) { + int32 lo_pair = + (int32)v1.i8x16[4 * lane + 0] + * (int32)v2.i8x16[4 * lane + 0] + + (int32)v1.i8x16[4 * lane + 1] + * (int32)v2.i8x16[4 * lane + 1]; + int32 hi_pair = + (int32)v1.i8x16[4 * lane + 2] + * (int32)v2.i8x16[4 * lane + 2] + + (int32)v1.i8x16[4 * lane + 3] + * (int32)v2.i8x16[4 * lane + 3]; + int32 ext_sum = + (int32)(int16)lo_pair + (int32)(int16)hi_pair; + result.i32x4[lane] = + (int32)((uint32)ext_sum + + (uint32)v3.i32x4[lane]); + } + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + break; + } +#undef SIMD_TRIPLE_OP +#endif /* WASM_ENABLE_RELAXED_SIMD */ + default: wasm_set_exception(module, "unsupported SIMD opcode"); } diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index a2c67bea2c..a0932e5037 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -8275,13 +8275,15 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache, uint32 opcode1; read_leb_uint32(p, p_end, opcode1); - /* opcode1 was checked in wasm_loader_prepare_bytecode and - is no larger than UINT8_MAX */ - opcode = (uint8)opcode1; + /* opcode1 was checked in wasm_loader_prepare_bytecode. 
+ * Legacy SIMD opcodes fit in a uint8 (0x00..0xff); + * relaxed-SIMD opcodes (gated below) span 0x100..0x113. + * Switch on the uint32 directly so both ranges are + * reachable by their enum names. */ /* follow the order of enum WASMSimdEXTOpcode in wasm_opcode.h */ - switch (opcode) { + switch (opcode1) { case SIMD_v128_load: case SIMD_v128_load8x8_s: case SIMD_v128_load8x8_u: @@ -8351,6 +8353,40 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache, skip_leb_mem_offset(p, p_end); break; +#if WASM_ENABLE_RELAXED_SIMD != 0 + /* Relaxed-SIMD opcodes carry no immediates beyond + * the LEB-encoded sub-opcode already consumed + * above — every operand is a stack v128 (and one + * laneselect / madd takes 3 v128s, encoded + * implicitly via the stack). Fall through to + * `break` along with the no-immediate legacy + * default below. Listed explicitly here so a + * future SIMD-spec assignment to 0x100..0x113 + * doesn't silently reroute through the default + * branch. 
*/ + case SIMD_i8x16_relaxed_swizzle: + case SIMD_i32x4_relaxed_trunc_f32x4_s: + case SIMD_i32x4_relaxed_trunc_f32x4_u: + case SIMD_i32x4_relaxed_trunc_f64x2_s_zero: + case SIMD_i32x4_relaxed_trunc_f64x2_u_zero: + case SIMD_f32x4_relaxed_madd: + case SIMD_f32x4_relaxed_nmadd: + case SIMD_f64x2_relaxed_madd: + case SIMD_f64x2_relaxed_nmadd: + case SIMD_i8x16_relaxed_laneselect: + case SIMD_i16x8_relaxed_laneselect: + case SIMD_i32x4_relaxed_laneselect: + case SIMD_i64x2_relaxed_laneselect: + case SIMD_f32x4_relaxed_min: + case SIMD_f32x4_relaxed_max: + case SIMD_f64x2_relaxed_min: + case SIMD_f64x2_relaxed_max: + case SIMD_i16x8_relaxed_q15mulr_s: + case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s: + case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s: + break; +#endif /* WASM_ENABLE_RELAXED_SIMD */ + default: /* * since latest SIMD specific used almost every value @@ -16178,7 +16214,26 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, pb_read_leb_uint32(p, p_end, opcode1); #if WASM_ENABLE_FAST_INTERP != 0 +#if WASM_ENABLE_RELAXED_SIMD != 0 + /* Relaxed-SIMD sub-opcodes span 0x100..0x113, past + * the byte that the legacy emit uses. Widen the + * IR sub-opcode to a 2-byte little-endian uint16 + * for every SIMD op so dispatch can read a single + * stride and switch over the full 0x000..0x113 + * range. `wasm_loader_emit_int16` writes two + * consecutive bytes via STORE_U16 (no per-byte + * padding even on non-unaligned-access platforms), + * matching the `frame_ip[0] | (frame_ip[1] << 8)` + * decode in `HANDLE_OP(WASM_OP_SIMD_PREFIX)`. IR + * cost vs the legacy 1-byte emit: +1 byte per SIMD + * op on platforms with unaligned access, identical + * on platforms without (the legacy emit already + * burned a padding byte per opcode). 
*/ + wasm_loader_emit_int16(loader_ctx, (int16)opcode1); + LOG_OP("%d\t", opcode1); +#else emit_byte(loader_ctx, opcode1); +#endif #endif /* follow the order of enum WASMSimdEXTOpcode in wasm_opcode.h @@ -16853,6 +16908,62 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, break; } +#if WASM_ENABLE_RELAXED_SIMD != 0 + /* Relaxed-SIMD — type signatures from + * https://github.com/WebAssembly/relaxed-simd/blob/ + * main/proposals/relaxed-simd/Overview.md. + * + * unary (1 v128 -> 1 v128): all four trunc variants. + * binary (2 v128 -> 1 v128): swizzle, min/max, + * q15mulr, dot_i8x16_i7x16_s. + * ternary (3 v128 -> 1 v128): madd, nmadd, + * laneselect, dot_i8x16_i7x16_add_s. + * + * The 3-input shape is encoded as POP_V128 (one + * extra v128) + POP2_AND_PUSH (the standard + * 2-pop-1-push) — same pattern bitselect uses + * above so the loader's stack tracker doesn't + * need a new macro. */ + case SIMD_i32x4_relaxed_trunc_f32x4_s: + case SIMD_i32x4_relaxed_trunc_f32x4_u: + case SIMD_i32x4_relaxed_trunc_f64x2_s_zero: + case SIMD_i32x4_relaxed_trunc_f64x2_u_zero: + { + POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128); + break; + } + + case SIMD_i8x16_relaxed_swizzle: + case SIMD_f32x4_relaxed_min: + case SIMD_f32x4_relaxed_max: + case SIMD_f64x2_relaxed_min: + case SIMD_f64x2_relaxed_max: + case SIMD_i16x8_relaxed_q15mulr_s: + case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s: + { + POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128); + break; + } + + case SIMD_f32x4_relaxed_madd: + case SIMD_f32x4_relaxed_nmadd: + case SIMD_f64x2_relaxed_madd: + case SIMD_f64x2_relaxed_nmadd: + case SIMD_i8x16_relaxed_laneselect: + case SIMD_i16x8_relaxed_laneselect: + case SIMD_i32x4_relaxed_laneselect: + case SIMD_i64x2_relaxed_laneselect: + case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s: + { + /* Three v128 inputs: extra POP_V128 first, + * then standard 2-pop-1-push. Same shape as + * SIMD_v128_bitselect above. 
*/ + POP_V128(); + POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128); + break; + } +#endif /* WASM_ENABLE_RELAXED_SIMD */ + default: { if (error_buf != NULL) { diff --git a/core/iwasm/interpreter/wasm_opcode.h b/core/iwasm/interpreter/wasm_opcode.h index 1147384131..c94991baf3 100644 --- a/core/iwasm/interpreter/wasm_opcode.h +++ b/core/iwasm/interpreter/wasm_opcode.h @@ -701,6 +701,38 @@ typedef enum WASMSimdEXTOpcode { SIMD_i32x4_trunc_sat_f64x2_u_zero = 0xfd, SIMD_f64x2_convert_low_i32x4_s = 0xfe, SIMD_f64x2_convert_low_i32x4_u = 0xff, + +#if WASM_ENABLE_RELAXED_SIMD != 0 + /* Relaxed-SIMD proposal — standardized in WebAssembly 3.0. + * The spec uses the same `0xfd` SIMD prefix and reserves + * sub-opcodes 0x100..0x113. Listing the constants here lets + * the loader case-label them directly; the IR encoder/decoder + * widens the SIMD sub-opcode from 1 byte to 2 bytes when this + * macro is set (see the SIMD-prefix emit in wasm_loader.c and its + * decode in wasm_interp_fast.c). When WAMR_BUILD_RELAXED_SIMD=0 these + * constants disappear and the SIMD IR / dispatch is + * byte-identical to the legacy-SIMD-only build.
*/ + SIMD_i8x16_relaxed_swizzle = 0x100, + SIMD_i32x4_relaxed_trunc_f32x4_s = 0x101, + SIMD_i32x4_relaxed_trunc_f32x4_u = 0x102, + SIMD_i32x4_relaxed_trunc_f64x2_s_zero = 0x103, + SIMD_i32x4_relaxed_trunc_f64x2_u_zero = 0x104, + SIMD_f32x4_relaxed_madd = 0x105, + SIMD_f32x4_relaxed_nmadd = 0x106, + SIMD_f64x2_relaxed_madd = 0x107, + SIMD_f64x2_relaxed_nmadd = 0x108, + SIMD_i8x16_relaxed_laneselect = 0x109, + SIMD_i16x8_relaxed_laneselect = 0x10a, + SIMD_i32x4_relaxed_laneselect = 0x10b, + SIMD_i64x2_relaxed_laneselect = 0x10c, + SIMD_f32x4_relaxed_min = 0x10d, + SIMD_f32x4_relaxed_max = 0x10e, + SIMD_f64x2_relaxed_min = 0x10f, + SIMD_f64x2_relaxed_max = 0x110, + SIMD_i16x8_relaxed_q15mulr_s = 0x111, + SIMD_i16x8_relaxed_dot_i8x16_i7x16_s = 0x112, + SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s = 0x113, +#endif /* WASM_ENABLE_RELAXED_SIMD */ } WASMSimdEXTOpcode; typedef enum WASMAtomicEXTOpcode { diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index e51eb2c466..1942af117b 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -103,6 +103,7 @@ add_subdirectory(linux-perf) add_subdirectory(gc) add_subdirectory(unsupported-features) add_subdirectory(exception-handling) +add_subdirectory(relaxed-simd) add_subdirectory(running-modes) add_subdirectory(mem-alloc) diff --git a/tests/unit/relaxed-simd/CMakeLists.txt b/tests/unit/relaxed-simd/CMakeLists.txt new file mode 100644 index 0000000000..7c722b4d87 --- /dev/null +++ b/tests/unit/relaxed-simd/CMakeLists.txt @@ -0,0 +1,42 @@ +# Copyright (C) 2026 Intel Corporation. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +cmake_minimum_required(VERSION 3.14) + +project (test-relaxed-simd) + +add_definitions (-DRUN_ON_LINUX) + +add_definitions (-Dattr_container_malloc=malloc) +add_definitions (-Dattr_container_free=free) + +set (WAMR_BUILD_AOT 0) +set (WAMR_BUILD_INTERP 1) +set (WAMR_BUILD_FAST_INTERP 1) +set (WAMR_BUILD_JIT 0) +set (WAMR_BUILD_LIBC_WASI 0) +set (WAMR_BUILD_APP_FRAMEWORK 0) +set (WAMR_BUILD_SIMD 1) +set (WAMR_BUILD_RELAXED_SIMD 1) +set (WAMR_BUILD_BULK_MEMORY 1) +set (WAMR_BUILD_REF_TYPES 1) + +include (../unit_common.cmake) + +include_directories (${CMAKE_CURRENT_SOURCE_DIR}) +include_directories (${IWASM_DIR}/interpreter) + +file (GLOB_RECURSE source_all ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) + +set (UNIT_SOURCE ${source_all}) + +set (unit_test_sources + ${UNIT_SOURCE} + ${WAMR_RUNTIME_LIB_SOURCE} +) + +add_executable (relaxed_simd_test ${unit_test_sources}) + +target_link_libraries (relaxed_simd_test gtest_main) + +gtest_discover_tests(relaxed_simd_test) diff --git a/tests/unit/relaxed-simd/relaxed_simd_test.cc b/tests/unit/relaxed-simd/relaxed_simd_test.cc new file mode 100644 index 0000000000..d8d315d14d --- /dev/null +++ b/tests/unit/relaxed-simd/relaxed_simd_test.cc @@ -0,0 +1,473 @@ +/* + * Copyright (C) 2026 Intel Corporation. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + */ + +/* Gtest coverage for the fast-interp relaxed-SIMD opcode lowering + * gated by `WAMR_BUILD_RELAXED_SIMD=1`. Two angles: + * + * 1. Load-time validation — a module containing a relaxed-SIMD + * opcode loads cleanly (the loader's prepare_bytecode SIMD + * switch recognizes 0x100..0x113). Without commit 1 of the + * patch series the loader would reject with + * `"invalid opcode 0xfd 100"`. + * + * 2. Runtime dispatch — calling a function that executes + * `f32x4.relaxed_madd` returns the FMA-rounded result. 
The + * result encoding (bit patterns of f32 lanes 0 and 1, read as the low i64 + * of the v128 via `i64x2.extract_lane 0`) is bit-identical + * across aarch64/x86-64 because the inputs are exact under + * both single-rounded (hardware FMA) and double-rounded + * (split mul+add) semantics — every multiplication and + * addition is exactly representable in f32. + */ + +#include "gtest/gtest.h" +#include "wasm_runtime_common.h" +#include "bh_platform.h" + +class RelaxedSimdTest : public testing::Test +{ + protected: + virtual void SetUp() + { + memset(&init_args, 0, sizeof(RuntimeInitArgs)); + init_args.mem_alloc_type = Alloc_With_Pool; + init_args.mem_alloc_option.pool.heap_buf = global_heap_buf; + init_args.mem_alloc_option.pool.heap_size = sizeof(global_heap_buf); + ASSERT_EQ(wasm_runtime_full_init(&init_args), true); + } + + virtual void TearDown() { wasm_runtime_destroy(); } + + public: + char global_heap_buf[512 * 1024]; + RuntimeInitArgs init_args; + char error_buf[256]; +}; + +/* + * Minimal wasm module that exports a single `madd` function: + * + * (module + * (func (export "madd") (result i64) + * v128.const f32x4 1 2 3 4 + * v128.const f32x4 10 20 30 40 + * v128.const f32x4 100 200 300 400 + * f32x4.relaxed_madd ;; opcode 0xfd 0x85 0x02 (= 0x105) + * i64x2.extract_lane 0)) + * + * Bytes below are the raw output of `wasm-tools parse` on that WAT, + * inlined so the test has no wabt / wat-runtime dependency at run.
+ */ +static const uint8_t MADD_WASM[] = { + 0x00, 0x61, 0x73, 0x6D, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60, + 0x00, 0x01, 0x7E, 0x03, 0x02, 0x01, 0x00, 0x07, 0x08, 0x01, 0x04, 0x6D, + 0x61, 0x64, 0x64, 0x00, 0x00, 0x0A, 0x40, 0x01, 0x3E, 0x00, 0xFD, 0x0C, + 0x00, 0x00, 0x80, 0x3F, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, + 0x00, 0x00, 0x80, 0x40, 0xFD, 0x0C, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, + 0xA0, 0x41, 0x00, 0x00, 0xF0, 0x41, 0x00, 0x00, 0x20, 0x42, 0xFD, 0x0C, + 0x00, 0x00, 0xC8, 0x42, 0x00, 0x00, 0x48, 0x43, 0x00, 0x00, 0x96, 0x43, + 0x00, 0x00, 0xC8, 0x43, 0xFD, 0x85, 0x02, 0xFD, 0x1D, 0x00, 0x0B +}; + +TEST_F(RelaxedSimdTest, load_module_with_relaxed_madd) +{ + char err[128] = { 0 }; + /* The runtime API expects a mutable buffer (modifies in + * place during load); copy into a local stack buffer first. */ + uint8_t buf[sizeof(MADD_WASM)]; + memcpy(buf, MADD_WASM, sizeof(MADD_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) + << "load failed: " << err + << " — make sure WAMR_BUILD_RELAXED_SIMD=1 is set"; + wasm_runtime_unload(module); +} + +TEST_F(RelaxedSimdTest, invoke_relaxed_madd_returns_fma_result) +{ + char err[128] = { 0 }; + uint8_t buf[sizeof(MADD_WASM)]; + memcpy(buf, MADD_WASM, sizeof(MADD_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) << "load failed: " << err; + + wasm_module_inst_t inst = wasm_runtime_instantiate( + module, 32768u, 32768u, err, (uint32_t)sizeof(err)); + ASSERT_NE(inst, nullptr) << "instantiate failed: " << err; + + wasm_function_inst_t func = wasm_runtime_lookup_function(inst, "madd"); + ASSERT_NE(func, nullptr) << "export `madd` not found"; + + wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u); + ASSERT_NE(env, nullptr); + + uint32_t argv[2] = { 0, 0 }; + bool ok = wasm_runtime_call_wasm(env, func, 0, argv); +
EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst); + + /* + * Expected: f32x4.relaxed_madd((1,2,3,4), (10,20,30,40), + * (100,200,300,400)) + * = (1*10+100, 2*20+200, 3*30+300, 4*40+400) + * = (110, 240, 390, 560) + * + * As bit patterns: + * f32(110) = 0x42DC0000 + * f32(240) = 0x43700000 + * f32(390) = 0x43C30000 + * f32(560) = 0x440C0000 + * + * i64x2.extract_lane 0 packs lanes 0,1 of the v128 into the + * low i64: + * high i32 (argv[1]) = lane 1 = 0x43700000 + * low i32 (argv[0]) = lane 0 = 0x42DC0000 + * + * (Both single-rounded FMA hardware and split mul+add + * produce the same bit pattern here — every product and sum + * is exactly representable in f32.) + */ + EXPECT_EQ(argv[0], 0x42DC0000u); + EXPECT_EQ(argv[1], 0x43700000u); + + wasm_runtime_destroy_exec_env(env); + wasm_runtime_deinstantiate(inst); + wasm_runtime_unload(module); +} + +/* + * Regression test for the i16-intermediate truncation bug in + * `i32x4.relaxed_dot_i8x16_i7x16_add_s` flagged by the chatgpt- + * codex-connector code review on PR #3 (commit "fast-interp: + * i32x4.relaxed_dot_i8x16_i7x16_add_s preserve i16 intermediate"). + * + * (module + * (func (export "dot_add_i16_overflow") (result i64) + * v128.const i8x16 -128 -128 -128 -128 -128 -128 -128 -128 + * -128 -128 -128 -128 -128 -128 -128 -128 + * v128.const i8x16 -128 -128 -128 -128 -128 -128 -128 -128 + * -128 -128 -128 -128 -128 -128 -128 -128 + * v128.const i32x4 0 0 0 0 + * i32x4.relaxed_dot_i8x16_i7x16_add_s + * i64x2.extract_lane 0)) + * + * With a = b = 0x80 (i8 = -128) in all 16 bytes and c = 0, the + * spec-allowed result set is {-65536, -1, 65534} per lane (the + * three possible wrap/saturate combinations of the two pair + * sums). The pre-fix direct-sum impl produced 65536 — outside + * that set. The fix preserves the i16 truncation between the + * pair sum and the extadd_pairwise, producing -65536 per lane. 
+ * + * low i64 = (lane1 << 32) | lane0 = 0xffff0000_ffff0000 + */ +static const uint8_t DOT_ADD_OVERFLOW_WASM[] = { + 0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60, + 0x00, 0x01, 0x7e, 0x03, 0x02, 0x01, 0x00, 0x07, 0x18, 0x01, 0x14, 0x64, + 0x6f, 0x74, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x31, 0x36, 0x5f, 0x6f, + 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x00, 0x00, 0x0a, 0x40, 0x01, + 0x3e, 0x00, 0xfd, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xfd, 0x0c, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0xfd, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x93, 0x02, 0xfd, + 0x1d, 0x00, 0x0b +}; + +TEST_F(RelaxedSimdTest, dot_add_i16_intermediate_overflow_regression) +{ + char err[128] = { 0 }; + uint8_t buf[sizeof(DOT_ADD_OVERFLOW_WASM)]; + memcpy(buf, DOT_ADD_OVERFLOW_WASM, sizeof(DOT_ADD_OVERFLOW_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) << "load failed: " << err; + + wasm_module_inst_t inst = wasm_runtime_instantiate( + module, 32768u, 32768u, err, (uint32_t)sizeof(err)); + ASSERT_NE(inst, nullptr) << "instantiate failed: " << err; + + wasm_function_inst_t func = + wasm_runtime_lookup_function(inst, "dot_add_i16_overflow"); + ASSERT_NE(func, nullptr) << "export `dot_add_i16_overflow` not found"; + + wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u); + ASSERT_NE(env, nullptr); + + uint32_t argv[2] = { 0, 0 }; + bool ok = wasm_runtime_call_wasm(env, func, 0, argv); + EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst); + + /* Per-lane result: -65536 = 0xffff0000 (i32). 
i64x2.extract_lane 0 + * packs lanes 0 and 1, both = 0xffff0000: + * argv[0] (low i32) = 0xffff0000 + * argv[1] (high i32) = 0xffff0000 + * If anyone refactors the impl back to direct-sum, both lanes + * will be 0x00010000 (= 65536) and this test will fail. */ + EXPECT_EQ(argv[0], 0xffff0000u); + EXPECT_EQ(argv[1], 0xffff0000u); + + wasm_runtime_destroy_exec_env(env); + wasm_runtime_deinstantiate(inst); + wasm_runtime_unload(module); +} + +/* + * Pinning test for `i16x8.relaxed_dot_i8x16_i7x16_s` at the same + * i16-intermediate overflow boundary. The current impl correctly + * truncates to i16 via `result.i16x8[lane] = (int16)sum` in its + * wasm_interp_fast.c case body. Same input pattern (a = b = 0x80 + * everywhere); each i16 lane = (int16)32768 = -32768 = 0x8000. + * + * low i64 = four i16 lanes packed little-endian + * = 0x8000_8000_8000_8000 + * + * If a future refactor drops the (int16) cast in the sibling + * op, this test fires before the bug ships. + * + * (module + * (func (export "dot_s_i16_overflow_pin") (result i64) + * v128.const i8x16 -128 ...
(16x) + * i16x8.relaxed_dot_i8x16_i7x16_s + * i64x2.extract_lane 0)) + */ +static const uint8_t DOT_S_PIN_WASM[] = { + 0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60, + 0x00, 0x01, 0x7e, 0x03, 0x02, 0x01, 0x00, 0x07, 0x1a, 0x01, 0x16, 0x64, + 0x6f, 0x74, 0x5f, 0x73, 0x5f, 0x69, 0x31, 0x36, 0x5f, 0x6f, 0x76, 0x65, + 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x5f, 0x70, 0x69, 0x6e, 0x00, 0x00, 0x0a, + 0x2e, 0x01, 0x2c, 0x00, 0xfd, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xfd, 0x0c, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0xfd, 0x92, 0x02, 0xfd, 0x1d, 0x00, 0x0b +}; + +TEST_F(RelaxedSimdTest, dot_s_i16_overflow_pin_sibling_op) +{ + char err[128] = { 0 }; + uint8_t buf[sizeof(DOT_S_PIN_WASM)]; + memcpy(buf, DOT_S_PIN_WASM, sizeof(DOT_S_PIN_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) << "load failed: " << err; + + wasm_module_inst_t inst = wasm_runtime_instantiate( + module, 32768u, 32768u, err, (uint32_t)sizeof(err)); + ASSERT_NE(inst, nullptr) << "instantiate failed: " << err; + + wasm_function_inst_t func = + wasm_runtime_lookup_function(inst, "dot_s_i16_overflow_pin"); + ASSERT_NE(func, nullptr) << "export `dot_s_i16_overflow_pin` not found"; + + wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u); + ASSERT_NE(env, nullptr); + + uint32_t argv[2] = { 0, 0 }; + bool ok = wasm_runtime_call_wasm(env, func, 0, argv); + EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst); + + /* low i64 = four packed i16 lanes, all = (int16)32768 = -32768 + * = 0x8000_8000_8000_8000 + * argv[0] (low i32) = 0x80008000 + * argv[1] (high i32) = 0x80008000 */ + EXPECT_EQ(argv[0], 0x80008000u); + EXPECT_EQ(argv[1], 0x80008000u); + + wasm_runtime_destroy_exec_env(env); + wasm_runtime_deinstantiate(inst); + 
wasm_runtime_unload(module); +} + +/* + * Spec-allowed-set test for `i16x8.relaxed_q15mulr_s` at the + * INT16_MIN * INT16_MIN overflow boundary. + * + * (module + * (func (export "q15mulr_int16_min_squared") (result i64) + * v128.const i16x8 -32768 0 0 0 0 0 0 0 + * v128.const i16x8 -32768 0 0 0 0 0 0 0 + * i16x8.relaxed_q15mulr_s + * i64x2.extract_lane 0)) + * + * Q15 multiply-with-rounding: lane = sat_s((a*b + 0x4000) >> 15). + * For a = b = INT16_MIN: + * a*b = (-32768)*(-32768) = 0x40000000 + * + 0x4000 = 0x40004000 + * >> 15 = 0x8000 = 32768 (overflows i16) + * sat_s = 32767 = 0x7fff (saturate, IEEE/x86 PMULHRSW) + * wrap = (int16)32768 = 0x8000 (truncate, spec-allowed) + * + * The spec's relaxed clause permits either lowering, so the lane-0 + * value must be 0x7fff OR 0x8000. Lanes 1..7 are 0 (deterministic). + * Encoded as the low i64 (i64x2.extract_lane 0) the spec-allowed + * set is { 0x0000_0000_0000_7fff, 0x0000_0000_0000_8000 }. + * + * WAMR's hand-rolled lowering picks saturate (0x7fff); this test + * pins the choice via membership rather than exact equality, so a + * future switch to wrap (spec-allowed) does not break the test. 
+ */ +static const uint8_t Q15MULR_OVERFLOW_WASM[] = { + 0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, + 0x60, 0x00, 0x01, 0x7e, 0x03, 0x02, 0x01, 0x00, 0x07, 0x1d, 0x01, + 0x19, 0x71, 0x31, 0x35, 0x6d, 0x75, 0x6c, 0x72, 0x5f, 0x69, 0x6e, + 0x74, 0x31, 0x36, 0x5f, 0x6d, 0x69, 0x6e, 0x5f, 0x73, 0x71, 0x75, + 0x61, 0x72, 0x65, 0x64, 0x00, 0x00, 0x0a, 0x2e, 0x01, 0x2c, 0x00, + 0xfd, 0x0c, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x0c, 0x00, 0x80, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0xfd, 0x91, 0x02, 0xfd, 0x1d, 0x00, 0x0b +}; + +TEST_F(RelaxedSimdTest, q15mulr_int16_min_squared_either_sat_or_wrap) +{ + char err[128] = { 0 }; + uint8_t buf[sizeof(Q15MULR_OVERFLOW_WASM)]; + memcpy(buf, Q15MULR_OVERFLOW_WASM, sizeof(Q15MULR_OVERFLOW_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) << "load failed: " << err; + + wasm_module_inst_t inst = wasm_runtime_instantiate( + module, 32768u, 32768u, err, (uint32_t)sizeof(err)); + ASSERT_NE(inst, nullptr) << "instantiate failed: " << err; + + wasm_function_inst_t func = + wasm_runtime_lookup_function(inst, "q15mulr_int16_min_squared"); + ASSERT_NE(func, nullptr) << "export `q15mulr_int16_min_squared` not found"; + + wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u); + ASSERT_NE(env, nullptr); + + uint32_t argv[2] = { 0, 0 }; + bool ok = wasm_runtime_call_wasm(env, func, 0, argv); + EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst); + + /* Lanes 1..3 must be 0 (deterministic). Encoded in argv: lanes + * 1..3 occupy bits 16..63 of the 64-bit packed result. 
+ * argv[0] (low i32) = (lane1 << 16) | lane0 + * argv[1] (high i32) = (lane3 << 16) | lane2 */ + EXPECT_EQ(argv[1], 0u) << "lanes 2,3 must be zero"; + EXPECT_EQ((argv[0] >> 16) & 0xffffu, 0u) << "lane 1 must be zero"; + + /* Lane 0 = low 16 bits of argv[0]: either 0x7fff (sat) or + * 0x8000 (wrap). Both spec-conformant per the relaxed-SIMD + * implementation-defined clause for q15mulr_s. */ + uint32_t lane0 = argv[0] & 0xffffu; + EXPECT_TRUE(lane0 == 0x7fffu || lane0 == 0x8000u) + << "lane 0 = 0x" << std::hex << lane0 + << ", expected 0x7fff (saturate) or 0x8000 (wrap)"; + + wasm_runtime_destroy_exec_env(env); + wasm_runtime_deinstantiate(inst); + wasm_runtime_unload(module); +} + +/* + * Spec-allowed-set test for `f32x4.relaxed_madd` at the + * (Inf * 0 + c) invalid-multiply boundary. + * + * (module + * (func (export "madd_inf_times_zero_lo") (result i64) + * v128.const f32x4 inf inf inf inf + * v128.const f32x4 0 0 0 0 + * v128.const f32x4 1.0 2.0 3.0 4.0 + * f32x4.relaxed_madd + * i64x2.extract_lane 0) + * (func (export "madd_inf_times_zero_hi") (result i64) ;; lane 1) + * + * IEEE 754 §7.2: Inf × 0 is an invalid operation and produces NaN + * (regardless of the subsequent add of `c`). Both fused-multiply- + * add (`fma(Inf, 0, c)`) and unfused (`Inf * 0 + c`) lowerings of + * relaxed_madd produce a NaN here — so the choice between them + * doesn't affect the *kind* of result, only its specific bit + * pattern. The relaxed-SIMD spec leaves the NaN bit pattern + * implementation-defined, so the test checks the IEEE-754 NaN + * predicate (exponent all-ones, fraction non-zero) per lane + * rather than an exact bit pattern. + * + * This case is the relevant adversarial input for "do we + * propagate NaN through the FMA path correctly when one of the + * inputs is +Inf and another is +0?" — exactly the kind of + * boundary the spec test set doesn't explicitly cover. 
+ */ +static const uint8_t MADD_INF_TIMES_ZERO_WASM[] = { + 0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60, + 0x00, 0x01, 0x7e, 0x03, 0x03, 0x02, 0x00, 0x00, 0x07, 0x33, 0x02, 0x16, + 0x6d, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6e, 0x66, 0x5f, 0x74, 0x69, 0x6d, + 0x65, 0x73, 0x5f, 0x7a, 0x65, 0x72, 0x6f, 0x5f, 0x6c, 0x6f, 0x00, 0x00, + 0x16, 0x6d, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6e, 0x66, 0x5f, 0x74, 0x69, + 0x6d, 0x65, 0x73, 0x5f, 0x7a, 0x65, 0x72, 0x6f, 0x5f, 0x68, 0x69, 0x00, + 0x01, 0x0a, 0x7f, 0x02, 0x3e, 0x00, 0xfd, 0x0c, 0x00, 0x00, 0x80, 0x7f, + 0x00, 0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f, + 0xfd, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x0c, 0x00, 0x00, 0x80, 0x3f, + 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, 0x00, 0x80, 0x40, + 0xfd, 0x85, 0x02, 0xfd, 0x1d, 0x00, 0x0b, 0x3e, 0x00, 0xfd, 0x0c, 0x00, + 0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f, 0x00, + 0x00, 0x80, 0x7f, 0xfd, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x0c, 0x00, + 0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, + 0x00, 0x80, 0x40, 0xfd, 0x85, 0x02, 0xfd, 0x1d, 0x01, 0x0b +}; + +/* Helper: true iff the f32 bit pattern is any NaN + * (exponent = 0xff, fraction != 0). 
*/ +static bool +f32_bits_are_nan(uint32_t bits) +{ + uint32_t exp = (bits >> 23) & 0xff; + uint32_t frac = bits & 0x7fffff; + return exp == 0xff && frac != 0u; +} + +TEST_F(RelaxedSimdTest, madd_inf_times_zero_propagates_nan) +{ + char err[128] = { 0 }; + uint8_t buf[sizeof(MADD_INF_TIMES_ZERO_WASM)]; + memcpy(buf, MADD_INF_TIMES_ZERO_WASM, sizeof(MADD_INF_TIMES_ZERO_WASM)); + + wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err, + (uint32_t)sizeof(err)); + ASSERT_NE(module, nullptr) << "load failed: " << err; + + wasm_module_inst_t inst = wasm_runtime_instantiate( + module, 32768u, 32768u, err, (uint32_t)sizeof(err)); + ASSERT_NE(inst, nullptr) << "instantiate failed: " << err; + + wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u); + ASSERT_NE(env, nullptr); + + /* Call the lo half (lanes 0,1) then the hi half (lanes 2,3); + * each call returns one i64 packing two f32 lanes: + * argv[0] = lane2k bits, argv[1] = lane2k+1 bits */ + for (uint32_t half = 0; half < 2; half++) { + const char *name = + half == 0 ? "madd_inf_times_zero_lo" : "madd_inf_times_zero_hi"; + wasm_function_inst_t func = wasm_runtime_lookup_function(inst, name); + ASSERT_NE(func, nullptr) << "export `" << name << "` not found"; + + uint32_t argv[2] = { 0, 0 }; + bool ok = wasm_runtime_call_wasm(env, func, 0, argv); + EXPECT_TRUE(ok) << "call_wasm `" << name + << "` failed: " << wasm_runtime_get_exception(inst); + + EXPECT_TRUE(f32_bits_are_nan(argv[0])) + << name << " lane " << (2 * half) << " not NaN: bits = 0x" + << std::hex << argv[0]; + EXPECT_TRUE(f32_bits_are_nan(argv[1])) + << name << " lane " << (2 * half + 1) << " not NaN: bits = 0x" + << std::hex << argv[1]; + } + + wasm_runtime_destroy_exec_env(env); + wasm_runtime_deinstantiate(inst); + wasm_runtime_unload(module); +}