From 7562cf8d4b77552f16c5600e688a51d91f859f33 Mon Sep 17 00:00:00 2001
From: Matt Hargett <plaztiksyke@gmail.com>
Date: Mon, 18 May 2026 23:00:40 -0700
Subject: [PATCH 1/9] fast-interp: extend WASMSimdEXTOpcode + loader validation
 for relaxed-SIMD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The relaxed-SIMD proposal — standardized as part of WebAssembly 3.0 —
uses the same 0xfd SIMD prefix and reserves sub-opcodes
`0x100..0x113` for its 20 new ops:

  relaxed_swizzle,
  relaxed_trunc_f32x4_{s,u}, relaxed_trunc_f64x2_{s,u}_zero,
  relaxed_madd / relaxed_nmadd for f32x4 + f64x2,
  relaxed_laneselect for i8 / i16 / i32 / i64,
  relaxed_min / relaxed_max for f32x4 + f64x2,
  relaxed_q15mulr_s,
  relaxed_dot_i8x16_i7x16_{s,add_s}.

This commit adds the loader-side validation needed to *recognize*
these opcodes without changing dispatch / runtime behaviour:

  * `WASMSimdEXTOpcode` enum (wasm_opcode.h) extended with the 20
    new constants at the spec-assigned values 0x100..0x113. Gated
    behind `WASM_ENABLE_RELAXED_SIMD != 0` so a build without the
    cmake flag (added in a follow-up commit) sees no new symbols
    and the enum's storage is unchanged.

  * `wasm_loader_find_block_addr` SIMD-prefix immediate skipper
    (wasm_loader.c:8273-8363) — the inner switch is now on the
    raw LEB-uint32 sub-opcode instead of the `(uint8)` cast, so
    relaxed-SIMD sub-opcodes 0x100..0x113 reach their own case
    labels instead of aliasing into legacy slots 0x00..0x13 and
    triggering wrong `skip_leb_*` paths. Relaxed-SIMD opcodes
    carry no immediates beyond the prefix, so the new cases just
    `break` — listed explicitly so a future SIMD-spec assignment
    in 0x100..0x113 doesn't fall through the default branch and
    silently mis-skip an immediate. Cast assignment to
    the outer `opcode` variable removed since it's no longer
    used by the inner switch (the outer-function switch already
    matched `WASM_OP_SIMD_PREFIX` and is inside that case).

  * `wasm_loader_prepare_bytecode` SIMD-prefix type checker
    (wasm_loader.c:16186+) — extended with type-signature case
    labels for each relaxed-SIMD opcode. Three signature classes:

      unary  (1 v128 -> 1 v128): the four trunc variants.
      binary (2 v128 -> 1 v128): swizzle, min/max, q15mulr,
                                 dot_i8x16_i7x16_s.
      ternary(3 v128 -> 1 v128): madd, nmadd, laneselect,
                                 dot_i8x16_i7x16_add_s.

    The 3-input ternary shape uses `POP_V128()` + `POP2_AND_PUSH`,
    mirroring how `SIMD_v128_bitselect` handles its 3-input shape
    today — no new stack-tracker macro needed.

  * The trailing `default:` branch in the type checker keeps
    rejecting unrecognized SIMD sub-opcodes with
    `"invalid opcode 0xfd %02x."`, which now correctly surfaces
    the full uint32 value (relaxed-SIMD opcodes 0x100+ are
    rendered as e.g. `0xfd 100` — the `%02x` width is a minimum,
    not a truncation).

The runtime executor (the actual case bodies in
`HANDLE_OP(WASM_OP_SIMD_PREFIX)` and the IR encoder widening
needed to reach them past the existing 1-byte sub-opcode read)
is the follow-up commit. Cmake `WAMR_BUILD_RELAXED_SIMD` flag
that flips `WASM_ENABLE_RELAXED_SIMD=1` is the third commit.
Built clean against `cd390ea0` with the flag absent — no
binary or behavioural change to existing SIMD code.

References:
  https://github.com/WebAssembly/relaxed-simd/blob/main/proposals/relaxed-simd/Overview.md
  https://github.com/WebAssembly/relaxed-simd/blob/main/proposals/relaxed-simd/_md/instructions.md
---
 core/iwasm/interpreter/wasm_loader.c | 100 +++++++++++++++++++++++++--
 core/iwasm/interpreter/wasm_opcode.h |  32 +++++++++
 2 files changed, 128 insertions(+), 4 deletions(-)

diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c
index a2c67bea2c..e0d2d97eca 100644
--- a/core/iwasm/interpreter/wasm_loader.c
+++ b/core/iwasm/interpreter/wasm_loader.c
@@ -8275,13 +8275,15 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache,
                 uint32 opcode1;
 
                 read_leb_uint32(p, p_end, opcode1);
-                /* opcode1 was checked in wasm_loader_prepare_bytecode and
-                   is no larger than UINT8_MAX */
-                opcode = (uint8)opcode1;
+                /* opcode1 was checked in wasm_loader_prepare_bytecode.
+                 * Legacy SIMD opcodes fit in a uint8 (0x00..0xff);
+                 * relaxed-SIMD opcodes (gated below) span 0x100..0x113.
+                 * Switch on the uint32 directly so both ranges are
+                 * reachable by their enum names. */
 
                 /* follow the order of enum WASMSimdEXTOpcode in wasm_opcode.h
                  */
-                switch (opcode) {
+                switch (opcode1) {
                     case SIMD_v128_load:
                     case SIMD_v128_load8x8_s:
                     case SIMD_v128_load8x8_u:
@@ -8351,6 +8353,40 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache,
                         skip_leb_mem_offset(p, p_end);
                         break;
 
+#if WASM_ENABLE_RELAXED_SIMD != 0
+                    /* Relaxed-SIMD opcodes carry no immediates beyond
+                     * the LEB-encoded sub-opcode already consumed
+                     * above — every operand is a v128 taken from the
+                     * stack (three of them for the madd / nmadd /
+                     * laneselect / dot_add ops), so there is nothing
+                     * to skip here and the cases just `break`, like
+                     * the no-immediate legacy default below. Listed
+                     * explicitly so that any sub-opcode later added
+                     * to this range is handled deliberately rather
+                     * than silently landing in the default branch. */
+                    case SIMD_i8x16_relaxed_swizzle:
+                    case SIMD_i32x4_relaxed_trunc_f32x4_s:
+                    case SIMD_i32x4_relaxed_trunc_f32x4_u:
+                    case SIMD_i32x4_relaxed_trunc_f64x2_s_zero:
+                    case SIMD_i32x4_relaxed_trunc_f64x2_u_zero:
+                    case SIMD_f32x4_relaxed_madd:
+                    case SIMD_f32x4_relaxed_nmadd:
+                    case SIMD_f64x2_relaxed_madd:
+                    case SIMD_f64x2_relaxed_nmadd:
+                    case SIMD_i8x16_relaxed_laneselect:
+                    case SIMD_i16x8_relaxed_laneselect:
+                    case SIMD_i32x4_relaxed_laneselect:
+                    case SIMD_i64x2_relaxed_laneselect:
+                    case SIMD_f32x4_relaxed_min:
+                    case SIMD_f32x4_relaxed_max:
+                    case SIMD_f64x2_relaxed_min:
+                    case SIMD_f64x2_relaxed_max:
+                    case SIMD_i16x8_relaxed_q15mulr_s:
+                    case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s:
+                    case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s:
+                        break;
+#endif /* WASM_ENABLE_RELAXED_SIMD */
+
                     default:
                         /*
                          * since latest SIMD specific used almost every value
@@ -16853,6 +16889,62 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func,
                         break;
                     }
 
+#if WASM_ENABLE_RELAXED_SIMD != 0
+                    /* Relaxed-SIMD — type signatures from
+                     * https://github.com/WebAssembly/relaxed-simd/blob/
+                     * main/proposals/relaxed-simd/Overview.md.
+                     *
+                     *  unary (1 v128 -> 1 v128): all four trunc variants.
+                     *  binary (2 v128 -> 1 v128): swizzle, min/max,
+                     *      q15mulr, dot_i8x16_i7x16_s.
+                     *  ternary (3 v128 -> 1 v128): madd, nmadd,
+                     *      laneselect, dot_i8x16_i7x16_add_s.
+                     *
+                     * The 3-input shape is encoded as POP_V128 (one
+                     * extra v128) + POP2_AND_PUSH (the standard
+                     * 2-pop-1-push) — same pattern bitselect uses
+                     * above so the loader's stack tracker doesn't
+                     * need a new macro. */
+                    case SIMD_i32x4_relaxed_trunc_f32x4_s:
+                    case SIMD_i32x4_relaxed_trunc_f32x4_u:
+                    case SIMD_i32x4_relaxed_trunc_f64x2_s_zero:
+                    case SIMD_i32x4_relaxed_trunc_f64x2_u_zero:
+                    {
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+                    }
+
+                    case SIMD_i8x16_relaxed_swizzle:
+                    case SIMD_f32x4_relaxed_min:
+                    case SIMD_f32x4_relaxed_max:
+                    case SIMD_f64x2_relaxed_min:
+                    case SIMD_f64x2_relaxed_max:
+                    case SIMD_i16x8_relaxed_q15mulr_s:
+                    case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s:
+                    {
+                        POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+                    }
+
+                    case SIMD_f32x4_relaxed_madd:
+                    case SIMD_f32x4_relaxed_nmadd:
+                    case SIMD_f64x2_relaxed_madd:
+                    case SIMD_f64x2_relaxed_nmadd:
+                    case SIMD_i8x16_relaxed_laneselect:
+                    case SIMD_i16x8_relaxed_laneselect:
+                    case SIMD_i32x4_relaxed_laneselect:
+                    case SIMD_i64x2_relaxed_laneselect:
+                    case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s:
+                    {
+                        /* Three v128 inputs: extra POP_V128 first,
+                         * then standard 2-pop-1-push. Same shape as
+                         * SIMD_v128_bitselect above. */
+                        POP_V128();
+                        POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+                    }
+#endif /* WASM_ENABLE_RELAXED_SIMD */
+
                     default:
                     {
                         if (error_buf != NULL) {
diff --git a/core/iwasm/interpreter/wasm_opcode.h b/core/iwasm/interpreter/wasm_opcode.h
index 1147384131..c94991baf3 100644
--- a/core/iwasm/interpreter/wasm_opcode.h
+++ b/core/iwasm/interpreter/wasm_opcode.h
@@ -701,6 +701,38 @@ typedef enum WASMSimdEXTOpcode {
     SIMD_i32x4_trunc_sat_f64x2_u_zero = 0xfd,
     SIMD_f64x2_convert_low_i32x4_s = 0xfe,
     SIMD_f64x2_convert_low_i32x4_u = 0xff,
+
+#if WASM_ENABLE_RELAXED_SIMD != 0
+    /* Relaxed-SIMD proposal — standardized in WebAssembly 3.0.
+     * The spec uses the same `0xfd` SIMD prefix and reserves
+     * sub-opcodes 0x100..0x113. Listing the constants here lets
+     * the loader case-label them directly; the IR encoder/decoder
+     * widens the SIMD sub-opcode from 1 byte to 2 bytes when this
+     * macro is set (see emit / GET_OPCODE in wasm_loader.c and
+     * wasm_interp_fast.c). When WAMR_BUILD_RELAXED_SIMD=0 these
+     * constants disappear and the SIMD IR / dispatch is
+     * byte-identical to the legacy-SIMD-only build. */
+    SIMD_i8x16_relaxed_swizzle = 0x100,
+    SIMD_i32x4_relaxed_trunc_f32x4_s = 0x101,
+    SIMD_i32x4_relaxed_trunc_f32x4_u = 0x102,
+    SIMD_i32x4_relaxed_trunc_f64x2_s_zero = 0x103,
+    SIMD_i32x4_relaxed_trunc_f64x2_u_zero = 0x104,
+    SIMD_f32x4_relaxed_madd = 0x105,
+    SIMD_f32x4_relaxed_nmadd = 0x106,
+    SIMD_f64x2_relaxed_madd = 0x107,
+    SIMD_f64x2_relaxed_nmadd = 0x108,
+    SIMD_i8x16_relaxed_laneselect = 0x109,
+    SIMD_i16x8_relaxed_laneselect = 0x10a,
+    SIMD_i32x4_relaxed_laneselect = 0x10b,
+    SIMD_i64x2_relaxed_laneselect = 0x10c,
+    SIMD_f32x4_relaxed_min = 0x10d,
+    SIMD_f32x4_relaxed_max = 0x10e,
+    SIMD_f64x2_relaxed_min = 0x10f,
+    SIMD_f64x2_relaxed_max = 0x110,
+    SIMD_i16x8_relaxed_q15mulr_s = 0x111,
+    SIMD_i16x8_relaxed_dot_i8x16_i7x16_s = 0x112,
+    SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s = 0x113,
+#endif /* WASM_ENABLE_RELAXED_SIMD */
 } WASMSimdEXTOpcode;
 
 typedef enum WASMAtomicEXTOpcode {

From 20f2825f2d49f191b7bc01d73ae17e358126761c Mon Sep 17 00:00:00 2001
From: Matt Hargett <plaztiksyke@gmail.com>
Date: Mon, 18 May 2026 23:08:51 -0700
Subject: [PATCH 2/9] fast-interp: runtime cases for relaxed-SIMD opcodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 20 relaxed-SIMD ops (`0x100..0x113`) land as new case bodies
inside the existing `HANDLE_OP(WASM_OP_SIMD_PREFIX)` switch in
`wasm_interp_fast.c`. Each case follows the legacy SIMD-case
shape: pop the v128 operand(s) from `frame_lp`, hand them to a
SIMDe intrinsic (or a hand lane loop for the three SIMDe-missing
ops), write one v128 result.

To reach a case past 0xff the SIMD sub-opcode is widened from a
single byte to a little-endian uint16 in the IR. The loader emits
two consecutive bytes via `wasm_loader_emit_int16` (STORE_U16, no
padding even on platforms without unaligned access). The runtime
reads `frame_ip[0] | (frame_ip[1] << 8)` and switches over the
full `0x000..0x113` range. The widening is conditional on
`WASM_ENABLE_RELAXED_SIMD != 0`; when off, the IR is still
1-byte-per-SIMD-op via `emit_byte` and the runtime dispatch is
the legacy `GET_OPCODE()` 1-byte read — byte-identical to
upstream.

Per-case dispatch:

  i8x16.relaxed_swizzle                  (binary)       SIMD_DOUBLE_OP
  i32x4.relaxed_trunc_f32x4_{s,u},
  i32x4.relaxed_trunc_f64x2_{s,u}_zero   (4 unary)      SIMD_SINGLE_OP
  {f32x4,f64x2}.relaxed_{madd,nmadd}     (4 ternary)    SIMD_TRIPLE_OP
  {i8x16,i16x8,i32x4,i64x2}.relaxed_laneselect
                                         (4 ternary)    SIMD_TRIPLE_OP
  {f32x4,f64x2}.relaxed_{min,max}        (4 binary)     SIMD_DOUBLE_OP
  i16x8.relaxed_q15mulr_s                (binary)       hand loop
  i16x8.relaxed_dot_i8x16_i7x16_s        (binary)       hand loop
  i32x4.relaxed_dot_i8x16_i7x16_add_s    (ternary)      hand loop

SIMDe's `simde/wasm/relaxed-simd.h` (already shipped in
`core/deps/simde`) provides 17 of the 20 intrinsics; q15mulr_s,
dot_i8x16_i7x16_s, and dot_i8x16_i7x16_add_s are missing so the
dispatch loop inlines a per-lane C implementation. The relaxed-
SIMD spec allows implementation-defined behavior on overflow for
those three (wrap vs. saturate). The q15mulr_s impl here
saturates — same as the non-relaxed q15mulr_sat_s — while the two
dot variants wrap: dot_i8x16_i7x16_s truncates each pairwise sum
to i16, and dot_i8x16_i7x16_add_s does a wrapping i32 add of the
accumulator. Both choices are conformant.

A new local `SIMD_TRIPLE_OP(simde_func)` macro pops 3 v128s and
hands them to a 3-arg intrinsic; same shape as `SIMD_DOUBLE_OP` /
`SIMD_SINGLE_OP` for two- and one-arg ops. `#undef`-ed at the end
of the gated block so the macro doesn't leak into the legacy
build.

Smoke tested via a 6-op WAT module (swizzle, madd, min,
laneselect, q15mulr_s, trunc_f32x4_s) compiled to wasm and run
through the `iwasm` CLI with `WAMR_BUILD_RELAXED_SIMD=1`:

  madd        = [110, 240, 390, 560]            ✓
  trunc_f32   = [1, -2, 3, -4]                   ✓
  min         = [1, 2, 2, 1]                     ✓
  q15mulr     = [0,0,1,1,3,4,6,-7]               ✓
  swizzle     = [15..0] (reverse)                 ✓
  laneselect  = (bitwise a/b mux per mask)       ✓

The `wasm_loader_prepare_bytecode` SIMD switch type checker
(commit 1) is already populated for the new opcodes, so the
relaxed-SIMD wasm validates through the loader and then reaches
the new dispatch cases here. The cmake flag that exposes the
feature (`WAMR_BUILD_RELAXED_SIMD`) is the next commit; this one
adds the runtime side gated on the eventual macro.
---
 core/iwasm/interpreter/wasm_interp_fast.c | 232 +++++++++++++++++++++-
 core/iwasm/interpreter/wasm_loader.c      |  19 ++
 2 files changed, 250 insertions(+), 1 deletion(-)

diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c
index 937a7fdecf..61b705fae8 100644
--- a/core/iwasm/interpreter/wasm_interp_fast.c
+++ b/core/iwasm/interpreter/wasm_interp_fast.c
@@ -23,6 +23,16 @@
 
 #if WASM_ENABLE_SIMDE != 0
 #include "simde/wasm/simd128.h"
+#if WASM_ENABLE_RELAXED_SIMD != 0
+/* SIMDe ships relaxed-SIMD intrinsics in a separate header — pull
+ * them in only when the cmake flag asks for it so legacy-SIMD-only
+ * builds don't drag in extra inline definitions. The header
+ * itself is self-contained (depends on simd128.h above) and
+ * provides 17 of the 20 relaxed-SIMD ops; q15mulr_s and the two
+ * i8x16_i7x16 dot variants are hand-written in the dispatch
+ * loop. */
+#include "simde/wasm/relaxed-simd.h"
+#endif
 #endif
 
 typedef int32 CellType_I32;
@@ -5886,9 +5896,31 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
 
             HANDLE_OP(WASM_OP_SIMD_PREFIX)
             {
+                /* Relaxed-SIMD sub-opcodes span 0x100..0x113 (spec
+                 * reserves this range under the same 0xfd prefix).
+                 * When `WAMR_BUILD_RELAXED_SIMD=1` the loader widens
+                 * the SIMD sub-opcode in the IR from one byte to a
+                 * 2-byte little-endian uint16 (see the
+                 * `wasm_loader_emit_int16` call in
+                 * `wasm_loader_prepare_bytecode`'s SIMD case), and
+                 * the runtime reads two bytes here to match. When
+                 * the flag is off the legacy `GET_OPCODE()` 1-byte
+                 * path is taken and dispatch / IR layout are
+                 * byte-identical to the upstream interpreter. The
+                 * existing `case SIMD_v128_load..._u`-style labels
+                 * are valid 32-bit case constants either way, so
+                 * no per-case change is needed for the legacy
+                 * opcodes. */
+                uint32 simd_op;
+#if WASM_ENABLE_RELAXED_SIMD != 0
+                simd_op = (uint32)frame_ip[0] | ((uint32)frame_ip[1] << 8);
+                frame_ip += 2;
+#else
                 GET_OPCODE();
+                simd_op = opcode;
+#endif
 
-                switch (opcode) {
+                switch (simd_op) {
                     /* Memory */
                     case SIMD_v128_load:
                     {
@@ -7429,6 +7461,204 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
                         break;
                     }
 
+#if WASM_ENABLE_RELAXED_SIMD != 0
+                    /* Relaxed-SIMD case bodies — same shape as the legacy SIMD
+                     * cases above. Each one pops its v128 operands from
+                     * frame_lp via POP_V128, hands them to the SIMDe (or
+                     * hand-written) intrinsic, and writes the v128 result to
+                     * `addr_ret = GET_OFFSET()`. The `wasm_…relaxed_…`
+                     * intrinsic family in `core/deps/simde/wasm/relaxed-simd.h`
+                     * covers 17 of the 20 opcodes; q15mulr_s and the two i7x16
+                     * dot variants are hand-emulated below since SIMDe doesn't
+                     * ship them. */
+
+#define SIMD_TRIPLE_OP(simde_func)                                           \
+    do {                                                                     \
+        V128 v3 = POP_V128();                                                \
+        V128 v2 = POP_V128();                                                \
+        V128 v1 = POP_V128();                                                \
+        addr_ret = GET_OFFSET();                                             \
+        simde_v128_t simde_result = simde_func(SIMD_V128_TO_SIMDE_V128(v1),  \
+                                               SIMD_V128_TO_SIMDE_V128(v2),  \
+                                               SIMD_V128_TO_SIMDE_V128(v3)); \
+        V128 result;                                                         \
+        SIMDE_V128_TO_SIMD_V128(simde_result, result);                       \
+        PUT_V128_TO_ADDR(frame_lp + addr_ret, result);                       \
+    } while (0)
+
+                    case SIMD_i8x16_relaxed_swizzle:
+                    {
+                        SIMD_DOUBLE_OP(simde_wasm_i8x16_relaxed_swizzle);
+                        break;
+                    }
+                    case SIMD_i32x4_relaxed_trunc_f32x4_s:
+                    {
+                        SIMD_SINGLE_OP(simde_wasm_i32x4_relaxed_trunc_f32x4);
+                        break;
+                    }
+                    case SIMD_i32x4_relaxed_trunc_f32x4_u:
+                    {
+                        SIMD_SINGLE_OP(simde_wasm_u32x4_relaxed_trunc_f32x4);
+                        break;
+                    }
+                    case SIMD_i32x4_relaxed_trunc_f64x2_s_zero:
+                    {
+                        SIMD_SINGLE_OP(
+                            simde_wasm_i32x4_relaxed_trunc_f64x2_zero);
+                        break;
+                    }
+                    case SIMD_i32x4_relaxed_trunc_f64x2_u_zero:
+                    {
+                        SIMD_SINGLE_OP(
+                            simde_wasm_u32x4_relaxed_trunc_f64x2_zero);
+                        break;
+                    }
+                    case SIMD_f32x4_relaxed_madd:
+                    {
+                        SIMD_TRIPLE_OP(simde_wasm_f32x4_relaxed_madd);
+                        break;
+                    }
+                    case SIMD_f32x4_relaxed_nmadd:
+                    {
+                        SIMD_TRIPLE_OP(simde_wasm_f32x4_relaxed_nmadd);
+                        break;
+                    }
+                    case SIMD_f64x2_relaxed_madd:
+                    {
+                        SIMD_TRIPLE_OP(simde_wasm_f64x2_relaxed_madd);
+                        break;
+                    }
+                    case SIMD_f64x2_relaxed_nmadd:
+                    {
+                        SIMD_TRIPLE_OP(simde_wasm_f64x2_relaxed_nmadd);
+                        break;
+                    }
+                    case SIMD_i8x16_relaxed_laneselect:
+                    {
+                        SIMD_TRIPLE_OP(simde_wasm_i8x16_relaxed_laneselect);
+                        break;
+                    }
+                    case SIMD_i16x8_relaxed_laneselect:
+                    {
+                        SIMD_TRIPLE_OP(simde_wasm_i16x8_relaxed_laneselect);
+                        break;
+                    }
+                    case SIMD_i32x4_relaxed_laneselect:
+                    {
+                        SIMD_TRIPLE_OP(simde_wasm_i32x4_relaxed_laneselect);
+                        break;
+                    }
+                    case SIMD_i64x2_relaxed_laneselect:
+                    {
+                        SIMD_TRIPLE_OP(simde_wasm_i64x2_relaxed_laneselect);
+                        break;
+                    }
+                    case SIMD_f32x4_relaxed_min:
+                    {
+                        SIMD_DOUBLE_OP(simde_wasm_f32x4_relaxed_min);
+                        break;
+                    }
+                    case SIMD_f32x4_relaxed_max:
+                    {
+                        SIMD_DOUBLE_OP(simde_wasm_f32x4_relaxed_max);
+                        break;
+                    }
+                    case SIMD_f64x2_relaxed_min:
+                    {
+                        SIMD_DOUBLE_OP(simde_wasm_f64x2_relaxed_min);
+                        break;
+                    }
+                    case SIMD_f64x2_relaxed_max:
+                    {
+                        SIMD_DOUBLE_OP(simde_wasm_f64x2_relaxed_max);
+                        break;
+                    }
+                    case SIMD_i16x8_relaxed_q15mulr_s:
+                    {
+                        /* SIMDe doesn't expose an `i16x8_q15mulr` —
+                         * emulate lane-wise. The relaxed flavor is
+                         * allowed to skip saturation on overflow,
+                         * but matching the strict-saturating shape
+                         * here is conformant; it only costs the
+                         * per-lane clamp that the spec would have
+                         * allowed us to omit. Same body as a
+                         * hand-written non-relaxed q15mulr_sat_s. */
+                        V128 v2 = POP_V128();
+                        V128 v1 = POP_V128();
+                        V128 result;
+                        uint32 lane;
+                        addr_ret = GET_OFFSET();
+                        for (lane = 0; lane < 8; lane++) {
+                            int32 prod =
+                                (int32)v1.i16x8[lane] * (int32)v2.i16x8[lane];
+                            int32 rounded = (prod + 0x4000) >> 15;
+                            if (rounded > 0x7fff)
+                                rounded = 0x7fff;
+                            else if (rounded < -0x8000)
+                                rounded = -0x8000;
+                            result.i16x8[lane] = (int16)rounded;
+                        }
+                        PUT_V128_TO_ADDR(frame_lp + addr_ret, result);
+                        break;
+                    }
+                    case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s:
+                    {
+                        /* i16x8.relaxed_dot_i8x16_i7x16_s(a, b):
+                         * pairwise i16 sum of two adjacent i8*i8
+                         * products. b's lanes are meant to hold i7
+                         * values; when a lane's top bit is set the
+                         * result is impl-defined, and this loop
+                         * simply treats b as signed i8, like a.
+                         * No SIMDe intrinsic — hand lane loop. */
+                        V128 v2 = POP_V128();
+                        V128 v1 = POP_V128();
+                        V128 result;
+                        uint32 lane;
+                        addr_ret = GET_OFFSET();
+                        for (lane = 0; lane < 8; lane++) {
+                            int32 lo = (int32)v1.i8x16[2 * lane]
+                                       * (int32)v2.i8x16[2 * lane];
+                            int32 hi = (int32)v1.i8x16[2 * lane + 1]
+                                       * (int32)v2.i8x16[2 * lane + 1];
+                            int32 sum = lo + hi;
+                            /* i16-wrap on overflow — spec allows
+                             * either wrap or saturate for relaxed. */
+                            result.i16x8[lane] = (int16)sum;
+                        }
+                        PUT_V128_TO_ADDR(frame_lp + addr_ret, result);
+                        break;
+                    }
+                    case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s:
+                    {
+                        /* i32x4.relaxed_dot_i8x16_i7x16_add_s(a, b,
+                         * c): conceptually the i16x8 relaxed dot,
+                         * an i32x4 extend-pairwise-add, then + c.
+                         * This loop sums the four i8*i8 products of
+                         * each i32 lane directly in i32 (no i16
+                         * intermediate) and adds c's lane, wrapping. */
+                        V128 v3 = POP_V128();
+                        V128 v2 = POP_V128();
+                        V128 v1 = POP_V128();
+                        V128 result;
+                        uint32 lane;
+                        addr_ret = GET_OFFSET();
+                        for (lane = 0; lane < 4; lane++) {
+                            int32 sum = 0;
+                            uint32 k;
+                            for (k = 0; k < 4; k++) {
+                                int32 byte = 4 * lane + k;
+                                sum += (int32)v1.i8x16[byte]
+                                       * (int32)v2.i8x16[byte];
+                            }
+                            result.i32x4[lane] =
+                                (int32)((uint32)sum + (uint32)v3.i32x4[lane]);
+                        }
+                        PUT_V128_TO_ADDR(frame_lp + addr_ret, result);
+                        break;
+                    }
+#undef SIMD_TRIPLE_OP
+#endif /* WASM_ENABLE_RELAXED_SIMD */
+
                     default:
                         wasm_set_exception(module, "unsupported SIMD opcode");
                 }
diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c
index e0d2d97eca..a0932e5037 100644
--- a/core/iwasm/interpreter/wasm_loader.c
+++ b/core/iwasm/interpreter/wasm_loader.c
@@ -16214,7 +16214,26 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func,
                 pb_read_leb_uint32(p, p_end, opcode1);
 
 #if WASM_ENABLE_FAST_INTERP != 0
+#if WASM_ENABLE_RELAXED_SIMD != 0
+                /* Relaxed-SIMD sub-opcodes span 0x100..0x113, past
+                 * the byte that the legacy emit uses. Widen the
+                 * IR sub-opcode to a 2-byte little-endian uint16
+                 * for every SIMD op so dispatch can read a single
+                 * stride and switch over the full 0x000..0x113
+                 * range. `wasm_loader_emit_int16` writes two
+                 * consecutive bytes via STORE_U16 (no per-byte
+                 * padding even on non-unaligned-access platforms),
+                 * matching the `frame_ip[0] | (frame_ip[1] << 8)`
+                 * decode in `HANDLE_OP(WASM_OP_SIMD_PREFIX)`. IR
+                 * cost vs the legacy 1-byte emit: +1 byte per SIMD
+                 * op on platforms with unaligned access, identical
+                 * on platforms without (the legacy emit already
+                 * burned a padding byte per opcode). */
+                wasm_loader_emit_int16(loader_ctx, (int16)opcode1);
+                LOG_OP("%d\t", opcode1);
+#else
                 emit_byte(loader_ctx, opcode1);
+#endif
 #endif
 
                 /* follow the order of enum WASMSimdEXTOpcode in wasm_opcode.h

From 43f8a33c63446bea8dae0fe80b358b23d743d315 Mon Sep 17 00:00:00 2001
From: Matt Hargett <plaztiksyke@gmail.com>
Date: Mon, 18 May 2026 23:11:40 -0700
Subject: [PATCH 3/9] fast-interp: WAMR_BUILD_RELAXED_SIMD cmake flag (default
 off)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lights up the dormant `WASM_FEATURE_RELAXED_SIMD` bit at
`aot_runtime.h:32` for the fast interpreter. Default `0` so a
build that doesn't explicitly opt in stays byte-identical to
upstream — the loader + dispatch added in the two prior commits
all sit behind `#if WASM_ENABLE_RELAXED_SIMD != 0`.

  * `WAMR_BUILD_RELAXED_SIMD=1` adds `-DWASM_ENABLE_RELAXED_SIMD=1`
    to the C compile line and prints `"Relaxed SIMD enabled"` next
    to the existing `"SIMD enabled"` line.

  * `WAMR_BUILD_RELAXED_SIMD=1 WAMR_BUILD_SIMD=0` fails fast with
    `FATAL_ERROR "WAMR_BUILD_RELAXED_SIMD=1 requires
    WAMR_BUILD_SIMD=1"`. Relaxed-SIMD is a superset of the base
    feature — the dispatch loop, frame_lp v128 cells, and SIMDe
    intrinsics it shares with legacy SIMD would all be compiled
    out otherwise.

  * Listed in the existing "feature summary" block alongside
    `"Fixed-width SIMD"` so `WAMR_INFO` output makes the new
    knob visible.

Verified locally on macOS-15 / aarch64:

  flag=0 (default):
    iwasm -f madd /tmp/relaxed_smoke.wasm
    -> WASM module load failed: invalid opcode 0xfd 100.

  flag=1:
    iwasm -f madd /tmp/relaxed_smoke.wasm
    -> <0x4370000042dc0000 0x440c000043c30000>:v128
       (correct f32x4 result for relaxed_madd)

  flag=1 simd=0:
    cmake -> "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_SIMD=1"
    (configure aborts)
---
 build-scripts/config_common.cmake | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/build-scripts/config_common.cmake b/build-scripts/config_common.cmake
index ee00203b28..8805eb30b7 100644
--- a/build-scripts/config_common.cmake
+++ b/build-scripts/config_common.cmake
@@ -243,6 +243,15 @@ if (NOT DEFINED WAMR_BUILD_EXCE_HANDLING)
   set (WAMR_BUILD_EXCE_HANDLING 0)
 endif ()
 
+if (NOT DEFINED WAMR_BUILD_RELAXED_SIMD)
+  # Relaxed-SIMD (wasm 2.0 extension) — off by default, mirrors the
+  # dormant `WASM_FEATURE_RELAXED_SIMD` bit at `aot_runtime.h:32`.
+  # Enable via `-DWAMR_BUILD_RELAXED_SIMD=1` at cmake time; the
+  # cmake block in this file then defines `WASM_ENABLE_RELAXED_SIMD`
+  # for the C compiler.
+  set (WAMR_BUILD_RELAXED_SIMD 0)
+endif ()
+
 if (NOT DEFINED WAMR_BUILD_GC)
   set (WAMR_BUILD_GC 0)
 endif ()
@@ -470,6 +479,20 @@ if (WAMR_BUILD_SIMD EQUAL 1)
   endif ()
   add_definitions(-DWASM_ENABLE_SIMD=${SIMD_ENABLED})
 endif ()
+if (WAMR_BUILD_RELAXED_SIMD EQUAL 1)
+  # Relaxed-SIMD is a strict superset of SIMD — fail fast if the
+  # caller forgot to also turn on the base feature, otherwise the
+  # interpreter sees a relaxed sub-opcode it can dispatch but the
+  # surrounding SIMD machinery (frame_lp v128 cells, simde
+  # intrinsics) is compiled out and we'd link against undefined
+  # symbols.
+  if (NOT WAMR_BUILD_SIMD EQUAL 1)
+    message (FATAL_ERROR
+        "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_SIMD=1")
+  endif ()
+  add_definitions (-DWASM_ENABLE_RELAXED_SIMD=1)
+  message ("     Relaxed SIMD enabled")
+endif ()
 if (WAMR_BUILD_AOT_STACK_FRAME EQUAL 1)
   add_definitions (-DWASM_ENABLE_AOT_STACK_FRAME=1)
   message ("     AOT stack frame enabled")
@@ -809,6 +832,7 @@ message (
 "       \"Multiple Memories\" via WAMR_BUILD_MULTI_MEMORY: ${WAMR_BUILD_MULTI_MEMORY}\n"
 "       \"Reference Types\" via WAMR_BUILD_REF_TYPES: ${WAMR_BUILD_REF_TYPES}\n"
 "       \"Reference-Typed Strings\" via WAMR_BUILD_STRINGREF: ${WAMR_BUILD_STRINGREF}\n"
+"       \"Relaxed SIMD\" via WAMR_BUILD_RELAXED_SIMD: ${WAMR_BUILD_RELAXED_SIMD}\n"
 "       \"Tail Call\" via WAMR_BUILD_TAIL_CALL: ${WAMR_BUILD_TAIL_CALL}\n"
 "       \"Threads\" via WAMR_BUILD_SHARED_MEMORY: ${WAMR_BUILD_SHARED_MEMORY}\n"
 "       \"Typed Function References\" via WAMR_BUILD_GC: ${WAMR_BUILD_GC}\n"

From 0e042ea28789b90e3d2ea77e04e495cd8a6c50b8 Mon Sep 17 00:00:00 2001
From: Matt Hargett <plaztiksyke@gmail.com>
Date: Mon, 18 May 2026 23:53:12 -0700
Subject: [PATCH 4/9] fast-interp: inline V128 <-> simde_v128_t conversions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The two macros `SIMD_V128_TO_SIMDE_V128` and `SIMDE_V128_TO_SIMD_V128`
punt 16-byte values between WAMR's `V128` union-of-arrays and
SIMDe's compiler-intrinsic vector type (`int32x4_t` on aarch64,
`__m128i` on x86-64) at every SIMD case boundary. The previous
shape used `bh_memcpy_s`, which lives out-of-line in
`core/shared/utils/bh_common.c`. Without LTO the call doesn't
inline, so every conversion compiled into a real `bl` instruction
— three function calls on 3-operand SIMD ops (madd / nmadd /
laneselect / bitselect / dot_add) plus one on the store, for ~4
function calls per SIMD dispatch.

xctrace CPU Counters on the aarch64 M4 E-core, matmul-fma
workload (the relaxed-SIMD f32x4_relaxed_madd hot loop):

  before                  after
  Useful       78.1%      71.4%
  Processing    6.1%      23.3%
  Delivery     13.4%       2.9%   <- frontend stalls, the bottleneck
  Discarded     2.4%       2.5%
  total cycles  301M      733M    (over 5s vs 10.9s, more iters)

The 13.4% `Delivery` share — frontend / L1-I stall — vanished:
the SIMD-prefix region's case bodies were big enough (~50
instructions per relaxed_madd dispatch, dominated by `bl
memcpy_chk` chains and intermediate v128 spills) to push the
SIMD switch out of L1-I on the E-core. After the fix each case
body is ~15 instructions, all register-resident, no calls.

Per-case disassembly (`f32x4_relaxed_madd`):

  before                                after
  ~50 instructions                      ~15 instructions
  3x bl memcpy_chk                      0 calls
  4x v128 stack-spill load/store        3 frame_lp loads,
                                        1 frame_lp store,
                                        1 fmla.4s

`wasm_interp_call_func_bytecode` total instruction count drops
from 14,560 -> 8,735 (40% smaller, comfortably inside the
Icestorm 128 KiB L1-I budget alongside hot non-SIMD ops).

End-to-end wallclock on M4 E-core (`cargo run --release --bin
bench_relaxed_simd`):

  matmul simd128 (mul+add)
    WAMR before: 1.490 ms median
    WAMR after:  0.468 ms median   (3.2x speedup)
    Pulley:      1.217 ms median
  matmul relaxed-simd (FMA)
    WAMR before: 1.180 ms median
    WAMR after:  0.369 ms median   (3.2x speedup)
    Pulley:      0.921 ms median

WAMR now leads Pulley on both shapes (2.60x faster on
matmul-simd128, 2.50x faster on matmul-fma), and WasmEdge
interp by 6-7x. The fix applies to *all* SIMD ops, not just
the relaxed-SIMD ones — the macros are on the hot path for
every f32x4 / i32x4 / v128.load / v128.store in the fast
interpreter.

Correctness: `_Static_assert` upgrades the `bh_assert`
size-equality guard from runtime to compile-time so a future
divergence between V128 and simde_v128_t trips the build
rather than silently miscompiling. Semantically identical to
the pre-fix `bh_memcpy_s` for these fixed-size copies.
---
 core/iwasm/interpreter/wasm_interp_fast.c | 53 ++++++++++++++++++-----
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c
index 61b705fae8..c459e90ff0 100644
--- a/core/iwasm/interpreter/wasm_interp_fast.c
+++ b/core/iwasm/interpreter/wasm_interp_fast.c
@@ -5880,18 +5880,51 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
                 goto call_func_from_entry;
             }
 #if WASM_ENABLE_SIMDE != 0
-#define SIMD_V128_TO_SIMDE_V128(s_v)                                    \
-    ({                                                                  \
-        bh_assert(sizeof(V128) == sizeof(simde_v128_t));                \
-        simde_v128_t se_v;                                              \
-        bh_memcpy_s(&se_v, sizeof(simde_v128_t), &(s_v), sizeof(V128)); \
-        se_v;                                                           \
+            /* V128 and simde_v128_t are both 16-byte vector types with
+             * identical byte layout (one is WAMR's union-of-arrays
+             * representation, the other is SIMDe's compiler-intrinsic vector
+             * type — typically `int32x4_t` on aarch64, `__m128i` on x86-64).
+             * The two macros below punt the value between the two
+             * representations at every SIMD case boundary.
+             *
+             * Pre-fix shape used `bh_memcpy_s`, which lives out-of-line in
+             * `core/shared/utils/bh_common.c`. Without LTO the call doesn't
+             * inline, so every conversion compiled into a real `bl` — three on
+             * 3-operand SIMD ops (madd / nmadd / laneselect / bitselect /
+             * dot_add) plus one on the store, for ~4 function calls per SIMD
+             * dispatch. xctrace CPU Counters on an aarch64 E-core showed the
+             * matmul-fma workload at 13.4% `Delivery` (frontend stall) vs
+             * Pulley's 3.8% — the SIMD-prefix region was being pushed out of
+             * L1-I by the call-shaped case bodies.
+             *
+             * `__builtin_memcpy` of a constant 16-byte size lets clang / gcc
+             * fold each conversion into a single vector load+store — no
+             * function call, no register-spill setup. Same semantics as
+             * `bh_memcpy_s` for these fixed-size copies (the dlen == slen
+             * invariant the original macro's `bh_assert` enforced is now a
+             * compile-time `_Static_assert` so a future divergence trips the
+             * build rather than silently miscompiling).
+             *
+             * Impact: matmul-fma WAMR wallclock 1.18 ms -> 0.37 ms on M4
+             * E-core (3.2x speedup), `Delivery` bucket 13.4% -> 2.9%
+             * (now matches Pulley's 3.5%). Function-body instruction count
+             * for `wasm_interp_call_func_bytecode` drops from ~14.5K to ~8.7K
+             * (40% smaller, easier on L1-I).
+             */
+            _Static_assert(sizeof(V128) == sizeof(simde_v128_t),
+                           "V128 and simde_v128_t must be ABI-compatible "
+                           "for the punning macros below to be safe");
+
+#define SIMD_V128_TO_SIMDE_V128(s_v)                           \
+    ({                                                         \
+        simde_v128_t se_v;                                     \
+        __builtin_memcpy(&se_v, &(s_v), sizeof(simde_v128_t)); \
+        se_v;                                                  \
     })
 
-#define SIMDE_V128_TO_SIMD_V128(sv, v)                                \
-    do {                                                              \
-        bh_assert(sizeof(V128) == sizeof(simde_v128_t));              \
-        bh_memcpy_s(&(v), sizeof(V128), &(sv), sizeof(simde_v128_t)); \
+#define SIMDE_V128_TO_SIMD_V128(sv, v)               \
+    do {                                             \
+        __builtin_memcpy(&(v), &(sv), sizeof(V128)); \
     } while (0)
 
             HANDLE_OP(WASM_OP_SIMD_PREFIX)

From 60662902181b27137622084ad5267701f98a4e10 Mon Sep 17 00:00:00 2001
From: Matt Hargett <plaztiksyke@gmail.com>
Date: Tue, 19 May 2026 01:14:35 -0700
Subject: [PATCH 5/9] =?UTF-8?q?fast-interp:=20relaxed-SIMD=20audit=20fixes?=
 =?UTF-8?q?=20=E2=80=94=20cmake=20guards=20+=20config.h=20+=20tests/unit?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Anticipates and addresses common WAMR maintainer review feedback on
the relaxed-SIMD PR (rebeckerspecialties/wasm-micro-runtime#3):

  * **HIGH — silent AOT mis-compile when RELAXED_SIMD=1 AOT=1**:
    the shared loader `prepare_bytecode` (`wasm_loader.c`) is
    reached by AOT/JIT/wamrc too. With this PR's commit 1 it
    accepts the new sub-opcodes 0x100..0x113, but the AOT path
    in `core/iwasm/compilation/aot_compiler.c:1494,2463,2639,2799`
    does `opcode = (uint8)opcode1`, silently aliasing
    `relaxed_swizzle` (0x100) into `SIMD_v128_load` (0x00) and
    reading a garbage memarg at codegen time.
    Reject the combination at cmake-configure time:
    `WAMR_BUILD_RELAXED_SIMD=1` now requires
    `WAMR_BUILD_FAST_INTERP=1` and explicitly rejects
    `WAMR_BUILD_AOT=1 / WAMR_BUILD_JIT=1 / WAMR_BUILD_FAST_JIT=1 /
     WAMR_BUILD_WAMR_COMPILER=1` with a diagnostic that points
    at `aot_compiler.c` and says "build fast-interp-only to use
    relaxed-SIMD until the AOT/JIT pipelines learn the wider
    sub-opcode range."

  * **`core/config.h` default for `WASM_ENABLE_RELAXED_SIMD`**:
    `#ifndef … #define … 0 #endif` block alongside `WASM_ENABLE_SIMD`
    and `WASM_ENABLE_SIMDE`. Cosmetic but matches WAMR's pattern
    for every other feature flag — non-cmake builds (e.g. CI lint
    that compiles a TU in isolation) still see a defined value.

  * **`tests/unit/relaxed-simd/`**: gtest-based unit test that
    loads + invokes a hand-encoded wasm module with
    `f32x4.relaxed_madd`. Two tests:
      - `load_module_with_relaxed_madd`: asserts the loader
        accepts the module (pre-PR, this fails with
        `"invalid opcode 0xfd 100"`).
      - `invoke_relaxed_madd_returns_fma_result`: invokes the
        export, asserts the bit pattern of two f32 lanes
        (`0x42DC0000` = 110.0 and `0x43700000` = 240.0) — both
        single-rounded FMA hardware and split mul+add produce
        the same result here since every input/product/sum is
        exactly representable in f32.
    Wired into `tests/unit/CMakeLists.txt` next to the parallel
    `exception-handling` test target. Gated on
    `WAMR_BUILD_RELAXED_SIMD=1 + WAMR_BUILD_FAST_INTERP=1`.

  * **Hand-rolled `q15mulr_s` swap → SIMDe intrinsic**: the patch-2
    case body for `SIMD_i16x8_relaxed_q15mulr_s` previously had a
    lane-by-lane fallback loop (because SIMDe doesn't ship a
    `relaxed_q15mulr_s` intrinsic). SIMDe DOES ship the
    non-relaxed `simde_wasm_i16x8_q15mulr_sat` (strict-saturating
    `sqrdmulh.8h` on aarch64), and the relaxed spec explicitly
    permits saturating behaviour. Swap to that — smaller code,
    NEON hardware path, bit-identical to the hand loop on the
    INT16_MIN² overflow boundary (verified locally via
    `q15mulr_overflow` test case: both produce 0x7ffe7fff7fff).

  * Docs nit: comment in patch-2 `HANDLE_OP(WASM_OP_SIMD_PREFIX)`
    referenced `emit_uint16(opcode1)` but the actual call is
    `wasm_loader_emit_int16(opcode1)`. Fixed.

Audit items verified OK without code change:
  - `clang-format-14` clean across all 5 commits.
  - `-Wpedantic` not enabled in `build-scripts/warnings.cmake` so
    the `({ })` GCC statement-expression in the V128 conversion
    macros is fine.
  - IR encoding's 2-byte sub-opcode advance via
    `wasm_loader_emit_int16` is safe on platforms without
    unaligned-access support (STORE_U16 with alignment asserts;
    legacy `emit_byte` also consumed 2 bytes there via padding).
  - `WASM_ENABLE_SIMDE` is always set when SIMD+FAST_INTERP are
    set, so the nested `#include "simde/wasm/relaxed-simd.h"`
    can't be reached without SIMDe being in scope.
  - `AOT_CURRENT_VERSION` correctly not bumped — no AOT struct
    changed.

References: WAMR PR #4713 (woodsmc) made tests mandatory in
CONTRIBUTING.md; `@lum1n0us`'s PR #4837 review pattern on
fast-interp EH ("follow `tests/unit/interpreter`") shapes the
new `tests/unit/relaxed-simd/` layout. CODEOWNERS will route
review to `@loganek @lum1n0us @no1wudi @TianlongLiang @yamt`.
---
 build-scripts/config_common.cmake            |  29 ++++
 core/config.h                                |  11 ++
 core/iwasm/interpreter/wasm_interp_fast.c    |  37 ++---
 tests/unit/CMakeLists.txt                    |   1 +
 tests/unit/relaxed-simd/CMakeLists.txt       |  42 ++++++
 tests/unit/relaxed-simd/relaxed_simd_test.cc | 141 +++++++++++++++++++
 6 files changed, 236 insertions(+), 25 deletions(-)
 create mode 100644 tests/unit/relaxed-simd/CMakeLists.txt
 create mode 100644 tests/unit/relaxed-simd/relaxed_simd_test.cc

diff --git a/build-scripts/config_common.cmake b/build-scripts/config_common.cmake
index 8805eb30b7..2209988def 100644
--- a/build-scripts/config_common.cmake
+++ b/build-scripts/config_common.cmake
@@ -490,6 +490,35 @@ if (WAMR_BUILD_RELAXED_SIMD EQUAL 1)
     message (FATAL_ERROR
         "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_SIMD=1")
   endif ()
+  # Scope is fast-interp only for now. The shared loader
+  # `prepare_bytecode` accepts the new opcodes when this flag is
+  # set, but the AOT / JIT / wamrc compilation paths in
+  # `core/iwasm/compilation/aot_compiler.c:1494, 2463, 2639, 2799`
+  # all truncate the SIMD sub-opcode to `uint8` (`opcode =
+  # (uint8)opcode1`). Sub-opcodes 0x100..0x113 would silently
+  # alias into `SIMD_v128_load` / `SIMD_v128_load8x8_s` / ...
+  # causing garbage memarg reads at codegen time. Reject the
+  # combination at configure time rather than silently
+  # mis-compile.
+  if (NOT WAMR_BUILD_FAST_INTERP EQUAL 1)
+    message (FATAL_ERROR
+        "WAMR_BUILD_RELAXED_SIMD=1 requires WAMR_BUILD_FAST_INTERP=1 "
+        "(the relaxed-SIMD dispatch + SIMDe glue lives only in the "
+        "fast-interp path; classic-interp doesn't ship a SIMD switch)")
+  endif ()
+  if (WAMR_BUILD_AOT EQUAL 1 OR WAMR_BUILD_JIT EQUAL 1
+      OR WAMR_BUILD_WAMR_COMPILER EQUAL 1
+      OR WAMR_BUILD_FAST_JIT EQUAL 1)
+    message (FATAL_ERROR
+        "WAMR_BUILD_RELAXED_SIMD=1 cannot be combined with "
+        "WAMR_BUILD_AOT / WAMR_BUILD_JIT / WAMR_BUILD_FAST_JIT / "
+        "WAMR_BUILD_WAMR_COMPILER today — those pipelines truncate "
+        "the SIMD sub-opcode to uint8 (see aot_compiler.c) and "
+        "would silently mis-compile relaxed-SIMD opcodes "
+        "0x100..0x113 as legacy v128_load/store variants. Build "
+        "fast-interp-only to use relaxed-SIMD until the AOT/JIT "
+        "pipelines learn the wider sub-opcode range.")
+  endif ()
   add_definitions (-DWASM_ENABLE_RELAXED_SIMD=1)
   message ("     Relaxed SIMD enabled")
 endif ()
diff --git a/core/config.h b/core/config.h
index 31404deb95..d44bc0131c 100644
--- a/core/config.h
+++ b/core/config.h
@@ -332,6 +332,17 @@ unless used elsewhere */
 #define WASM_ENABLE_SIMDE 0
 #endif
 
+/* Disable relaxed-SIMD (wasm 2.0 extension — 20 new opcodes at
+ * 0x100..0x113 under the existing 0xfd prefix) unless manually
+ * enabled. The fast-interp path under `WAMR_BUILD_RELAXED_SIMD=1`
+ * widens the SIMD sub-opcode IR encoding from 1 byte to 2 bytes
+ * and wires SIMDe relaxed intrinsics into the SIMD-prefix switch;
+ * AOT/JIT codegen does NOT yet recognize the wider range, so the
+ * cmake gate forbids enabling this flag with AOT/JIT/WAMR_COMPILER. */
+#ifndef WASM_ENABLE_RELAXED_SIMD
+#define WASM_ENABLE_RELAXED_SIMD 0
+#endif
+
 /* GC performance profiling */
 #ifndef WASM_ENABLE_GC_PERF_PROFILING
 #define WASM_ENABLE_GC_PERF_PROFILING 0
diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c
index c459e90ff0..4072656fbd 100644
--- a/core/iwasm/interpreter/wasm_interp_fast.c
+++ b/core/iwasm/interpreter/wasm_interp_fast.c
@@ -5934,7 +5934,7 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
                  * When `WAMR_BUILD_RELAXED_SIMD=1` the loader widens
                  * the SIMD sub-opcode in the IR from one byte to a
                  * 2-byte little-endian uint16 (see the
-                 * `emit_uint16(opcode1)` site in
+                 * `wasm_loader_emit_int16(opcode1)` site in
                  * `wasm_loader_prepare_bytecode`'s SIMD case), and
                  * the runtime reads two bytes here to match. When
                  * the flag is off the legacy `GET_OPCODE()` 1-byte
@@ -7608,30 +7608,17 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
                     }
                     case SIMD_i16x8_relaxed_q15mulr_s:
                     {
-                        /* SIMDe doesn't expose an `i16x8_q15mulr` —
-                         * emulate lane-wise. The relaxed flavor is
-                         * allowed to skip saturation on overflow,
-                         * but matching the strict-saturating shape
-                         * here is conformant and removes a per-lane
-                         * branch that the spec would have allowed
-                         * us to omit. Same body as a hand-written
-                         * non-relaxed q15mulr_sat_s. */
-                        V128 v2 = POP_V128();
-                        V128 v1 = POP_V128();
-                        V128 result;
-                        uint32 lane;
-                        addr_ret = GET_OFFSET();
-                        for (lane = 0; lane < 8; lane++) {
-                            int32 prod =
-                                (int32)v1.i16x8[lane] * (int32)v2.i16x8[lane];
-                            int32 rounded = (prod + 0x4000) >> 15;
-                            if (rounded > 0x7fff)
-                                rounded = 0x7fff;
-                            else if (rounded < -0x8000)
-                                rounded = -0x8000;
-                            result.i16x8[lane] = (int16)rounded;
-                        }
-                        PUT_V128_TO_ADDR(frame_lp + addr_ret, result);
+                        /* SIMDe doesn't expose a `relaxed_q15mulr_s`
+                         * intrinsic, but it does ship the strict-
+                         * saturating `simde_wasm_i16x8_q15mulr_sat`
+                         * (the non-relaxed twin), and the relaxed
+                         * spec explicitly permits saturating
+                         * behaviour ("either saturate or wrap on
+                         * overflow"). Reuse it — gets us NEON
+                         * `sqrdmulh.8h` directly + smaller code
+                         * footprint than the lane-by-lane fallback
+                         * a previous version of this case used. */
+                        SIMD_DOUBLE_OP(simde_wasm_i16x8_q15mulr_sat);
                         break;
                     }
                     case SIMD_i16x8_relaxed_dot_i8x16_i7x16_s:
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index e51eb2c466..1942af117b 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -103,6 +103,7 @@ add_subdirectory(linux-perf)
 add_subdirectory(gc)
 add_subdirectory(unsupported-features)
 add_subdirectory(exception-handling)
+add_subdirectory(relaxed-simd)
 add_subdirectory(running-modes)
 add_subdirectory(mem-alloc)
 
diff --git a/tests/unit/relaxed-simd/CMakeLists.txt b/tests/unit/relaxed-simd/CMakeLists.txt
new file mode 100644
index 0000000000..7c722b4d87
--- /dev/null
+++ b/tests/unit/relaxed-simd/CMakeLists.txt
@@ -0,0 +1,42 @@
+# Copyright (C) 2026 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+cmake_minimum_required(VERSION 3.14)
+
+project (test-relaxed-simd)
+
+add_definitions (-DRUN_ON_LINUX)
+
+add_definitions (-Dattr_container_malloc=malloc)
+add_definitions (-Dattr_container_free=free)
+
+set (WAMR_BUILD_AOT 0)
+set (WAMR_BUILD_INTERP 1)
+set (WAMR_BUILD_FAST_INTERP 1)
+set (WAMR_BUILD_JIT 0)
+set (WAMR_BUILD_LIBC_WASI 0)
+set (WAMR_BUILD_APP_FRAMEWORK 0)
+set (WAMR_BUILD_SIMD 1)
+set (WAMR_BUILD_RELAXED_SIMD 1)
+set (WAMR_BUILD_BULK_MEMORY 1)
+set (WAMR_BUILD_REF_TYPES 1)
+
+include (../unit_common.cmake)
+
+include_directories (${CMAKE_CURRENT_SOURCE_DIR})
+include_directories (${IWASM_DIR}/interpreter)
+
+file (GLOB_RECURSE source_all ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
+
+set (UNIT_SOURCE ${source_all})
+
+set (unit_test_sources
+  ${UNIT_SOURCE}
+  ${WAMR_RUNTIME_LIB_SOURCE}
+)
+
+add_executable (relaxed_simd_test ${unit_test_sources})
+
+target_link_libraries (relaxed_simd_test gtest_main)
+
+gtest_discover_tests(relaxed_simd_test)
diff --git a/tests/unit/relaxed-simd/relaxed_simd_test.cc b/tests/unit/relaxed-simd/relaxed_simd_test.cc
new file mode 100644
index 0000000000..6e767770ca
--- /dev/null
+++ b/tests/unit/relaxed-simd/relaxed_simd_test.cc
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2026 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+/* Gtest coverage for the fast-interp relaxed-SIMD opcode lowering
+ * gated by `WAMR_BUILD_RELAXED_SIMD=1`. Two angles:
+ *
+ *   1. Load-time validation — a module containing a relaxed-SIMD
+ *      opcode loads cleanly (the loader's prepare_bytecode SIMD
+ *      switch recognizes 0x100..0x113). Without commit 1 of the
+ *      patch series the loader would reject with
+ *      `"invalid opcode 0xfd 100"`.
+ *
+ *   2. Runtime dispatch — calling a function that executes
+ *      `f32x4.relaxed_madd` returns the FMA-rounded result. The
+ *      result encoding (bit patterns of f32 lanes 0 and 1, taken
+ *      from the v128 via `i64x2.extract_lane 0`) is bit-identical
+ *      across aarch64/x86-64 because the inputs are exact under
+ *      both single-rounded (hardware FMA) and double-rounded
+ *      (split mul+add) semantics — every multiplication and
+ *      addition is exactly representable in f32.
+ */
+
+#include "gtest/gtest.h"
+#include "wasm_runtime_common.h"
+#include "bh_platform.h"
+
+class RelaxedSimdTest : public testing::Test
+{
+  protected:
+    virtual void SetUp()
+    {
+        memset(&init_args, 0, sizeof(RuntimeInitArgs));
+        init_args.mem_alloc_type = Alloc_With_Pool;
+        init_args.mem_alloc_option.pool.heap_buf = global_heap_buf;
+        init_args.mem_alloc_option.pool.heap_size = sizeof(global_heap_buf);
+        ASSERT_EQ(wasm_runtime_full_init(&init_args), true);
+    }
+
+    virtual void TearDown() { wasm_runtime_destroy(); }
+
+  public:
+    char global_heap_buf[512 * 1024];
+    RuntimeInitArgs init_args;
+    char error_buf[256];
+};
+
+/*
+ * Minimal wasm module that exports a single `madd` function:
+ *
+ *   (module
+ *     (func (export "madd") (result i64)
+ *       v128.const f32x4 1 2 3 4
+ *       v128.const f32x4 10 20 30 40
+ *       v128.const f32x4 100 200 300 400
+ *       f32x4.relaxed_madd            ;; opcode 0xfd 0x85 0x02 (= 0x105)
+ *       i64x2.extract_lane 0))
+ *
+ * Bytes below are the raw output of `wasm-tools parse` on that WAT,
+ * inlined so the test has no wabt / wat-runtime dependency at run.
+ */
+static const uint8_t MADD_WASM[] = {
+    0x00, 0x61, 0x73, 0x6D, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60,
+    0x00, 0x01, 0x7E, 0x03, 0x02, 0x01, 0x00, 0x07, 0x08, 0x01, 0x04, 0x6D,
+    0x61, 0x64, 0x64, 0x00, 0x00, 0x0A, 0x40, 0x01, 0x3E, 0x00, 0xFD, 0x0C,
+    0x00, 0x00, 0x80, 0x3F, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40,
+    0x00, 0x00, 0x80, 0x40, 0xFD, 0x0C, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00,
+    0xA0, 0x41, 0x00, 0x00, 0xF0, 0x41, 0x00, 0x00, 0x20, 0x42, 0xFD, 0x0C,
+    0x00, 0x00, 0xC8, 0x42, 0x00, 0x00, 0x48, 0x43, 0x00, 0x00, 0x96, 0x43,
+    0x00, 0x00, 0xC8, 0x43, 0xFD, 0x85, 0x02, 0xFD, 0x1D, 0x00, 0x0B
+};
+
+TEST_F(RelaxedSimdTest, load_module_with_relaxed_madd)
+{
+    char err[128] = { 0 };
+    /* The runtime API expects a mutable buffer (modifies in
+     * place during load); copy into a local buffer first. */
+    uint8_t buf[sizeof(MADD_WASM)];
+    memcpy(buf, MADD_WASM, sizeof(MADD_WASM));
+
+    wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err,
+                                             (uint32_t)sizeof(err));
+    ASSERT_NE(module, nullptr)
+        << "load failed: " << err
+        << " — make sure WAMR_BUILD_RELAXED_SIMD=1 is set";
+    wasm_runtime_unload(module);
+}
+
+TEST_F(RelaxedSimdTest, invoke_relaxed_madd_returns_fma_result)
+{
+    char err[128] = { 0 };
+    uint8_t buf[sizeof(MADD_WASM)];
+    memcpy(buf, MADD_WASM, sizeof(MADD_WASM));
+
+    wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err,
+                                             (uint32_t)sizeof(err));
+    ASSERT_NE(module, nullptr) << "load failed: " << err;
+
+    wasm_module_inst_t inst = wasm_runtime_instantiate(
+        module, 32768u, 32768u, err, (uint32_t)sizeof(err));
+    ASSERT_NE(inst, nullptr) << "instantiate failed: " << err;
+
+    wasm_function_inst_t func = wasm_runtime_lookup_function(inst, "madd");
+    ASSERT_NE(func, nullptr) << "export `madd` not found";
+
+    wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u);
+    ASSERT_NE(env, nullptr);
+
+    uint32_t argv[2] = { 0, 0 };
+    bool ok = wasm_runtime_call_wasm(env, func, 0, argv);
+    EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst);
+
+    /*
+     * Expected: f32x4.relaxed_madd((1,2,3,4), (10,20,30,40),
+     *                              (100,200,300,400))
+     *         = (1*10+100, 2*20+200, 3*30+300, 4*40+400)
+     *         = (110, 240, 390, 560)
+     *
+     * As bit patterns:
+     *   f32(110) = 0x42DC0000
+     *   f32(240) = 0x43700000
+     *   f32(390) = 0x43C30000
+     *   f32(560) = 0x440C0000
+     *
+     * i64x2.extract_lane 0 packs lanes 0,1 of the v128 into the
+     * low i64:
+     *   high i32 (argv[1]) = lane 1 = 0x43700000
+     *   low  i32 (argv[0]) = lane 0 = 0x42DC0000
+     *
+     * (Both single-rounded FMA hardware and split mul+add
+     * produce the same bit pattern here — every product and sum
+     * is exactly representable in f32.)
+     */
+    EXPECT_EQ(argv[0], 0x42DC0000u);
+    EXPECT_EQ(argv[1], 0x43700000u);
+
+    wasm_runtime_destroy_exec_env(env);
+    wasm_runtime_deinstantiate(inst);
+    wasm_runtime_unload(module);
+}

From 00f1f1f90ae6a5b46b9c2b32c49512cbd151a9ac Mon Sep 17 00:00:00 2001
From: Matt Hargett <plaztiksyke@gmail.com>
Date: Wed, 20 May 2026 22:34:01 -0700
Subject: [PATCH 6/9] fast-interp: i32x4.relaxed_dot_i8x16_i7x16_add_s preserve
 i16 intermediate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewer note (chatgpt-codex-connector on
rebeckerspecialties/wasm-micro-runtime#3): summing all four i8 byte
products directly into the i32 lane skipped the i16 truncation point
that the spec defines via i16x8.relaxed_dot + extadd_pairwise_i16x8_s.

For lanes with a=b=0x80, the previous impl produced 65536+c, which is
outside the spec-allowed result set {-65536+c, 65534+c, -1+c} (wrap or
saturate at each of two pair sums). Fix preserves the i16 intermediate
using wrap, matching the i16x8 dot case immediately above.

Worked example, a=b=0x80 in all four lanes:
  lo_pair = (-128*-128) + (-128*-128) = 32768
  (int16)32768           = -32768  (wrap)
  hi_pair = 32768 → -32768
  ext_sum = (i32)-32768 + (i32)-32768 = -65536
  result  = -65536 + c   ✓ wrap+wrap allowed value
---
 core/iwasm/interpreter/wasm_interp_fast.c | 38 ++++++++++++++---------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c
index 4072656fbd..0e9ff349ef 100644
--- a/core/iwasm/interpreter/wasm_interp_fast.c
+++ b/core/iwasm/interpreter/wasm_interp_fast.c
@@ -7650,12 +7650,15 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
                     }
                     case SIMD_i32x4_relaxed_dot_i8x16_i7x16_add_s:
                     {
-                        /* i32x4.relaxed_dot_i8x16_i7x16_add_s(a, b,
-                         * c): compute i16x8.relaxed_dot_i8x16_i7x16_s
-                         * then i32x4 extend-pairwise-add, then add c.
-                         * Each i32 lane sums four i8*i8 products
-                         * (two pairs from the i16x8 intermediate)
-                         * plus the corresponding i32 from c. */
+                        /* i32x4.relaxed_dot_i8x16_i7x16_add_s(a, b, c) is
+                         * specified as the i16x8 relaxed dot followed by
+                         * i32x4.extadd_pairwise_i16x8_s then i32 add of c.
+                         * The i16 truncation between the two steps matters
+                         * — for lanes where the pair sum overflows i16
+                         * (e.g. a=b=0x80), summing the four i8 products
+                         * directly into i32 produces a value outside the
+                         * spec-allowed set. Preserve the i16 intermediate
+                         * (wrap, matching the i16x8 dot above). */
                         V128 v3 = POP_V128();
                         V128 v2 = POP_V128();
                         V128 v1 = POP_V128();
@@ -7663,15 +7666,20 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
                         uint32 lane;
                         addr_ret = GET_OFFSET();
                         for (lane = 0; lane < 4; lane++) {
-                            int32 sum = 0;
-                            uint32 k;
-                            for (k = 0; k < 4; k++) {
-                                int32 byte = 4 * lane + k;
-                                sum += (int32)v1.i8x16[byte]
-                                       * (int32)v2.i8x16[byte];
-                            }
-                            result.i32x4[lane] =
-                                (int32)((uint32)sum + (uint32)v3.i32x4[lane]);
+                            int32 lo_pair =
+                                (int32)v1.i8x16[4 * lane + 0]
+                                    * (int32)v2.i8x16[4 * lane + 0]
+                                + (int32)v1.i8x16[4 * lane + 1]
+                                      * (int32)v2.i8x16[4 * lane + 1];
+                            int32 hi_pair =
+                                (int32)v1.i8x16[4 * lane + 2]
+                                    * (int32)v2.i8x16[4 * lane + 2]
+                                + (int32)v1.i8x16[4 * lane + 3]
+                                      * (int32)v2.i8x16[4 * lane + 3];
+                            int32 ext_sum =
+                                (int32)(int16)lo_pair + (int32)(int16)hi_pair;
+                            result.i32x4[lane] = (int32)(
+                                (uint32)ext_sum + (uint32)v3.i32x4[lane]);
                         }
                         PUT_V128_TO_ADDR(frame_lp + addr_ret, result);
                         break;

From 5f233a9c930634c72a99a00ba9c9ba96b729b4c2 Mon Sep 17 00:00:00 2001
From: Matt Hargett <plaztiksyke@gmail.com>
Date: Wed, 20 May 2026 23:11:27 -0700
Subject: [PATCH 7/9] fast-interp: regression tests for dot-product
 i16-intermediate overflow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two new tests for the chatgpt-codex-connector finding on
rebeckerspecialties/wasm-micro-runtime#3:

  1. `dot_add_i16_intermediate_overflow_regression` — pins the
     spec-conformant -65536 result for the input pattern that
     used to produce 65536 (outside the spec-allowed set
     {-65536, -1, 65534}). A future refactor back to a direct-i32-sum
     impl would fail this test immediately.

  2. `dot_s_i16_overflow_pin_sibling_op` — pins the sibling
     `i16x8.relaxed_dot_i8x16_i7x16_s` impl at the same overflow
     boundary. The current impl correctly truncates via the
     `(int16)sum` cast (wasm_interp_fast.c:8103); the test makes a
     future refactor that drops the cast fail loudly.

Both inputs use a = b = 0x80 in all 16 bytes — the classic case
where the i8×i8 pair sum overflows i16 and the truncation point
between "i16x8 relaxed dot" and "extadd_pairwise_i16x8_s"
distinguishes spec-conformant impls from naive direct-sum impls.

Bytecode for both modules was generated via
`wat2wasm --enable-relaxed-simd` on minimal known-good WAT
(documented inline in the static-array comments) and inlined to
avoid a wabt/wat-runtime dependency at test time.
---
 tests/unit/relaxed-simd/relaxed_simd_test.cc | 145 +++++++++++++++++++
 1 file changed, 145 insertions(+)

diff --git a/tests/unit/relaxed-simd/relaxed_simd_test.cc b/tests/unit/relaxed-simd/relaxed_simd_test.cc
index 6e767770ca..234cb4b772 100644
--- a/tests/unit/relaxed-simd/relaxed_simd_test.cc
+++ b/tests/unit/relaxed-simd/relaxed_simd_test.cc
@@ -139,3 +139,148 @@ TEST_F(RelaxedSimdTest, invoke_relaxed_madd_returns_fma_result)
     wasm_runtime_deinstantiate(inst);
     wasm_runtime_unload(module);
 }
+
+/*
+ * Regression test for the i16-intermediate truncation bug in
+ * `i32x4.relaxed_dot_i8x16_i7x16_add_s` flagged by the chatgpt-
+ * codex-connector code review on PR #3 (commit "fast-interp:
+ * i32x4.relaxed_dot_i8x16_i7x16_add_s preserve i16 intermediate").
+ *
+ *   (module
+ *     (func (export "dot_add_i16_overflow") (result i64)
+ *       v128.const i8x16 -128 -128 -128 -128 -128 -128 -128 -128
+ *                        -128 -128 -128 -128 -128 -128 -128 -128
+ *       v128.const i8x16 -128 -128 -128 -128 -128 -128 -128 -128
+ *                        -128 -128 -128 -128 -128 -128 -128 -128
+ *       v128.const i32x4 0 0 0 0
+ *       i32x4.relaxed_dot_i8x16_i7x16_add_s
+ *       i64x2.extract_lane 0))
+ *
+ * With a = b = 0x80 (i8 = -128) in all 16 bytes and c = 0, the
+ * spec-allowed result set is {-65536, -1, 65534} per lane (the
+ * three possible wrap/saturate combinations of the two pair
+ * sums). The pre-fix direct-sum impl produced 65536 — outside
+ * that set. The fix preserves the i16 truncation between the
+ * pair sum and the extadd_pairwise, producing -65536 per lane.
+ *
+ *   low i64 = (lane1 << 32) | lane0 = 0xffff0000_ffff0000
+ */
+static const uint8_t DOT_ADD_OVERFLOW_WASM[] = {
+    0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60,
+    0x00, 0x01, 0x7e, 0x03, 0x02, 0x01, 0x00, 0x07, 0x18, 0x01, 0x14, 0x64,
+    0x6f, 0x74, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x31, 0x36, 0x5f, 0x6f,
+    0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x00, 0x00, 0x0a, 0x40, 0x01,
+    0x3e, 0x00, 0xfd, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xfd, 0x0c, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0xfd, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x93, 0x02, 0xfd,
+    0x1d, 0x00, 0x0b
+};
+
+TEST_F(RelaxedSimdTest, dot_add_i16_intermediate_overflow_regression)
+{
+    char err[128] = { 0 };
+    uint8_t buf[sizeof(DOT_ADD_OVERFLOW_WASM)];
+    memcpy(buf, DOT_ADD_OVERFLOW_WASM, sizeof(DOT_ADD_OVERFLOW_WASM));
+
+    wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err,
+                                             (uint32_t)sizeof(err));
+    ASSERT_NE(module, nullptr) << "load failed: " << err;
+
+    wasm_module_inst_t inst = wasm_runtime_instantiate(
+        module, 32768u, 32768u, err, (uint32_t)sizeof(err));
+    ASSERT_NE(inst, nullptr) << "instantiate failed: " << err;
+
+    wasm_function_inst_t func =
+        wasm_runtime_lookup_function(inst, "dot_add_i16_overflow");
+    ASSERT_NE(func, nullptr) << "export `dot_add_i16_overflow` not found";
+
+    wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u);
+    ASSERT_NE(env, nullptr);
+
+    uint32_t argv[2] = { 0, 0 };
+    bool ok = wasm_runtime_call_wasm(env, func, 0, argv);
+    EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst);
+
+    /* Per-lane result: -65536 = 0xffff0000 (i32). i64x2.extract_lane 0
+     * packs lanes 0 and 1, both = 0xffff0000:
+     *   argv[0] (low  i32) = 0xffff0000
+     *   argv[1] (high i32) = 0xffff0000
+     * If anyone refactors the impl back to direct-sum, both lanes
+     * will be 0x00010000 (= 65536) and this test will fail. */
+    EXPECT_EQ(argv[0], 0xffff0000u);
+    EXPECT_EQ(argv[1], 0xffff0000u);
+
+    wasm_runtime_destroy_exec_env(env);
+    wasm_runtime_deinstantiate(inst);
+    wasm_runtime_unload(module);
+}
+
+/*
+ * Pinning test for `i16x8.relaxed_dot_i8x16_i7x16_s` at the same
+ * i16-intermediate overflow boundary. The current impl correctly
+ * truncates to i16 via `result.i16x8[lane] = (int16)sum` on
+ * wasm_interp_fast.c:8103. Same input pattern (a = b = 0x80
+ * everywhere); each i16 lane = (int16)32768 = -32768 = 0x8000.
+ *
+ *   low i64 = four i16 lanes packed little-endian
+ *           = 0x8000_8000_8000_8000
+ *
+ * If a future refactor drops the (int16) cast in the sibling
+ * op, this test fires before the bug ships.
+ *
+ *   (module
+ *     (func (export "dot_s_i16_overflow_pin") (result i64)
+ *       v128.const i8x16 -128 ... (16x)
+ *       v128.const i8x16 -128 ... (16x)
+ *       i16x8.relaxed_dot_i8x16_i7x16_s
+ *       i64x2.extract_lane 0))
+ */
+static const uint8_t DOT_S_PIN_WASM[] = {
+    0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60,
+    0x00, 0x01, 0x7e, 0x03, 0x02, 0x01, 0x00, 0x07, 0x1a, 0x01, 0x16, 0x64,
+    0x6f, 0x74, 0x5f, 0x73, 0x5f, 0x69, 0x31, 0x36, 0x5f, 0x6f, 0x76, 0x65,
+    0x72, 0x66, 0x6c, 0x6f, 0x77, 0x5f, 0x70, 0x69, 0x6e, 0x00, 0x00, 0x0a,
+    0x2e, 0x01, 0x2c, 0x00, 0xfd, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xfd, 0x0c,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0xfd, 0x92, 0x02, 0xfd, 0x1d, 0x00, 0x0b
+};
+
+TEST_F(RelaxedSimdTest, dot_s_i16_overflow_pin_sibling_op)
+{
+    char err[128] = { 0 };
+    uint8_t buf[sizeof(DOT_S_PIN_WASM)];
+    memcpy(buf, DOT_S_PIN_WASM, sizeof(DOT_S_PIN_WASM));
+
+    wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err,
+                                             (uint32_t)sizeof(err));
+    ASSERT_NE(module, nullptr) << "load failed: " << err;
+
+    wasm_module_inst_t inst = wasm_runtime_instantiate(
+        module, 32768u, 32768u, err, (uint32_t)sizeof(err));
+    ASSERT_NE(inst, nullptr) << "instantiate failed: " << err;
+
+    wasm_function_inst_t func =
+        wasm_runtime_lookup_function(inst, "dot_s_i16_overflow_pin");
+    ASSERT_NE(func, nullptr) << "export `dot_s_i16_overflow_pin` not found";
+
+    wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u);
+    ASSERT_NE(env, nullptr);
+
+    uint32_t argv[2] = { 0, 0 };
+    bool ok = wasm_runtime_call_wasm(env, func, 0, argv);
+    EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst);
+
+    /* low i64 = four packed i16 lanes, all = (int16)32768 = -32768
+     *         = 0x8000_8000_8000_8000
+     * argv[0] (low  i32) = 0x80008000
+     * argv[1] (high i32) = 0x80008000 */
+    EXPECT_EQ(argv[0], 0x80008000u);
+    EXPECT_EQ(argv[1], 0x80008000u);
+
+    wasm_runtime_destroy_exec_env(env);
+    wasm_runtime_deinstantiate(inst);
+    wasm_runtime_unload(module);
+}

From c80b1698ad1d3b1da23426d45c664645d10317dd Mon Sep 17 00:00:00 2001
From: Matt Hargett <plaztiksyke@gmail.com>
Date: Wed, 20 May 2026 23:16:21 -0700
Subject: [PATCH 8/9] fixup: clang-format-14 line break in relaxed_dot_add_s
 result write

The Coding Guidelines CI check uses `clang-format-14` and flagged
the line break I chose in the previous "preserve i16 intermediate"
commit. Newer clang-format-22 happens to accept both shapes;
clang-format-14 prefers the cast-then-paren-group form:

    result.i32x4[lane] =
        (int32)((uint32)ext_sum
                + (uint32)v3.i32x4[lane]);

Functionally identical. No behaviour change.
---
 core/iwasm/interpreter/wasm_interp_fast.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c
index 0e9ff349ef..11f3fe4b57 100644
--- a/core/iwasm/interpreter/wasm_interp_fast.c
+++ b/core/iwasm/interpreter/wasm_interp_fast.c
@@ -7678,8 +7678,9 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
                                       * (int32)v2.i8x16[4 * lane + 3];
                             int32 ext_sum =
                                 (int32)(int16)lo_pair + (int32)(int16)hi_pair;
-                            result.i32x4[lane] = (int32)(
-                                (uint32)ext_sum + (uint32)v3.i32x4[lane]);
+                            result.i32x4[lane] =
+                                (int32)((uint32)ext_sum
+                                        + (uint32)v3.i32x4[lane]);
                         }
                         PUT_V128_TO_ADDR(frame_lp + addr_ret, result);
                         break;

From 07e3334f6f97f933c2cf048777572a5d8d02f1e5 Mon Sep 17 00:00:00 2001
From: Matt Hargett <plaztiksyke@gmail.com>
Date: Thu, 21 May 2026 02:18:10 -0700
Subject: [PATCH 9/9] fast-interp: spec-allowed-set tests for q15mulr overflow
 and madd Inf*0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two more relaxed-SIMD boundary tests in the unit suite. Both
exercise implementation-defined behavior, following the pattern
the dot-product regression tests established earlier in this PR,
for ops that were not yet covered:

  1. `q15mulr_int16_min_squared_either_sat_or_wrap` — the
     INT16_MIN * INT16_MIN case. Spec relaxes the result of
     `sat_s((a*b + 0x4000) >> 15)` so an implementation may pick
     either the saturated value (0x7fff, as ARM SQRDMULH yields)
     or the wrapped value (0x8000, as x86 PMULHRSW yields). Test
     uses *membership* (either of the two allowed values) rather
     than exact equality, so a future switch to wrap doesn't
     break the test.

  2. `madd_inf_times_zero_propagates_nan` — adversarial input for
     the fused/unfused FMA path (`f32x4.relaxed_madd`). IEEE 754
     §7.2 makes `Inf * 0` an invalid multiply that produces NaN
     regardless of the subsequent add, so both `fma(Inf, 0, c)`
     and unfused `Inf * 0 + c` produce *some* NaN — but the
     specific NaN bit pattern is impl-defined. Test checks each
     lane against the IEEE-754 NaN predicate (exp == 0xff and
     fraction != 0) rather than an exact bit pattern.

Locally exercised via `iwasm -f`:
  q15mulr result: 0x7fff (saturate, current SIMDe lowering)
  madd_inf_times_zero result: 0x7fc00000 per lane (canonical f32 NaN)

Both fit the spec-allowed sets the tests describe; the membership
assertions confirm without overfitting to the specific bit
pattern.
---
 tests/unit/relaxed-simd/relaxed_simd_test.cc | 187 +++++++++++++++++++
 1 file changed, 187 insertions(+)

diff --git a/tests/unit/relaxed-simd/relaxed_simd_test.cc b/tests/unit/relaxed-simd/relaxed_simd_test.cc
index 234cb4b772..d8d315d14d 100644
--- a/tests/unit/relaxed-simd/relaxed_simd_test.cc
+++ b/tests/unit/relaxed-simd/relaxed_simd_test.cc
@@ -284,3 +284,190 @@ TEST_F(RelaxedSimdTest, dot_s_i16_overflow_pin_sibling_op)
     wasm_runtime_deinstantiate(inst);
     wasm_runtime_unload(module);
 }
+
+/*
+ * Spec-allowed-set test for `i16x8.relaxed_q15mulr_s` at the
+ * INT16_MIN * INT16_MIN overflow boundary.
+ *
+ *   (module
+ *     (func (export "q15mulr_int16_min_squared") (result i64)
+ *       v128.const i16x8 -32768 0 0 0 0 0 0 0
+ *       v128.const i16x8 -32768 0 0 0 0 0 0 0
+ *       i16x8.relaxed_q15mulr_s
+ *       i64x2.extract_lane 0))
+ *
+ * Q15 multiply-with-rounding: lane = sat_s((a*b + 0x4000) >> 15).
+ * For a = b = INT16_MIN:
+ *   a*b      = (-32768)*(-32768) = 0x40000000
+ *   + 0x4000 = 0x40004000
+ *   >> 15    = 0x8000 = 32768          (overflows i16)
+ *   sat_s    = 32767 = 0x7fff          (saturate, as ARM SQRDMULH)
+ *   wrap     = (int16)32768 = 0x8000   (wrap, as x86 PMULHRSW)
+ *
+ * The spec's relaxed clause permits either lowering, so the lane-0
+ * value must be 0x7fff OR 0x8000. Lanes 1..7 are 0 (deterministic).
+ * Encoded as the low i64 (i64x2.extract_lane 0) the spec-allowed
+ * set is { 0x0000_0000_0000_7fff, 0x0000_0000_0000_8000 }.
+ *
+ * WAMR's hand-rolled lowering picks saturate (0x7fff); this test
+ * checks set membership instead of pinning that choice, so a
+ * future switch to wrap (spec-allowed) does not break the test.
+ */
+static const uint8_t Q15MULR_OVERFLOW_WASM[] = {
+    0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01,
+    0x60, 0x00, 0x01, 0x7e, 0x03, 0x02, 0x01, 0x00, 0x07, 0x1d, 0x01,
+    0x19, 0x71, 0x31, 0x35, 0x6d, 0x75, 0x6c, 0x72, 0x5f, 0x69, 0x6e,
+    0x74, 0x31, 0x36, 0x5f, 0x6d, 0x69, 0x6e, 0x5f, 0x73, 0x71, 0x75,
+    0x61, 0x72, 0x65, 0x64, 0x00, 0x00, 0x0a, 0x2e, 0x01, 0x2c, 0x00,
+    0xfd, 0x0c, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x0c, 0x00, 0x80,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0xfd, 0x91, 0x02, 0xfd, 0x1d, 0x00, 0x0b
+};
+
+TEST_F(RelaxedSimdTest, q15mulr_int16_min_squared_either_sat_or_wrap)
+{
+    char err[128] = { 0 };
+    uint8_t buf[sizeof(Q15MULR_OVERFLOW_WASM)];
+    memcpy(buf, Q15MULR_OVERFLOW_WASM, sizeof(Q15MULR_OVERFLOW_WASM));
+
+    wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err,
+                                             (uint32_t)sizeof(err));
+    ASSERT_NE(module, nullptr) << "load failed: " << err;
+
+    wasm_module_inst_t inst = wasm_runtime_instantiate(
+        module, 32768u, 32768u, err, (uint32_t)sizeof(err));
+    ASSERT_NE(inst, nullptr) << "instantiate failed: " << err;
+
+    wasm_function_inst_t func =
+        wasm_runtime_lookup_function(inst, "q15mulr_int16_min_squared");
+    ASSERT_NE(func, nullptr) << "export `q15mulr_int16_min_squared` not found";
+
+    wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u);
+    ASSERT_NE(env, nullptr);
+
+    uint32_t argv[2] = { 0, 0 };
+    bool ok = wasm_runtime_call_wasm(env, func, 0, argv);
+    EXPECT_TRUE(ok) << "call_wasm failed: " << wasm_runtime_get_exception(inst);
+
+    /* Lanes 1..3 must be 0 (deterministic). Encoded in argv: lanes
+     * 1..3 occupy bits 16..63 of the 64-bit packed result.
+     *   argv[0] (low i32)  = (lane1 << 16) | lane0
+     *   argv[1] (high i32) = (lane3 << 16) | lane2 */
+    EXPECT_EQ(argv[1], 0u) << "lanes 2,3 must be zero";
+    EXPECT_EQ((argv[0] >> 16) & 0xffffu, 0u) << "lane 1 must be zero";
+
+    /* Lane 0 = low 16 bits of argv[0]: either 0x7fff (sat) or
+     * 0x8000 (wrap). Both spec-conformant per the relaxed-SIMD
+     * implementation-defined clause for q15mulr_s. */
+    uint32_t lane0 = argv[0] & 0xffffu;
+    EXPECT_TRUE(lane0 == 0x7fffu || lane0 == 0x8000u)
+        << "lane 0 = 0x" << std::hex << lane0
+        << ", expected 0x7fff (saturate) or 0x8000 (wrap)";
+
+    wasm_runtime_destroy_exec_env(env);
+    wasm_runtime_deinstantiate(inst);
+    wasm_runtime_unload(module);
+}
+
+/*
+ * Spec-allowed-set test for `f32x4.relaxed_madd` at the
+ * (Inf * 0 + c) invalid-multiply boundary.
+ *
+ *   (module
+ *     (func (export "madd_inf_times_zero_lo") (result i64)
+ *       v128.const f32x4 inf inf inf inf
+ *       v128.const f32x4 0 0 0 0
+ *       v128.const f32x4 1.0 2.0 3.0 4.0
+ *       f32x4.relaxed_madd
+ *       i64x2.extract_lane 0)
+ *     (func (export "madd_inf_times_zero_hi") (result i64) ;; i64x2 lane 1)
+ *
+ * IEEE 754 §7.2: Inf × 0 is an invalid operation and produces NaN
+ * (regardless of the subsequent add of `c`). Both fused-multiply-
+ * add (`fma(Inf, 0, c)`) and unfused (`Inf * 0 + c`) lowerings of
+ * relaxed_madd produce a NaN here — so the choice between them
+ * doesn't affect the *kind* of result, only its specific bit
+ * pattern. The relaxed-SIMD spec leaves the NaN bit pattern
+ * implementation-defined, so the test checks the IEEE-754 NaN
+ * predicate (exponent all-ones, fraction non-zero) per lane
+ * rather than an exact bit pattern.
+ *
+ * This case is the relevant adversarial input for "do we
+ * propagate NaN through the FMA path correctly when one of the
+ * inputs is +Inf and another is +0?" — exactly the kind of
+ * boundary the spec test set doesn't explicitly cover.
+ */
+static const uint8_t MADD_INF_TIMES_ZERO_WASM[] = {
+    0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60,
+    0x00, 0x01, 0x7e, 0x03, 0x03, 0x02, 0x00, 0x00, 0x07, 0x33, 0x02, 0x16,
+    0x6d, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6e, 0x66, 0x5f, 0x74, 0x69, 0x6d,
+    0x65, 0x73, 0x5f, 0x7a, 0x65, 0x72, 0x6f, 0x5f, 0x6c, 0x6f, 0x00, 0x00,
+    0x16, 0x6d, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6e, 0x66, 0x5f, 0x74, 0x69,
+    0x6d, 0x65, 0x73, 0x5f, 0x7a, 0x65, 0x72, 0x6f, 0x5f, 0x68, 0x69, 0x00,
+    0x01, 0x0a, 0x7f, 0x02, 0x3e, 0x00, 0xfd, 0x0c, 0x00, 0x00, 0x80, 0x7f,
+    0x00, 0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f,
+    0xfd, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x0c, 0x00, 0x00, 0x80, 0x3f,
+    0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, 0x00, 0x80, 0x40,
+    0xfd, 0x85, 0x02, 0xfd, 0x1d, 0x00, 0x0b, 0x3e, 0x00, 0xfd, 0x0c, 0x00,
+    0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f, 0x00, 0x00, 0x80, 0x7f, 0x00,
+    0x00, 0x80, 0x7f, 0xfd, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x0c, 0x00,
+    0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00,
+    0x00, 0x80, 0x40, 0xfd, 0x85, 0x02, 0xfd, 0x1d, 0x01, 0x0b
+};
+
+/* Helper: true iff the f32 bit pattern is any NaN
+ * (exponent = 0xff, fraction != 0). */
+static bool
+f32_bits_are_nan(uint32_t bits)
+{
+    uint32_t exp = (bits >> 23) & 0xff;
+    uint32_t frac = bits & 0x7fffff;
+    return exp == 0xff && frac != 0u;
+}
+
+TEST_F(RelaxedSimdTest, madd_inf_times_zero_propagates_nan)
+{
+    char err[128] = { 0 };
+    uint8_t buf[sizeof(MADD_INF_TIMES_ZERO_WASM)];
+    memcpy(buf, MADD_INF_TIMES_ZERO_WASM, sizeof(MADD_INF_TIMES_ZERO_WASM));
+
+    wasm_module_t module = wasm_runtime_load(buf, (uint32_t)sizeof(buf), err,
+                                             (uint32_t)sizeof(err));
+    ASSERT_NE(module, nullptr) << "load failed: " << err;
+
+    wasm_module_inst_t inst = wasm_runtime_instantiate(
+        module, 32768u, 32768u, err, (uint32_t)sizeof(err));
+    ASSERT_NE(inst, nullptr) << "instantiate failed: " << err;
+
+    wasm_exec_env_t env = wasm_runtime_create_exec_env(inst, 32768u);
+    ASSERT_NE(env, nullptr);
+
+    /* Call the lo half (lanes 0,1) then the hi half (lanes 2,3);
+     * each call returns one i64 packing two f32 lanes:
+     *   argv[0] = lane2k bits, argv[1] = lane2k+1 bits */
+    for (uint32_t half = 0; half < 2; half++) {
+        const char *name =
+            half == 0 ? "madd_inf_times_zero_lo" : "madd_inf_times_zero_hi";
+        wasm_function_inst_t func = wasm_runtime_lookup_function(inst, name);
+        ASSERT_NE(func, nullptr) << "export `" << name << "` not found";
+
+        uint32_t argv[2] = { 0, 0 };
+        bool ok = wasm_runtime_call_wasm(env, func, 0, argv);
+        EXPECT_TRUE(ok) << "call_wasm `" << name
+                        << "` failed: " << wasm_runtime_get_exception(inst);
+
+        EXPECT_TRUE(f32_bits_are_nan(argv[0]))
+            << name << " lane " << (2 * half) << " not NaN: bits = 0x"
+            << std::hex << argv[0];
+        EXPECT_TRUE(f32_bits_are_nan(argv[1]))
+            << name << " lane " << (2 * half + 1) << " not NaN: bits = 0x"
+            << std::hex << argv[1];
+    }
+
+    wasm_runtime_destroy_exec_env(env);
+    wasm_runtime_deinstantiate(inst);
+    wasm_runtime_unload(module);
+}