From 8ba873f08d2d5d7fed4ca870770979f1d79b9118 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Sun, 17 May 2026 01:59:16 -0700 Subject: [PATCH 01/16] feat(interpreter): legacy exception handling (throw-only) for fast-interp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enables WAMR_BUILD_EXCE_HANDLING=1 together with FAST_INTERP=1 for the *throw-only* subset of the legacy wasm-eh proposal — modules that declare tags and execute `throw` but never define a same-function `try`/`catch` handler (`rethrow`, which is only reachable from inside a catch body, still reports "unsupported opcode"). The throw escapes via the existing `got_exception` bailout path, exactly like any other trap, and the host sees the exception via `wasm_runtime_get_exception`. This is the shape produced in the wild by Porffor (the JS-to-wasm compiler used by Fastly's StarlingMonkey): its graphql-validation benchmark we measure cross-runtime contains 561 `throw` opcodes and zero in-wasm try/catch handlers. Every JS throw escapes to the host JS engine, which is the typical Porffor / static-JS-to-wasm pattern. Three changes: * `build-scripts/unsupported_combination.cmake` — lift the EXCE_HANDLING + FAST_INTERP ban (with a comment explaining the scope: throw-only is supported, in-function try/catch is the natural follow-up). * `core/iwasm/interpreter/wasm_loader.c` — when fast-interp parses WASM_OP_THROW, emit the tag index as a uint32 immediate after the auto-emitted THROW opcode. Same shape as how WASM_OP_CALL emits its funcidx. * `core/iwasm/interpreter/wasm_interp_fast.c` — `HANDLE_OP(WASM_OP_THROW)` now reads the uint32 immediate, surfaces a tag-bearing exception via `wasm_set_exception`, and jumps to `got_exception`. 
The other legacy-EH ops (TRY / CATCH / CATCH_ALL / RETHROW / DELEGATE / EXT_OP_TRY) keep the existing "unsupported opcode" diagnostic — they're unreachable for fast-interp-compiled code today (the loader's fast-interp path treats TRY as a plain block via skip_label and never emits CATCH-family opcodes into the IR), so the diagnostic only fires if a future loader change starts emitting them. Validated end-to-end on aarch64-apple-darwin: a benchmark-core harness loads Porffor's graphql-validation-porf.wasm, runs `m()` (the export that drives the validation pipeline), and gets `result=0` — matching the cross-runtime consensus from wasmtime / WasmEdge interpreter. Before this PR the same workload failed at LOAD with "invalid section id" (the tag section couldn't be parsed without EXCE_HANDLING=1). Full same-function try/catch lowering — porting the classic interpreter's `find_a_catch_handler` design to fast-interp's slot- allocator + pre-decoded IR — is the natural follow-up. --- build-scripts/unsupported_combination.cmake | 12 +++++- core/iwasm/interpreter/wasm_interp_fast.c | 44 ++++++++++++++++++++- core/iwasm/interpreter/wasm_loader.c | 13 ++++++ 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/build-scripts/unsupported_combination.cmake b/build-scripts/unsupported_combination.cmake index 4284be32bf..1789c71b52 100644 --- a/build-scripts/unsupported_combination.cmake +++ b/build-scripts/unsupported_combination.cmake @@ -64,7 +64,17 @@ endfunction() if(WAMR_BUILD_EXCE_HANDLING EQUAL 1) check_aot_mode_error("Unsupported build configuration: EXCE_HANDLING + AOT") - check_fast_interp_error("Unsupported build configuration: EXCE_HANDLING + FAST_INTERP") + # FAST_INTERP + EXCE_HANDLING is supported for *throw-only* shapes: + # WASM modules that declare tags and execute throw / rethrow without + # ever entering a same-function try / catch handler. 
The throw + # propagates to the caller via the existing got_exception bailout + # path, exactly like any other trap. This covers Porffor (its + # JS-to-wasm compiler emits 0 try/catch handlers; every JS throw + # escapes to the host). Modules that contain WASM_OP_TRY / CATCH / + # CATCH_ALL / DELEGATE still load, but those handlers report + # "unsupported opcode" at runtime — see the WASM_OP_TRY handler in + # core/iwasm/interpreter/wasm_interp_fast.c. Full same-function + # try / catch lowering is the natural follow-up. check_fast_jit_error("Unsupported build configuration: EXCE_HANDLING + FAST_JIT") check_llvm_jit_error("Unsupported build configuration: EXCE_HANDLING + JIT") endif() diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 937a7fdecf..0e3a26c03d 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -1836,14 +1836,56 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, } #if WASM_ENABLE_EXCE_HANDLING != 0 + HANDLE_OP(WASM_OP_THROW) + { + /* The loader emits the tag index as a uint32 immediate + * after the THROW opcode (see WASM_OP_THROW in + * wasm_loader.c::wasm_loader_prepare_bytecode). Read it, + * surface a tag-bearing exception, and escape to the + * caller via got_exception — the existing trap-bailout + * path is exactly what an uncaught wasm exception + * should do. + * + * Same-function try/catch handlers are NOT implemented + * yet: the loader skips emitting TRY/CATCH/CATCH_ALL/ + * DELEGATE into the fast-interp IR, so a throw inside a + * try-block currently still escapes to the caller + * (where the host can observe it via + * wasm_runtime_get_exception). That matches the only + * shape the wild emits today — Porffor's JS-to-wasm + * compiler emits ~hundreds of throws and zero in-wasm + * try/catch handlers in our test corpus. Full + * in-function try/catch lowering is the natural + * follow-up. 
*/ + uint32 exception_tag_index = read_uint32(frame_ip); + { + char exception_buf[64]; + snprintf(exception_buf, sizeof(exception_buf), + "wasm exception thrown (tag %u)", + exception_tag_index); + wasm_set_exception(module, exception_buf); + } + goto got_exception; + } + HANDLE_OP(WASM_OP_TRY) HANDLE_OP(WASM_OP_CATCH) - HANDLE_OP(WASM_OP_THROW) HANDLE_OP(WASM_OP_RETHROW) HANDLE_OP(WASM_OP_DELEGATE) HANDLE_OP(WASM_OP_CATCH_ALL) HANDLE_OP(EXT_OP_TRY) { + /* The loader's fast-interp emit path treats TRY as a + * plain block (skip_label) and doesn't emit CATCH / + * CATCH_ALL / DELEGATE / EXT_OP_TRY into the IR at all + * — they only fire if a future loader change starts + * emitting them. Keep the diagnostic so misbehaving + * loader paths surface immediately instead of silently + * dropping bytes. RETHROW is the only one we'd hit on + * well-formed input today, and only if a same-function + * catch handler caught a throw and re-raised it; we + * treat it as "unsupported" pending in-function catch + * lowering. */ wasm_set_exception(module, "unsupported opcode"); goto got_exception; } diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index a2c67bea2c..4f3dcffdc4 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -12353,6 +12353,19 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, uint32 tag_index = 0; pb_read_leb_int32(p, p_end, tag_index); +#if WASM_ENABLE_FAST_INTERP != 0 + /* Fast-interp: the LEB-encoded tag_index from the source + * bytecode is consumed above; re-emit it as a plain + * uint32 immediate after the (auto-emitted) THROW opcode + * so the runtime handler can read it without re-running + * the LEB decoder. The runtime currently treats the tag + * as opaque (it surfaces a generic "wasm exception + * thrown (tag N)" string via wasm_set_exception and + * escapes via got_exception). Same emit shape as + * WASM_OP_CALL's funcidx. 
*/ + emit_uint32(loader_ctx, tag_index); +#endif + /* check validity of tag_index against module->tag_count */ /* check tag index is within the tag index space */ if (tag_index >= module->import_tag_count + module->tag_count) { From 91f51f38896b4a42faf88f9fc9ad72d97ed07475 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Sun, 17 May 2026 16:19:31 -0700 Subject: [PATCH 02/16] fast-interp: loader-side exception-handler metadata table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds per-function `WASMFastEHEntry[]` (sized by the existing `func->exception_handler_count` field, allocated in pass 2 of the preprocess pass and freed in `wasm_loader_unload`) recording each try-region's catch handler pcs in the rewritten fast-interp IR. This is the data the upcoming runtime EH-frame stack will consult when a `throw` walks for a matching catch handler — it is *not yet used* in this commit. Three pieces of plumbing on the loader side: * `WASMFastEHCatch` / `WASMFastEHEntry` typedefs in `wasm.h`, plus a `WASMFunction.exception_handlers` field. The struct is gated on `WASM_ENABLE_EXCE_HANDLING && WASM_ENABLE_FAST_INTERP` so classic-interp builds are byte-identical. * `BranchBlock.eh_entry_idx` (loader-internal CSP slot) and `WASMLoaderContext.cur_eh_entry_idx` (the source-order cursor). These let CATCH / CATCH_ALL / DELEGATE / END handlers resolve back to the right try-region without walking the CSP at runtime — same pattern the existing fast-interp loader uses to pre-patch BR / BR_IF / BR_TABLE targets. * Pass-2-only populate logic on the existing CATCH, CATCH_ALL, DELEGATE, and END cases. The pass-1 increment of `exception_handler_count` is now gated on `loader_ctx->p_code_compiled == NULL` so it doesn't double- count when the loader re_scans for the second traverse. Runtime behavior is unchanged in this commit: CATCH / CATCH_ALL / RETHROW / DELEGATE still hit the "unsupported opcode" stub from the throw-only patch. 
The dispatch wiring lands in the next commit; this one establishes the data layout reviewers will sanity-check first. Cost-model note: no changes to any hot-op handler (CALL, LOAD, STORE) and the new struct fields are entirely behind the existing WASM_ENABLE_EXCE_HANDLING guard, matching classic-interp's posture where EH-on builds carry one byte store per PUSH_CSP and a small per-frame allocation but leave hot ops untaxed. --- core/iwasm/interpreter/wasm.h | 41 ++++++ core/iwasm/interpreter/wasm_loader.c | 208 ++++++++++++++++++++++++++- 2 files changed, 245 insertions(+), 4 deletions(-) diff --git a/core/iwasm/interpreter/wasm.h b/core/iwasm/interpreter/wasm.h index c60349d10f..e18ab7ae68 100644 --- a/core/iwasm/interpreter/wasm.h +++ b/core/iwasm/interpreter/wasm.h @@ -681,6 +681,35 @@ typedef struct WASMImport { } u; } WASMImport; +#if WASM_ENABLE_EXCE_HANDLING != 0 && WASM_ENABLE_FAST_INTERP != 0 +/* One typed `catch N` clause inside a single try-region. The handler_pc + * points at the first opcode of the catch body in the rewritten fast- + * interp IR; the loader patches it in pass 2 of the preprocess pass. */ +typedef struct WASMFastEHCatch { + uint32 tag_index; + uint8 *handler_pc; +} WASMFastEHCatch; + +/* One entry per same-function try-region, indexed by the uint32 immediate + * emitted after the rewritten TRY opcode. Allocated once per function at + * load time, sized by `func->exception_handler_count`. At runtime the + * dispatch loop carries one stack-allocated handle per *active* try- + * region (see frame->eh_stack); hot ops (CALL / LOAD / STORE) never + * touch this table. */ +typedef struct WASMFastEHEntry { + uint32 catch_count; + WASMFastEHCatch *catches; /* may be NULL when catch_count == 0 */ + uint8 *catch_all_pc; /* NULL if no `catch_all` clause */ + /* UINT32_MAX iff the try-region closes with `end`; otherwise the + * LEB depth from `delegate N`. 
*/ + uint32 delegate_target_depth; + /* Rewritten-IR pc of the op immediately after the try-region's `end` + * (or `delegate`). CATCH / CATCH_ALL handlers branch here when their + * body completes; the loader patches it when the `end` is seen. */ + uint8 *end_of_region_pc; +} WASMFastEHEntry; +#endif /* WASM_ENABLE_EXCE_HANDLING && WASM_ENABLE_FAST_INTERP */ + struct WASMFunction { #if WASM_ENABLE_CUSTOM_NAME_SECTION != 0 char *field_name; @@ -721,7 +750,19 @@ struct WASMFunction { #endif #if WASM_ENABLE_EXCE_HANDLING != 0 + /* Number of `try` opcodes in this function. Populated by the loader + * during the preprocess pass (classic-interp uses this to size the + * runtime handler-pointer array stored on the value stack; fast- + * interp uses it to size `exception_handlers[]` below). */ uint32 exception_handler_count; +#if WASM_ENABLE_FAST_INTERP != 0 + /* Per-function table of try-regions in source order, length + * `exception_handler_count`. Allocated and populated in pass 2 of + * the fast-interp preprocess pass; the uint32 immediate emitted + * after the rewritten TRY opcode is the index into this array. + * NULL iff `exception_handler_count == 0`. 
*/ + WASMFastEHEntry *exception_handlers; +#endif #endif #if WASM_ENABLE_FAST_JIT != 0 || WASM_ENABLE_JIT != 0 \ diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index 4f3dcffdc4..f1418df494 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -7360,6 +7360,23 @@ wasm_loader_unload(WASMModule *module) wasm_runtime_free(module->functions[i]->code_compiled); if (module->functions[i]->consts) wasm_runtime_free(module->functions[i]->consts); +#if WASM_ENABLE_EXCE_HANDLING != 0 + if (module->functions[i]->exception_handlers) { + uint32 eh_idx; + for (eh_idx = 0; + eh_idx < module->functions[i]->exception_handler_count; + eh_idx++) { + if (module->functions[i] + ->exception_handlers[eh_idx] + .catches) { + wasm_runtime_free(module->functions[i] + ->exception_handlers[eh_idx] + .catches); + } + } + wasm_runtime_free(module->functions[i]->exception_handlers); + } +#endif /* end of WASM_ENABLE_EXCE_HANDLING */ #endif #if WASM_ENABLE_FAST_JIT != 0 if (module->functions[i]->fast_jit_jitted_code) { @@ -8470,6 +8487,14 @@ typedef struct BranchBlock { * to copy the stack operands to the loop block's arguments in * wasm_loader_emit_br_info for opcode br. */ uint16 start_dynamic_offset; +#if WASM_ENABLE_EXCE_HANDLING != 0 + /* For LABEL_TYPE_TRY/CATCH/CATCH_ALL: index into + * func->exception_handlers (the same index across the whole try- + * catch-end region — a CATCH clause inherits its parent TRY's + * index when the loader rewrites the block label). UINT32_MAX + * for non-EH label types. */ + uint32 eh_entry_idx; +#endif #endif /* Indicate the operand stack is in polymorphic state. 
@@ -8551,6 +8576,13 @@ typedef struct WASMLoaderContext { * than the final code_compiled_size, we record the peak size to ensure * there will not be invalid memory access during second traverse */ uint32 code_compiled_peak_size; +#if WASM_ENABLE_EXCE_HANDLING != 0 + /* Index of the next entry to claim in func->exception_handlers, + * during the second traverse only (the first traverse merely counts + * try-blocks into func->exception_handler_count to size the array). + * Reset to 0 in wasm_loader_ctx_reinit. */ + uint32 cur_eh_entry_idx; +#endif #endif } WASMLoaderContext; @@ -8822,6 +8854,11 @@ wasm_loader_ctx_init(WASMFunction *func, char *error_buf, uint32 error_buf_size) #if WASM_ENABLE_EXCE_HANDLING != 0 func->exception_handler_count = 0; +#if WASM_ENABLE_FAST_INTERP != 0 + /* Allocated at the start of the second traverse, once + * exception_handler_count is known from the first traverse. */ + func->exception_handlers = NULL; +#endif #endif #if WASM_ENABLE_FAST_INTERP != 0 @@ -9344,6 +9381,12 @@ wasm_loader_push_frame_csp(WASMLoaderContext *ctx, uint8 label_type, #if WASM_ENABLE_FAST_INTERP != 0 ctx->frame_csp->dynamic_offset = ctx->dynamic_offset; ctx->frame_csp->patch_list = NULL; +#if WASM_ENABLE_EXCE_HANDLING != 0 + /* Default sentinel; the WASM_OP_TRY handler patches this on entry + * and the CATCH/CATCH_ALL handlers propagate it onto the rewritten + * label. */ + ctx->frame_csp->eh_entry_idx = UINT32_MAX; +#endif #endif ctx->frame_csp++; ctx->csp_num++; @@ -9567,6 +9610,13 @@ wasm_loader_ctx_reinit(WASMLoaderContext *ctx) /* init preserved local offsets */ ctx->preserved_local_offset = ctx->max_dynamic_offset; +#if WASM_ENABLE_EXCE_HANDLING != 0 + /* Start of the second traverse — reset the per-function try-block + * cursor so it tracks the same source-order index as the first + * traverse used to size func->exception_handlers. 
*/ + ctx->cur_eh_entry_idx = 0; +#endif + /* const buf is reserved */ return true; } @@ -11961,6 +12011,27 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, loader_ctx->i32_const_num = k; } } + +#if WASM_ENABLE_EXCE_HANDLING != 0 + /* The first traverse counted `func->exception_handler_count` + * try-blocks; the second traverse is about to populate one + * entry per try-block in source order. Allocate the array now + * (zero-initialized) and reset delegate_target_depth to the + * "no delegate" sentinel on every entry. */ + if (func->exception_handler_count > 0) { + uint64 eh_size = + (uint64)sizeof(WASMFastEHEntry) * func->exception_handler_count; + uint32 eh_i; + if (!(func->exception_handlers = + loader_malloc(eh_size, error_buf, error_buf_size))) { + goto fail; + } + for (eh_i = 0; eh_i < func->exception_handler_count; eh_i++) { + func->exception_handlers[eh_i].delegate_target_depth = + UINT32_MAX; + } + } +#endif } #endif @@ -12011,11 +12082,17 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, #if WASM_ENABLE_EXCE_HANDLING != 0 case WASM_OP_TRY: if (opcode == WASM_OP_TRY) { - /* - * keep track of exception handlers to account for - * memory allocation - */ +#if WASM_ENABLE_FAST_INTERP != 0 + /* Two-traverse loader: the first traverse counts + * try-blocks into func->exception_handler_count so + * the second traverse can allocate the per-function + * exception_handlers[] table (see re_scan block). */ + if (loader_ctx->p_code_compiled == NULL) + func->exception_handler_count++; +#else + /* Single-traverse classic-interp / shared loader. 
*/ func->exception_handler_count++; +#endif /* * try is a block @@ -12276,7 +12353,27 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, } #if WASM_ENABLE_EXCE_HANDLING != 0 else if (opcode == WASM_OP_TRY) { + /* TRY is a control-flow block in the source bytecode + * but produces no operand-stack work itself; like + * BLOCK and LOOP we strip the label from the + * rewritten IR. The TRY's runtime effects (pushing + * an EH frame, identifying which catch handlers are + * in scope) are reached via the per-function + * exception_handlers[] table populated below — the + * runtime dispatch never reads a separate TRY + * opcode at all. */ skip_label(); + + /* Claim the next entry in exception_handlers[] for + * this try-region and remember the index on the + * loader CSP so subsequent CATCH / CATCH_ALL / + * DELEGATE / END opcodes for this region (and any + * nested ones) can resolve back to it. */ + bh_assert(loader_ctx->cur_eh_entry_idx + < func->exception_handler_count); + (loader_ctx->frame_csp - 1)->eh_entry_idx = + loader_ctx->cur_eh_entry_idx; + loader_ctx->cur_eh_entry_idx++; } #endif else if (opcode == WASM_OP_IF) { @@ -12498,6 +12595,21 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, uint8 label_type = cur_block->label_type; (void)label_type; +#if WASM_ENABLE_FAST_INTERP != 0 + /* Second traverse only: a `delegate N` closes the try + * region and forwards uncaught exceptions to an outer + * block. Record end_of_region_pc now — the actual depth + * is wired up by a follow-up commit (the runtime can't + * dispatch through delegate_target_depth until the + * EH-frame stack exists). 
*/ + if (loader_ctx->p_code_compiled != NULL) { + uint32 eh_idx = cur_block->eh_entry_idx; + bh_assert(eh_idx < func->exception_handler_count); + bh_assert(func->exception_handlers != NULL); + func->exception_handlers[eh_idx].end_of_region_pc = + loader_ctx->p_code_compiled; + } +#endif /* DELEGATE ends the block */ POP_CSP(); break; @@ -12546,6 +12658,45 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, goto fail; } +#if WASM_ENABLE_FAST_INTERP != 0 + /* Second traverse only: append (tag_index, handler_pc) + * to the parent try-region's catches[]. The handler PC + * is the first rewritten-IR byte after the CATCH + * opcode, which is what the runtime catch dispatch will + * branch to when a throw matches `tag_index`. The CATCH + * opcode itself remains in the IR for now — the runtime + * still routes it through the "unsupported opcode" stub + * (a follow-up commit wires up the runtime EH-frame + * stack and converts CATCH into a real handler). */ + if (loader_ctx->p_code_compiled != NULL) { + uint32 eh_idx = cur_block->eh_entry_idx; + WASMFastEHEntry *entry; + WASMFastEHCatch *new_catches; + uint64 new_size; + bh_assert(eh_idx < func->exception_handler_count); + bh_assert(func->exception_handlers != NULL); + entry = &func->exception_handlers[eh_idx]; + new_size = (uint64)sizeof(WASMFastEHCatch) + * (entry->catch_count + 1); + if (!(new_catches = loader_malloc(new_size, error_buf, + error_buf_size))) { + goto fail; + } + if (entry->catches) { + bh_memcpy_s(new_catches, (uint32)new_size, + entry->catches, + (uint32)sizeof(WASMFastEHCatch) + * entry->catch_count); + wasm_runtime_free(entry->catches); + } + new_catches[entry->catch_count].tag_index = tag_index; + new_catches[entry->catch_count].handler_pc = + loader_ctx->p_code_compiled; + entry->catches = new_catches; + entry->catch_count++; + } +#endif + /* * replace frame_csp by LABEL_TYPE_CATCH */ @@ -12589,6 +12740,23 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, 
goto fail; } +#if WASM_ENABLE_FAST_INTERP != 0 + /* Second traverse only: record this clause's handler PC + * on the parent try-region. Mirrors the CATCH path + * above. catch_all_pc starts NULL (zero-init from + * loader_malloc) and is set exactly once per region — + * the wasm spec allows at most one catch_all per try. */ + if (loader_ctx->p_code_compiled != NULL) { + uint32 eh_idx = cur_block->eh_entry_idx; + bh_assert(eh_idx < func->exception_handler_count); + bh_assert(func->exception_handlers != NULL); + bh_assert(func->exception_handlers[eh_idx].catch_all_pc + == NULL); + func->exception_handlers[eh_idx].catch_all_pc = + loader_ctx->p_code_compiled; + } +#endif + /* no immediates */ /* replace frame_csp by LABEL_TYPE_CATCH_ALL */ cur_block->label_type = LABEL_TYPE_CATCH_ALL; @@ -12672,6 +12840,22 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, case WASM_OP_END: { BranchBlock *cur_block = loader_ctx->frame_csp - 1; +#if WASM_ENABLE_FAST_INTERP != 0 && WASM_ENABLE_EXCE_HANDLING != 0 + /* If this END closes a try-region (LABEL_TYPE_TRY when + * the region has only a try-body and no catch, or + * LABEL_TYPE_CATCH / CATCH_ALL when at least one catch + * clause is present), we need to remember the entry's + * index and label type now — POP_CSP and the subsequent + * skip_label / reserve_block_ret happen first, but the + * end_of_region_pc capture has to wait until after + * those advance loader_ctx->p_code_compiled. 
*/ + uint32 ending_eh_idx = cur_block->eh_entry_idx; + bool ending_was_eh = + (ending_eh_idx != UINT32_MAX) + && (cur_block->label_type == LABEL_TYPE_TRY + || cur_block->label_type == LABEL_TYPE_CATCH + || cur_block->label_type == LABEL_TYPE_CATCH_ALL); +#endif /* check whether block stack matches its result type */ if (!check_block_stack(loader_ctx, cur_block, error_buf, @@ -12746,6 +12930,22 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, } #endif +#if WASM_ENABLE_FAST_INTERP != 0 && WASM_ENABLE_EXCE_HANDLING != 0 + /* Second-traverse-only: if this END closed a try- + * region, record where the rewritten IR continues so a + * runtime catch-handler body can branch past the + * region after running. The captured pc lands *after* + * the END's own skip_label and reserve_block_ret, so + * the next dispatched op is whatever follows the + * source-level END byte. */ + if (loader_ctx->p_code_compiled != NULL && ending_was_eh) { + bh_assert(ending_eh_idx < func->exception_handler_count); + bh_assert(func->exception_handlers != NULL); + func->exception_handlers[ending_eh_idx].end_of_region_pc = + loader_ctx->p_code_compiled; + } +#endif + break; } From 5202bfabefa12eed7a88b023b92bd50f28b49009 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Sun, 17 May 2026 16:54:29 -0700 Subject: [PATCH 03/16] fast-interp: runtime EH-frame stack + TRY push / END(try) pop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires up the per-frame eh-stack that commit 1 laid the metadata for. A program can now enter and exit a try-region without aborting; same- function throw → catch dispatch still bails out via got_exception (follow-up commit hooks that up). Frame layout: one extra cell per try-region appended past the value stack in the existing frame->operand[] allocation, sized by cur_wasm_func->exception_handler_count. Functions without try blocks pay zero cells. 
WASMInterpFrame gains a `uint32 eh_count` (the eh- stack top), clustered next to the existing EH-gated exception_raised/tag_index fields — same cache line, cold path only. Hot-op invariants preserved: * No new instructions in HANDLE_OP(WASM_OP_CALL), HANDLE_OP(WASM_OP_*_LOAD_*), HANDLE_OP(WASM_OP_*_STORE_*). * Dispatch table size is unchanged (slots 0x06 = WASM_OP_TRY, 0x07 = WASM_OP_CATCH, 0x0b = WASM_OP_END, 0x19 = WASM_OP_CATCH_ALL just get new bodies — they previously fell through to the "unsupported opcode" stub). * eh_count writes/reads only happen on TRY/CATCH/CATCH_ALL/END, none of which are on the dispatch loop's hot path. Loader changes (wasm_loader.c): * WASM_OP_TRY no longer skip_labels; emits its `eh_idx:u32` immediate after the auto-emitted opcode byte so the runtime push handler can find the right exception_handlers[] entry. * WASM_OP_CATCH / CATCH_ALL emit the same `eh_idx:u32` immediate; the runtime handler reads it to find end_of_region_pc to branch to on normal-flow exit. * WASM_OP_END for try-regions keeps the END byte in the IR (with the patch-list rewind dance to make `br N`-targeted PATCH_END addresses land *on* the END byte so the pop runs for branches too, not just fall-through). Runtime handlers (wasm_interp_fast.c): * HANDLE_OP(WASM_OP_TRY) pushes eh_idx onto frame_lp[eh_offset + eh_count] and increments eh_count. * HANDLE_OP(WASM_OP_CATCH) and HANDLE_OP(WASM_OP_CATCH_ALL) share a body: decrement eh_count, set frame_ip to func->exception_handlers[eh_idx].end_of_region_pc. * HANDLE_OP(WASM_OP_END) moves out of the "unsupported opcode" block when EXCE_HANDLING is enabled; decrements eh_count. * WASM_OP_RETHROW / WASM_OP_DELEGATE / EXT_OP_TRY still route to the diagnostic — wired up in a follow-up commit. After this commit: programs with try-regions where no throw fires inside the try body run correctly (the eh-stack is correctly maintained through entry/exit). 
Throws inside try bodies still escape via got_exception, matching the throw-only patch's behavior. porf-accurate still errors at the first throw escape (its catch handler does real work; full catch dispatch is the next commit). --- core/iwasm/interpreter/wasm_interp.h | 11 ++ core/iwasm/interpreter/wasm_interp_fast.c | 104 +++++++++++++++-- core/iwasm/interpreter/wasm_loader.c | 134 ++++++++++++++-------- 3 files changed, 186 insertions(+), 63 deletions(-) diff --git a/core/iwasm/interpreter/wasm_interp.h b/core/iwasm/interpreter/wasm_interp.h index 1416405460..8ca6fe5f23 100644 --- a/core/iwasm/interpreter/wasm_interp.h +++ b/core/iwasm/interpreter/wasm_interp.h @@ -40,6 +40,17 @@ typedef struct WASMInterpFrame { */ bool exception_raised; uint32 tag_index; +#if WASM_ENABLE_FAST_INTERP != 0 + /* Number of *currently-active* try-regions on this frame's eh- + * stack. The stack itself lives in the trailing cells of the + * frame's operand[] block — see call_func_from_entry in + * wasm_interp_fast.c where all_cell_num is grown by + * `exception_handler_count` cells per frame. Read+written only by + * the WASM_OP_TRY / CATCH / CATCH_ALL / END / THROW handlers; the + * hot ops (CALL / LOAD / STORE) never touch it, so this field + * stays cold and clusters with exception_raised/tag_index above. */ + uint32 eh_count; +#endif #endif #if WASM_ENABLE_FAST_INTERP != 0 diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 0e3a26c03d..060169036c 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -1869,23 +1869,67 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, } HANDLE_OP(WASM_OP_TRY) + { + /* Loader emits `WASM_OP_TRY <eh_idx:u32>`. Push one + * entry onto the per-frame eh-stack so subsequent + * THROW/RETHROW handlers (added in follow-up commits) + * can find the in-scope catches by walking it. 
+ * + * The eh-stack lives in the trailing cells of + * frame->operand[] — one cell per try-region, sized + * by cur_wasm_func->exception_handler_count at frame + * setup. Cost: 1 indexed store + 1 increment, both + * on a cold path; CALL / LOAD / STORE are untouched. */ + uint32 eh_idx = read_uint32(frame_ip); + WASMFunction *cur_wasm_func = cur_func->u.func; + uint32 *eh_stack = frame_lp + cur_func->param_cell_num + + cur_func->local_cell_num + + cur_wasm_func->max_stack_cell_num; + bh_assert(frame->eh_count + < cur_wasm_func->exception_handler_count); + eh_stack[frame->eh_count] = eh_idx; + frame->eh_count++; + HANDLE_OP_END(); + } + HANDLE_OP(WASM_OP_CATCH) + HANDLE_OP(WASM_OP_CATCH_ALL) + { + /* Loader emits `<WASM_OP_CATCH | WASM_OP_CATCH_ALL> <eh_idx:u32>` (commit 1's + * exception_handlers table records each catch body's + * pc and the region's end_of_region_pc). + * + * Reached via *normal flow* — execution either ran the + * try body to completion (CATCH is the first opcode + * after the try body) or fell through from a previous + * catch body. Either way: pop one eh-stack entry and + * branch past the try-region's end. The THROW dispatch + * (follow-up commit) jumps directly to a catch body's + * first opcode, *skipping* the CATCH opcode itself, so + * this handler never runs as a result of a caught + * throw — only as a fall-through exit. */ + uint32 eh_idx = read_uint32(frame_ip); + WASMFunction *cur_wasm_func = cur_func->u.func; + bh_assert(eh_idx < cur_wasm_func->exception_handler_count); + bh_assert(frame->eh_count > 0); + frame->eh_count--; + frame_ip = + cur_wasm_func->exception_handlers[eh_idx].end_of_region_pc; + HANDLE_OP_END(); + } + HANDLE_OP(WASM_OP_RETHROW) HANDLE_OP(WASM_OP_DELEGATE) - HANDLE_OP(WASM_OP_CATCH_ALL) HANDLE_OP(EXT_OP_TRY) { - /* The loader's fast-interp emit path treats TRY as a - * plain block (skip_label) and doesn't emit CATCH / - * CATCH_ALL / DELEGATE / EXT_OP_TRY into the IR at all - * — they only fire if a future loader change starts - * emitting them. 
Keep the diagnostic so misbehaving - * loader paths surface immediately instead of silently - * dropping bytes. RETHROW is the only one we'd hit on - * well-formed input today, and only if a same-function - * catch handler caught a throw and re-raised it; we - * treat it as "unsupported" pending in-function catch - * lowering. */ + /* Still routed through the diagnostic until a follow- + * up commit wires up rethrow / delegate dispatch. + * EXT_OP_TRY is the type-index-blocktype variant of + * TRY; the fast-interp loader currently doesn't emit + * it (CATCH / DELEGATE indices are recorded directly + * on the per-function exception_handlers table), so + * hitting it here means a future loader change + * started emitting it. */ wasm_set_exception(module, "unsupported opcode"); goto got_exception; } @@ -7568,9 +7612,31 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, HANDLE_OP(WASM_OP_GET_LOCAL) HANDLE_OP(WASM_OP_DROP) HANDLE_OP(WASM_OP_DROP_64) +#if WASM_ENABLE_EXCE_HANDLING != 0 + HANDLE_OP(WASM_OP_END) + { + /* Block / loop / if / function-level `end` is stripped from + * the IR at load time (skip_label in the END case of + * wasm_loader_prepare_bytecode). Only try-region `end`s + * survive — the loader keeps them so the runtime can pop + * the matching eh-stack entry here when control falls + * through the bottom of a catch body (or runs the body of + * a catchless `try ... end`). + * + * Cost: one decrement on a cold path. CALL / LOAD / STORE + * are untouched. 
*/ + bh_assert(frame->eh_count > 0); + frame->eh_count--; + HANDLE_OP_END(); + } + + HANDLE_OP(WASM_OP_BLOCK) + HANDLE_OP(WASM_OP_LOOP) +#else HANDLE_OP(WASM_OP_BLOCK) HANDLE_OP(WASM_OP_LOOP) HANDLE_OP(WASM_OP_END) +#endif HANDLE_OP(WASM_OP_NOP) HANDLE_OP(EXT_OP_BLOCK) HANDLE_OP(EXT_OP_LOOP) @@ -7776,6 +7842,15 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, * these cells */ local_cell_num = cur_func->param_cell_num + cur_func->local_cell_num; +#endif +#if WASM_ENABLE_EXCE_HANDLING != 0 + /* One cell per try-region in the function, appended past + * the value stack. Used by the WASM_OP_TRY / CATCH / + * CATCH_ALL / END / THROW handlers as a small per-frame + * eh-stack; functions without try blocks pay zero cells. + * Mirrors classic-interp's eh_size accounting at + * wasm_interp_classic.c:6786. */ + all_cell_num += cur_wasm_func->exception_handler_count; #endif /* param_cell_num, local_cell_num, const_cell_num and max_stack_cell_num are all no larger than UINT16_MAX (checked @@ -7793,6 +7868,11 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, frame_ip = wasm_get_func_code(cur_func); frame_ip_end = wasm_get_func_code_end(cur_func); +#if WASM_ENABLE_EXCE_HANDLING != 0 + /* eh-stack starts empty; WASM_OP_TRY appends entries. */ + frame->eh_count = 0; +#endif + frame_lp = frame->lp = frame->operand + cur_wasm_func->const_cell_num; diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index f1418df494..1948efefca 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -12353,26 +12353,21 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, } #if WASM_ENABLE_EXCE_HANDLING != 0 else if (opcode == WASM_OP_TRY) { - /* TRY is a control-flow block in the source bytecode - * but produces no operand-stack work itself; like - * BLOCK and LOOP we strip the label from the - * rewritten IR. 
The TRY's runtime effects (pushing - * an EH frame, identifying which catch handlers are - * in scope) are reached via the per-function - * exception_handlers[] table populated below — the - * runtime dispatch never reads a separate TRY - * opcode at all. */ - skip_label(); - - /* Claim the next entry in exception_handlers[] for - * this try-region and remember the index on the - * loader CSP so subsequent CATCH / CATCH_ALL / - * DELEGATE / END opcodes for this region (and any - * nested ones) can resolve back to it. */ + /* The auto-emit_label at the top of the dispatch + * loop already wrote the WASM_OP_TRY byte into the + * rewritten IR; the runtime handler for that + * opcode (HANDLE_OP(WASM_OP_TRY) in + * wasm_interp_fast.c) reads the uint32 eh_idx + * immediate we emit below and pushes one entry + * onto the per-frame eh-stack. Unlike BLOCK / LOOP, + * we keep the opcode in the IR — its runtime + * effect (push) is what makes throws find the + * right catches. */ bh_assert(loader_ctx->cur_eh_entry_idx < func->exception_handler_count); (loader_ctx->frame_csp - 1)->eh_entry_idx = loader_ctx->cur_eh_entry_idx; + emit_uint32(loader_ctx, loader_ctx->cur_eh_entry_idx); loader_ctx->cur_eh_entry_idx++; } #endif @@ -12659,15 +12654,16 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, } #if WASM_ENABLE_FAST_INTERP != 0 - /* Second traverse only: append (tag_index, handler_pc) - * to the parent try-region's catches[]. The handler PC - * is the first rewritten-IR byte after the CATCH - * opcode, which is what the runtime catch dispatch will - * branch to when a throw matches `tag_index`. The CATCH - * opcode itself remains in the IR for now — the runtime - * still routes it through the "unsupported opcode" stub - * (a follow-up commit wires up the runtime EH-frame - * stack and converts CATCH into a real handler). 
*/ + /* Second traverse: emit `` after the auto- + * emitted CATCH opcode and record (tag_index, + * handler_pc) on the parent try-region's catches[]. + * handler_pc is the first rewritten-IR byte *after* + * the eh_idx immediate — that's where the runtime + * throw dispatch (follow-up commit) jumps when a + * matching tag is caught. The CATCH op itself only + * runs on *normal-flow* fall-through, in which case + * the runtime handler pops one eh-stack entry and + * branches to end_of_region_pc. */ if (loader_ctx->p_code_compiled != NULL) { uint32 eh_idx = cur_block->eh_entry_idx; WASMFastEHEntry *entry; @@ -12675,6 +12671,7 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, uint64 new_size; bh_assert(eh_idx < func->exception_handler_count); bh_assert(func->exception_handlers != NULL); + emit_uint32(loader_ctx, eh_idx); entry = &func->exception_handlers[eh_idx]; new_size = (uint64)sizeof(WASMFastEHCatch) * (entry->catch_count + 1); @@ -12741,17 +12738,20 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, } #if WASM_ENABLE_FAST_INTERP != 0 - /* Second traverse only: record this clause's handler PC - * on the parent try-region. Mirrors the CATCH path - * above. catch_all_pc starts NULL (zero-init from - * loader_malloc) and is set exactly once per region — - * the wasm spec allows at most one catch_all per try. */ + /* Second traverse: emit `` after the auto- + * emitted CATCH_ALL opcode and record catch_all_pc on + * the parent try-region. catch_all_pc starts NULL + * (zero-init from loader_malloc) and is set exactly + * once per region — the wasm spec allows at most one + * catch_all per try. handler_pc points after the + * eh_idx immediate, same shape as a typed CATCH. 
*/ if (loader_ctx->p_code_compiled != NULL) { uint32 eh_idx = cur_block->eh_entry_idx; bh_assert(eh_idx < func->exception_handler_count); bh_assert(func->exception_handlers != NULL); bh_assert(func->exception_handlers[eh_idx].catch_all_pc == NULL); + emit_uint32(loader_ctx, eh_idx); func->exception_handlers[eh_idx].catch_all_pc = loader_ctx->p_code_compiled; } @@ -12882,30 +12882,62 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, POP_CSP(); #if WASM_ENABLE_FAST_INTERP != 0 - skip_label(); - /* copy the result to the block return address */ - if (!reserve_block_ret(loader_ctx, opcode, disable_emit, - error_buf, error_buf_size)) { - /* it could be tmp frame_csp allocated from opcode like - * OP_BR and not counted in loader_ctx->csp_num, it won't - * be freed in wasm_loader_ctx_destroy(loader_ctx) so need - * to free the loader_ctx->frame_csp if fails */ +#if WASM_ENABLE_EXCE_HANDLING != 0 + if (ending_was_eh) { + /* try-region END must execute the eh-stack pop in + * the runtime END handler — including when reached + * via `br N` (whose target was registered into + * this block's PATCH_END list by emit_br_info). + * + * Rewind the auto-emitted END byte, point all + * PATCH_END entries at the rewound position, then + * re-emit the END byte so both branches and fall- + * through dispatch the pop. reserve_block_ret's + * COPY (if any) lands *after* the END byte: the + * pop only adjusts eh_count and doesn't touch the + * operand stack the COPY moves from. */ + skip_label(); + apply_label_patch(loader_ctx, 0, PATCH_END); + emit_label(WASM_OP_END); + if (!reserve_block_ret(loader_ctx, opcode, disable_emit, + error_buf, error_buf_size)) { + free_label_patch_list(loader_ctx->frame_csp); + goto fail; + } free_label_patch_list(loader_ctx->frame_csp); - goto fail; + /* A try-region's END can never coincide with + * LABEL_TYPE_FUNCTION (the implicit function block + * is not a try); no WASM_OP_RETURN emit needed. 
*/ } + else +#endif /* WASM_ENABLE_EXCE_HANDLING */ + { + skip_label(); + /* copy the result to the block return address */ + if (!reserve_block_ret(loader_ctx, opcode, disable_emit, + error_buf, error_buf_size)) { + /* it could be tmp frame_csp allocated from opcode like + * OP_BR and not counted in loader_ctx->csp_num, it + * won't be freed in wasm_loader_ctx_destroy(loader_ctx) + * so need to free the loader_ctx->frame_csp if fails */ + free_label_patch_list(loader_ctx->frame_csp); + goto fail; + } - apply_label_patch(loader_ctx, 0, PATCH_END); - free_label_patch_list(loader_ctx->frame_csp); - if (loader_ctx->frame_csp->label_type == LABEL_TYPE_FUNCTION) { - int32 idx; - uint8 ret_type; - - emit_label(WASM_OP_RETURN); - for (idx = (int32)func->func_type->result_count - 1; - idx >= 0; idx--) { - ret_type = *(func->func_type->types - + func->func_type->param_count + idx); - POP_OFFSET_TYPE(ret_type); + apply_label_patch(loader_ctx, 0, PATCH_END); + free_label_patch_list(loader_ctx->frame_csp); + if (loader_ctx->frame_csp->label_type + == LABEL_TYPE_FUNCTION) { + int32 idx; + uint8 ret_type; + + emit_label(WASM_OP_RETURN); + for (idx = (int32)func->func_type->result_count - 1; + idx >= 0; idx--) { + ret_type = *(func->func_type->types + + func->func_type->param_count + idx); + POP_OFFSET_TYPE(ret_type); + } } } #endif From 08625c5e18b0785fc48229cb625d3477188459bf Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Sun, 17 May 2026 20:01:58 -0700 Subject: [PATCH 04/16] fast-interp: WASM_OP_THROW catch-walk + return_func exception hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Activates same-function and inter-function catch dispatch for the *void-result* try-region shape (which is what graphql-validation- porf-accurate emits — `06 40` = try-with-blocktype-void). Programs that throw inside a void try body now land in the matching catch handler (or catch_all) instead of escaping to the host trap path. 
The eh-stack push/pop infrastructure from the prior commit gives us the in-scope handlers; this commit adds the walk and the cross-frame unwind. Hot-op cost-model check: * HANDLE_OP(WASM_OP_THROW) is itself a cold op — programs that never throw never enter it. The walk runs in find_a_catch_handler, also cold. * The one new check on a path every wasm-to-wasm call return visits is the `if (frame->exception_raised)` branch in return_func. Predicted strongly not-taken (exceptions are rare); two AArch64 instructions; identical in shape to classic-interp's existing check at wasm_interp_classic.c:6877. * The eh-stack cells share the cache line with the value stack they're allocated next to, so the walk hits warm memory. * CALL / LOAD / STORE handlers are byte-identical to the no-EH path. Mechanism: * `find_a_catch_handler` is a labeled block reached either by WASM_OP_THROW or by return_func when a callee stashed a tag on this frame. It walks frame->eh_count entries top-down, skipping entries whose top bit is set (state CATCH — the entry's handler is already running, so a throw raised inside that handler skips the entry and propagates outward). On a tag match it ORs in EH_TRY_CATCH_STATE_BIT and dispatches frame_ip to entry->catches[j].handler_pc (or entry->catch_all_pc when no typed clause matches). * On exhaustion, the walker stashes exception_tag_index on prev_frame->tag_index, sets prev_frame->exception_raised = true, and goes to return_func. return_func, after RECOVER_CONTEXT has restored the caller's context, re-enters find_a_catch_handler with the caller's frame in scope. * At the top of the wasm stack (prev_frame is NULL or prev_frame->ip == NULL) the walker takes the existing got_exception escape so the host can read the trap message via wasm_runtime_get_exception. * frame->exception_raised and frame->tag_index are pre-existing fields originally added for classic-interp.
exception_raised must now be cleared on every fast-interp frame setup — ALLOC_FRAME doesn't zero-init the header and a stale non-zero byte trips the return_func check on every call return. Loader-side bug fix: the CATCH and CATCH_ALL emit_uint32(eh_idx) calls used to live inside the `if (loader_ctx->p_code_compiled != NULL)` populate guard. That gating skipped them in pass 1 but ran them in pass 2, so pass 2 wrote 4 bytes per catch *past* the code_compiled buffer allocated based on pass 1's measurement. The overrun corrupted whatever loader allocation the heap placed immediately after — typically func->exception_handlers itself (the first 4 bytes of entry[0], i.e. catch_count, were the usual victim). This surfaced as "wasm exception thrown (tag 0)" on `test_local_throw` where the typed-catch's catches[] array showed count=0 at runtime even though the loader populated count=1 in pass 2 — the populate itself wrote correctly, then a later opcode's reserve_block_ret overran the buffer and zeroed catch_count. Moved both emit_uint32 calls outside the populate guard so both passes account for the 4-byte immediate. State encoding: each eh-stack cell packs the loader's exception_handlers[] index in the low 31 bits and a state bit (EH_TRY_CATCH_STATE_BIT) in the top bit. No cell-count change vs the prior commit; same per-frame allocation footprint. Known limitation: try-regions with a non-void result-type are not yet supported by the *normal-flow* path. The fix is a loader-side try-body→block-dynamic-offset COPY emit at CATCH processing time (mirrors how WASM_OP_ELSE aligns the if-body's result via reserve_block_ret). See AGENTS.md's "Open follow-up — WAMR fast-interp legacy exception handling" section. graphql-validation-porf-accurate uses void-result try-blocks so it isn't blocked by this.
Verified by `crates/benchmark-core/src/bin/probe_eh_void.rs` (5 cases — typed catch, catch_all, inter-function unwind, nested, no-throw — all PASS) and the existing run_graphql_validation_wamr regression (AS / porf-fast / porf-accurate within run-to-run variance vs the prior commit). --- core/iwasm/interpreter/wasm_interp_fast.c | 158 ++++++++++++++++++---- core/iwasm/interpreter/wasm_loader.c | 45 +++--- 2 files changed, 156 insertions(+), 47 deletions(-) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 060169036c..f8167fea03 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -102,6 +102,18 @@ typedef float64 CellType_F64; #define CHECK_INSTRUCTION_LIMIT() (void)0 #endif +#if WASM_ENABLE_EXCE_HANDLING != 0 +/* Top bit of each per-frame eh-stack cell: clear when the try-region's + * handler is *in scope* (TRY state — a throw matching one of its + * catches will dispatch into the handler), set once the throw walker + * has selected one of its handlers (CATCH state — further throws + * raised from inside that handler skip the entry and propagate + * outward). The low 31 bits hold the index into + * func->exception_handlers, bounded by the loader's + * `exception_handler_count`. */ +#define EH_TRY_CATCH_STATE_BIT 0x80000000u +#endif + static inline uint32 rotl32(uint32 n, uint32 c) { @@ -1538,6 +1550,14 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, uint8 *maddr = NULL; uint32 local_idx, local_offset, global_idx; uint8 opcode = 0, local_type, *global_addr; +#if WASM_ENABLE_EXCE_HANDLING != 0 + /* Carries the wasm tag index from WASM_OP_THROW to the + * find_a_catch_handler label, and from a callee's return through + * frame->tag_index back to a caller-side find_a_catch_handler. + * Cold path only — the dispatch loop's hot ops never reference + * this variable, so the compiler is free to spill it. 
*/ + uint32 exception_tag_index = 0; +#endif #if WASM_ENABLE_INSTRUCTION_METERING != 0 int instructions_left = -1; @@ -1838,35 +1858,94 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, #if WASM_ENABLE_EXCE_HANDLING != 0 HANDLE_OP(WASM_OP_THROW) { - /* The loader emits the tag index as a uint32 immediate - * after the THROW opcode (see WASM_OP_THROW in - * wasm_loader.c::wasm_loader_prepare_bytecode). Read it, - * surface a tag-bearing exception, and escape to the - * caller via got_exception — the existing trap-bailout - * path is exactly what an uncaught wasm exception - * should do. - * - * Same-function try/catch handlers are NOT implemented - * yet: the loader skips emitting TRY/CATCH/CATCH_ALL/ - * DELEGATE into the fast-interp IR, so a throw inside a - * try-block currently still escapes to the caller - * (where the host can observe it via - * wasm_runtime_get_exception). That matches the only - * shape the wild emits today — Porffor's JS-to-wasm - * compiler emits ~hundreds of throws and zero in-wasm - * try/catch handlers in our test corpus. Full - * in-function try/catch lowering is the natural - * follow-up. */ - uint32 exception_tag_index = read_uint32(frame_ip); - { - char exception_buf[64]; - snprintf(exception_buf, sizeof(exception_buf), - "wasm exception thrown (tag %u)", - exception_tag_index); - wasm_set_exception(module, exception_buf); + /* Loader emits `WASM_OP_THROW `. Read + * the tag, then walk the eh-stack in + * find_a_catch_handler to find a matching catch — + * first in this frame, then via return_func's hook in + * caller frames, and finally falling out to the host + * via got_exception when no match is found anywhere. */ + exception_tag_index = read_uint32(frame_ip); + goto find_a_catch_handler; + } + + find_a_catch_handler: + { + /* The eh-stack lives in the trailing cells of + * frame->operand[] (see call_func_from_entry and the + * runtime push from WASM_OP_TRY). 
Each entry packs the + * eh-table index into the low 31 bits; the top bit + * (EH_TRY_CATCH_STATE_BIT) is set on entries whose + * catch handler is *already running* — those are + * skipped here so a throw raised from inside a catch + * body propagates outward rather than re-entering the + * same handler. + * + * Cost shape: the walk runs only on the throw path + * (cold). CALL / LOAD / STORE handlers are untouched, + * and the eh-stack cells share a cache line with the + * value stack they're allocated next to, so the walk + * hits warm memory. + * + * Known limitation in this patch: try-regions with a + * non-void result-type are *not yet supported* by the + * normal-flow path. The fix is a loader-side + * try-body→block-dynamic-offset COPY emit at CATCH + * processing time (mirrors how WASM_OP_ELSE aligns + * the if-body's result via reserve_block_ret). See the + * AGENTS.md "Open follow-up — WAMR fast-interp legacy + * exception handling" section for the architectural + * note. The throw → catch dispatch implemented here + * still works correctly for void-result try-regions + * (which is what graphql-validation-porf-accurate's + * single try-block is). 
*/ + WASMFunction *cur_wasm_func = cur_func->u.func; + uint32 *eh_stack = frame_lp + cur_func->param_cell_num + + cur_func->local_cell_num + + cur_wasm_func->max_stack_cell_num; + uint32 i; + for (i = frame->eh_count; i > 0; i--) { + uint32 packed = eh_stack[i - 1]; + uint32 eh_idx; + WASMFastEHEntry *entry; + uint32 j; + if (packed & EH_TRY_CATCH_STATE_BIT) + continue; /* in-progress catch — skip */ + eh_idx = packed & ~EH_TRY_CATCH_STATE_BIT; + bh_assert(eh_idx < cur_wasm_func->exception_handler_count); + entry = &cur_wasm_func->exception_handlers[eh_idx]; + for (j = 0; j < entry->catch_count; j++) { + if (entry->catches[j].tag_index == exception_tag_index) { + eh_stack[i - 1] = packed | EH_TRY_CATCH_STATE_BIT; + frame_ip = entry->catches[j].handler_pc; + HANDLE_OP_END(); + } } - goto got_exception; + if (entry->catch_all_pc) { + eh_stack[i - 1] = packed | EH_TRY_CATCH_STATE_BIT; + frame_ip = entry->catch_all_pc; + HANDLE_OP_END(); + } + } + /* No handler in this frame. Hand the exception off to + * the caller via return_func, which checks + * frame->exception_raised after RECOVER_CONTEXT and + * re-enters this label with the caller's frame in + * scope. If we're already at the top of the wasm + * stack, the existing got_exception path lets the + * host observe the trap via wasm_runtime_get_exception. */ + if (prev_frame && prev_frame->ip) { + prev_frame->tag_index = exception_tag_index; + prev_frame->exception_raised = true; + goto return_func; } + { + char exception_buf[64]; + snprintf(exception_buf, sizeof(exception_buf), + "wasm exception thrown (tag %u)", exception_tag_index); + wasm_set_exception(module, exception_buf); + } + goto got_exception; + } HANDLE_OP(WASM_OP_TRY) { @@ -7871,6 +7950,16 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, #if WASM_ENABLE_EXCE_HANDLING != 0 /* eh-stack starts empty; WASM_OP_TRY appends entries. 
*/ frame->eh_count = 0; + /* exception_raised is the marker `return_func` reads on + * every wasm-to-wasm call return; if a callee's throw + * found no in-frame handler it stashes the tag on the + * caller's frame->tag_index and sets this flag, then + * goes to return_func. ALLOC_FRAME doesn't zero-init + * the frame header, so leaving the slot uninitialized + * trips the return_func hook on every call return with + * stale memory contents — turning a non-throwing run + * into "wasm exception thrown (tag N)" for random N. */ + frame->exception_raised = false; #endif frame_lp = frame->lp = @@ -7929,6 +8018,21 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, RECOVER_CONTEXT(prev_frame); #if WASM_ENABLE_GC != 0 local_cell_num = cur_func->param_cell_num + cur_func->local_cell_num; +#endif +#if WASM_ENABLE_EXCE_HANDLING != 0 + /* Inter-function unwind: the callee stashed a wasm tag on + * this frame (now the active one after RECOVER_CONTEXT) + * when its eh-stack walk found no in-frame match. Re-enter + * find_a_catch_handler so the caller's eh-stack gets a + * chance to catch. Predicted strongly not-taken — + * exceptions are rare, this single check is the entire + * CALL-return-side cost of EH; the success path takes the + * HANDLE_OP_END() below. */ + if (frame->exception_raised) { + exception_tag_index = frame->tag_index; + frame->exception_raised = false; + goto find_a_catch_handler; + } #endif HANDLE_OP_END(); } diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index 1948efefca..cef9db116b 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -12652,18 +12652,24 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, "Unexpected block sequence encountered."); goto fail; } - #if WASM_ENABLE_FAST_INTERP != 0 - /* Second traverse: emit `` after the auto- - * emitted CATCH opcode and record (tag_index, - * handler_pc) on the parent try-region's catches[]. 
- * handler_pc is the first rewritten-IR byte *after* - * the eh_idx immediate — that's where the runtime - * throw dispatch (follow-up commit) jumps when a - * matching tag is caught. The CATCH op itself only - * runs on *normal-flow* fall-through, in which case - * the runtime handler pops one eh-stack entry and - * branches to end_of_region_pc. */ + /* Emit `` after the auto-emitted CATCH + * opcode. The runtime CATCH handler reads it to find + * end_of_region_pc when the catch is reached via + * normal flow. Emitted in BOTH traverses so pass 1's + * size measurement and pass 2's actual writes match; + * if this were inside the populate guard below, + * pass 2 would overrun the code_compiled buffer by + * sizeof(uint32) bytes per catch, corrupting whatever + * loader allocation the heap placed immediately after + * (typically func->exception_handlers itself). */ + emit_uint32(loader_ctx, cur_block->eh_entry_idx); + + /* Second traverse only: append (tag_index, handler_pc) + * to the parent try-region's catches[]. handler_pc is + * the first rewritten-IR byte *after* the eh_idx + * immediate — that's where the runtime throw dispatch + * jumps when a matching tag is caught. */ if (loader_ctx->p_code_compiled != NULL) { uint32 eh_idx = cur_block->eh_entry_idx; WASMFastEHEntry *entry; @@ -12671,7 +12677,6 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, uint64 new_size; bh_assert(eh_idx < func->exception_handler_count); bh_assert(func->exception_handlers != NULL); - emit_uint32(loader_ctx, eh_idx); entry = &func->exception_handlers[eh_idx]; new_size = (uint64)sizeof(WASMFastEHCatch) * (entry->catch_count + 1); @@ -12738,20 +12743,20 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, } #if WASM_ENABLE_FAST_INTERP != 0 - /* Second traverse: emit `` after the auto- - * emitted CATCH_ALL opcode and record catch_all_pc on - * the parent try-region. 
catch_all_pc starts NULL - * (zero-init from loader_malloc) and is set exactly - * once per region — the wasm spec allows at most one - * catch_all per try. handler_pc points after the - * eh_idx immediate, same shape as a typed CATCH. */ + /* Emit `` after the auto-emitted CATCH_ALL + * opcode in BOTH traverses (pass 1's size accounting + * must include this or pass 2 overruns + * code_compiled). Pass 2 additionally records + * catch_all_pc on the parent try-region — set exactly + * once per region (spec allows at most one catch_all + * per try). */ + emit_uint32(loader_ctx, cur_block->eh_entry_idx); if (loader_ctx->p_code_compiled != NULL) { uint32 eh_idx = cur_block->eh_entry_idx; bh_assert(eh_idx < func->exception_handler_count); bh_assert(func->exception_handlers != NULL); bh_assert(func->exception_handlers[eh_idx].catch_all_pc == NULL); - emit_uint32(loader_ctx, eh_idx); func->exception_handlers[eh_idx].catch_all_pc = loader_ctx->p_code_compiled; } From 683a9dbee72cb7777bfd291e233ff469ce3d1949 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Sun, 17 May 2026 20:57:10 -0700 Subject: [PATCH 05/16] fast-interp: WASM_OP_RETHROW catch-walk re-raise MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Activates the RETHROW opcode: re-raise the exception currently being handled by the (depth+1)-th `state=CATCH` entry from the top of the per-frame eh-stack. Source form `rethrow N` becomes `RETHROW ` in the rewritten IR; the runtime walker scans the eh-stack top-down, skips state=TRY entries (they're not "catch handlers in progress"), and on the (depth+1)-th state=CATCH match reads its stashed caught tag and dispatches to `find_a_catch_handler` exactly as a fresh throw with that tag would. Storage shape: each eh-stack entry is now `EH_ENTRY_CELLS = 2` cells wide. 
Cell 0 packs `eh_idx | EH_TRY_CATCH_STATE_BIT` (unchanged); cell 1 holds the wasm tag index of the exception currently being handled on that entry (undefined while the entry is in TRY state — the throw walker writes it on catch dispatch). Frame allocation grows by `exception_handler_count * 2` cells per call; functions without try blocks still pay zero cells. Hot-op cost-model check: * No new code in HANDLE_OP(WASM_OP_CALL) / LOAD_* / STORE_*. * RETHROW is a cold op (only fires inside catch bodies); the walk runs across at most the number of catches nested around the rethrow site. * TRY's push gained a no-op write (cell 1 stays undefined until the throw walker overwrites it on dispatch) — same one indexed store as before, just with a wider stride. * `frame->exception_raised` init + the return_func hook are unchanged from the prior commit; no new branches on any return path. Loader-side land-mine cleared: WAMR's shared `check_branch_block` calls `emit_br_info` unconditionally, which for a typical arity-zero catch target writes 4 bytes (arity) + 8 bytes (target ptr placeholder via `add_label_patch_to_list`) into the IR between the auto-emitted opcode label and the next op. RETHROW doesn't *branch* to its target — it walks the eh-stack — so those br-info bytes are dead weight, and worse: they shift our depth immediate past where the runtime `read_uint32(frame_ip)` looks for it. The RETHROW case in the loader now does its own depth + label-type validation (manual `loader_ctx->frame_csp - depth - 1` lookup, LABEL_TYPE_CATCH/CATCH_ALL check) and skips check_branch_block entirely. Verified by three new cases in `crates/benchmark-core/tests/eh_correctness.rs`: - `rethrow_depth_zero`: inner catch sets a flag, `rethrow 0`, outer catch sees the same tag (= 11). - `rethrow_preserves_tag`: two tags ($a, $b); throw $b → inner catch $b → rethrow 0; outer catch $b wins over outer catch $a (= 11). 
- `rethrow_depth_one`: nested catches; from inside the innermost (which caught $b), `rethrow 1` re-raises the *outer* catch's tag ($a). All 23 cases in the EH correctness suite pass; AS / porf-fast / porf-accurate benchmark medians overlap the prior commit's range within run-to-run variance (three runs each). --- core/iwasm/interpreter/wasm_interp_fast.c | 120 ++++++++++++++++------ core/iwasm/interpreter/wasm_loader.c | 55 +++++++--- 2 files changed, 130 insertions(+), 45 deletions(-) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index f8167fea03..0f13d93fea 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -103,15 +103,18 @@ typedef float64 CellType_F64; #endif #if WASM_ENABLE_EXCE_HANDLING != 0 -/* Top bit of each per-frame eh-stack cell: clear when the try-region's - * handler is *in scope* (TRY state — a throw matching one of its - * catches will dispatch into the handler), set once the throw walker - * has selected one of its handlers (CATCH state — further throws - * raised from inside that handler skip the entry and propagate - * outward). The low 31 bits hold the index into - * func->exception_handlers, bounded by the loader's - * `exception_handler_count`. */ +/* Per-frame eh-stack entries are 2 cells wide. Cell 0 packs the index + * into `func->exception_handlers[]` (low 31 bits) and a state bit + * (top bit): clear when the try-region's handler is *in scope* (TRY + * state — a throw matching one of its catches will dispatch into the + * handler), set once the throw walker has selected one of its + * handlers (CATCH state — further throws raised from inside that + * handler skip the entry and propagate outward). Cell 1 holds the + * wasm tag index of the exception currently being handled (written + * by the throw walker on dispatch; read by RETHROW). The tag is + * undefined while the entry is in TRY state. 
*/ #define EH_TRY_CATCH_STATE_BIT 0x80000000u +#define EH_ENTRY_CELLS 2 #endif static inline uint32 @@ -1904,7 +1907,8 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, + cur_wasm_func->max_stack_cell_num; uint32 i; for (i = frame->eh_count; i > 0; i--) { - uint32 packed = eh_stack[i - 1]; + uint32 *cells = eh_stack + (i - 1) * EH_ENTRY_CELLS; + uint32 packed = cells[0]; uint32 eh_idx; WASMFastEHEntry *entry; uint32 j; @@ -1915,13 +1919,19 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, entry = &cur_wasm_func->exception_handlers[eh_idx]; for (j = 0; j < entry->catch_count; j++) { if (entry->catches[j].tag_index == exception_tag_index) { - eh_stack[i - 1] = packed | EH_TRY_CATCH_STATE_BIT; + /* Mark the entry as in-progress catch and + * stash the tag that's being handled so a + * RETHROW from this catch body can re- + * raise it. */ + cells[0] = packed | EH_TRY_CATCH_STATE_BIT; + cells[1] = exception_tag_index; frame_ip = entry->catches[j].handler_pc; HANDLE_OP_END(); } } if (entry->catch_all_pc) { - eh_stack[i - 1] = packed | EH_TRY_CATCH_STATE_BIT; + cells[0] = packed | EH_TRY_CATCH_STATE_BIT; + cells[1] = exception_tag_index; frame_ip = entry->catch_all_pc; HANDLE_OP_END(); } @@ -1951,14 +1961,18 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, { /* Loader emits `WASM_OP_TRY `. Push one * entry onto the per-frame eh-stack so subsequent - * THROW/RETHROW handlers (added in follow-up commits) - * can find the in-scope catches by walking it. + * THROW / RETHROW handlers can find the in-scope + * catches by walking it. * * The eh-stack lives in the trailing cells of - * frame->operand[] — one cell per try-region, sized - * by cur_wasm_func->exception_handler_count at frame - * setup. Cost: 1 indexed store + 1 increment, both - * on a cold path; CALL / LOAD / STORE are untouched. 
*/ + * frame->operand[] — EH_ENTRY_CELLS cells per try- + * region, sized by + * cur_wasm_func->exception_handler_count * + * EH_ENTRY_CELLS at frame setup. Cell 1 (caught_tag) + * is unspecified while the entry is in TRY state and + * gets written by the throw walker on catch dispatch. + * Cost: one indexed store + one increment, both on a + * cold path; CALL / LOAD / STORE are untouched. */ uint32 eh_idx = read_uint32(frame_ip); WASMFunction *cur_wasm_func = cur_func->u.func; uint32 *eh_stack = frame_lp + cur_func->param_cell_num @@ -1966,7 +1980,7 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, + cur_wasm_func->max_stack_cell_num; bh_assert(frame->eh_count < cur_wasm_func->exception_handler_count); - eh_stack[frame->eh_count] = eh_idx; + eh_stack[frame->eh_count * EH_ENTRY_CELLS + 0] = eh_idx; frame->eh_count++; HANDLE_OP_END(); } @@ -1998,17 +2012,57 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, } HANDLE_OP(WASM_OP_RETHROW) + { + /* Loader emits `WASM_OP_RETHROW `. Re-raise + * the exception currently being handled by an + * enclosing catch (the (depth+1)-th `state=CATCH` + * entry from the top of the eh-stack at this point — + * each in-progress catch we're nested in contributes + * one such entry, in source order). RETHROW is a + * cold op (only fires inside catch bodies); the walk + * runs across at most the number of catches nested + * around the rethrow site. CALL / LOAD / STORE are + * untouched. */ + uint32 depth = read_uint32(frame_ip); + WASMFunction *cur_wasm_func = cur_func->u.func; + uint32 *eh_stack = frame_lp + cur_func->param_cell_num + + cur_func->local_cell_num + + cur_wasm_func->max_stack_cell_num; + uint32 i; + uint32 catch_seen = 0; + for (i = frame->eh_count; i > 0; i--) { + uint32 *cells = eh_stack + (i - 1) * EH_ENTRY_CELLS; + if (!(cells[0] & EH_TRY_CATCH_STATE_BIT)) + continue; + if (catch_seen == depth) { + /* Re-raise the caught tag against the *outer* + * try-regions. 
find_a_catch_handler iterates + * top-down and skips state=CATCH entries, so + * this same entry won't re-match. */ + exception_tag_index = cells[1]; + goto find_a_catch_handler; + } + catch_seen++; + } + /* Loader validated rethrow's depth at compile time; + * if we got here the eh-stack is inconsistent with + * the IR (typically a runtime bug in the loader's + * eh-table population). */ + wasm_set_exception(module, "rethrow depth out of range"); + goto got_exception; + } + HANDLE_OP(WASM_OP_DELEGATE) HANDLE_OP(EXT_OP_TRY) { /* Still routed through the diagnostic until a follow- - * up commit wires up rethrow / delegate dispatch. - * EXT_OP_TRY is the type-index-blocktype variant of - * TRY; the fast-interp loader currently doesn't emit - * it (CATCH / DELEGATE indices are recorded directly - * on the per-function exception_handlers table), so - * hitting it here means a future loader change - * started emitting it. */ + * up commit wires up delegate dispatch and the type- + * index-blocktype variant of TRY. The fast-interp + * loader currently doesn't emit EXT_OP_TRY (CATCH / + * DELEGATE indices are recorded directly on the + * per-function exception_handlers table); hitting + * either here means a future loader change started + * emitting them. */ wasm_set_exception(module, "unsupported opcode"); goto got_exception; } @@ -7923,13 +7977,15 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, cur_func->param_cell_num + cur_func->local_cell_num; #endif #if WASM_ENABLE_EXCE_HANDLING != 0 - /* One cell per try-region in the function, appended past - * the value stack. Used by the WASM_OP_TRY / CATCH / - * CATCH_ALL / END / THROW handlers as a small per-frame - * eh-stack; functions without try blocks pay zero cells. - * Mirrors classic-interp's eh_size accounting at - * wasm_interp_classic.c:6786. 
*/ - all_cell_num += cur_wasm_func->exception_handler_count; + /* EH_ENTRY_CELLS cells per try-region in the function, + * appended past the value stack — cell 0 holds the + * packed eh_idx | state_bit, cell 1 holds the caught tag + * for RETHROW. Functions without try blocks pay zero + * cells. Mirrors classic-interp's eh_size accounting at + * wasm_interp_classic.c:6786 (which also stores per- + * handler pointers on the value stack). */ + all_cell_num += + cur_wasm_func->exception_handler_count * EH_ENTRY_CELLS; #endif /* param_cell_num, local_cell_num, const_cell_num and max_stack_cell_num are all no larger than UINT16_MAX (checked diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index cef9db116b..9ef6603d3b 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -12555,21 +12555,50 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, } case WASM_OP_RETHROW: { - /* must be done before checking branch block */ + /* must be done before reading the depth */ SET_CUR_BLOCK_STACK_POLYMORPHIC_STATE(true); - /* check the target catching block: LABEL_TYPE_CATCH */ - if (!(frame_csp_tmp = - check_branch_block(loader_ctx, &p, p_end, opcode, - error_buf, error_buf_size))) - goto fail; - - if (frame_csp_tmp->label_type != LABEL_TYPE_CATCH - && frame_csp_tmp->label_type != LABEL_TYPE_CATCH_ALL) { - /* trap according to spectest (rethrow.wast) */ - set_error_buf(error_buf, error_buf_size, - "invalid rethrow label"); - goto fail; + /* Manual depth + label-type validation. 
We deliberately + * skip the shared `check_branch_block` here because + * RETHROW doesn't *branch* to its target — it walks + * the eh-stack at runtime and re-raises — so the + * branch-info bytes that check_branch_block / + * emit_br_info would write between the auto-emitted + * opcode label and our depth immediate are dead + * weight (4 bytes arity + 8 bytes target ptr + + * arity-dependent operand-offsets, all unread by the + * runtime walker). Worse, leaving them in the IR + * shifts our depth immediate past where the runtime + * read_uint32(frame_ip) looks for it. */ + { + uint32 rethrow_depth = 0; + BranchBlock *target_block; + pb_read_leb_uint32(p, p_end, rethrow_depth); + if (rethrow_depth + 1 > loader_ctx->csp_num) { +#if WASM_ENABLE_SPEC_TEST == 0 + set_error_buf(error_buf, error_buf_size, + "unknown rethrow label"); +#else + set_error_buf(error_buf, error_buf_size, + "unknown label"); +#endif + goto fail; + } + target_block = loader_ctx->frame_csp - rethrow_depth - 1; + if (target_block->label_type != LABEL_TYPE_CATCH + && target_block->label_type != LABEL_TYPE_CATCH_ALL) { + /* trap according to spectest (rethrow.wast) */ + set_error_buf(error_buf, error_buf_size, + "invalid rethrow label"); + goto fail; + } +#if WASM_ENABLE_FAST_INTERP != 0 + /* Emit the depth as a uint32 immediate after the + * auto-emitted RETHROW opcode. Pass 1's size + * accounting must match pass 2's actual emit so + * we run this branch in both traverses. */ + emit_uint32(loader_ctx, rethrow_depth); +#endif } BranchBlock *cur_block = loader_ctx->frame_csp - 1; From a1c647cc36a437e56103d5843dfd82255f51741e Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Sun, 17 May 2026 22:26:29 -0700 Subject: [PATCH 06/16] fast-interp: WASM_OP_DELEGATE forward-to-outer dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires up the runtime + loader for `try ... 
delegate N` so the throw walker can re-raise the exception at the target block's location without spending hot-op budget. Loader (wasm_loader.c, WASM_OP_DELEGATE case): Skip the shared `check_branch_block_for_delegate` — its `emit_br_info` call would write 12 bytes of branch metadata between the auto-emitted DELEGATE label and the next op, dead weight at runtime and (worse) the same alignment-shift gotcha that bit RETHROW. Do the depth read + bounds check inline. In pass 2, count try/catch/catch_all blocks STRICTLY between the delegate's frame and the target block — that count (`delta`) is exactly how many eh-stack entries the runtime walker must skip past, by spec. Runtime (wasm_interp_fast.c): * find_a_catch_handler: before catch-matching, check `entry->delegate_target_depth`. If set, mark the delegate's own eh-stack entry consumed (STATE bit) and do `i -= delta; continue;` so the for-loop's natural i-- lands on the first eh-stack entry strictly outside the target block. The `delta + 1 >= i` guard catches "delegate to function block" (target lies outside this function's eh-stack) and falls through to the existing "no handler in this frame" return_func path. * WASM_OP_DELEGATE: split out of the "unsupported opcode" stub into its own normal-flow handler — fires when the try body completes without throwing; just `frame->eh_count--` and advance. Cost shape preserved: zero new bytes in CALL / LOAD / STORE; all delegate work lives on the cold throw walker or the cold normal-flow exit handler. 
--- core/iwasm/interpreter/wasm_interp_fast.c | 78 ++++++++++++++++++-- core/iwasm/interpreter/wasm_loader.c | 89 ++++++++++++++++++++--- 2 files changed, 147 insertions(+), 20 deletions(-) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 0f13d93fea..4db492ca3f 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -1917,6 +1917,43 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, eh_idx = packed & ~EH_TRY_CATCH_STATE_BIT; bh_assert(eh_idx < cur_wasm_func->exception_handler_count); entry = &cur_wasm_func->exception_handlers[eh_idx]; + if (entry->delegate_target_depth != UINT32_MAX) { + /* This try-region was closed by `delegate N`, + * not `end`. The spec says the exception is + * re-raised at the location of the target + * block — i.e. it propagates past every try + * whose body the delegate's try sits inside + * (but the target is also inside). The loader + * already counted those tries as + * `delegate_target_depth = delta`. Marking + * THIS entry as consumed and decrementing `i` + * by `delta` makes the for-loop's natural + * i-- land on the first eh-stack entry + * strictly *outside* the target block — which + * is exactly where the spec wants the throw + * to resume matching. + * + * If `delta + 1 >= i`, the target block is + * outside this function's eh-stack entirely + * (e.g. `delegate `): + * break out to the "no handler in this + * frame" path and let return_func forward the + * exception to the caller. + * + * Cost: cold path; only THROW reaches here. + * Hot ops untouched. */ + uint32 delta = entry->delegate_target_depth; + cells[0] = packed | EH_TRY_CATCH_STATE_BIT; + if (delta + 1 >= i) { + /* Underflow guard + escape signal: any + * `delta` that would skip past the start + * of the eh-stack means the target lies + * past this function's try-blocks. 
*/ + break; + } + i -= delta; + continue; + } for (j = 0; j < entry->catch_count; j++) { if (entry->catches[j].tag_index == exception_tag_index) { /* Mark the entry as in-progress catch and @@ -2053,16 +2090,41 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, } HANDLE_OP(WASM_OP_DELEGATE) + { + /* Normal-flow exit from a `try ... delegate N` region: + * the try body completed without throwing, so the + * runtime just pops the eh-stack entry that + * HANDLE_OP(WASM_OP_TRY) pushed and falls through to + * the next op in the rewritten IR (which is whatever + * came after the `delegate N` in source). + * + * The forwarding semantics ("if the try body throws, + * re-raise at the target block") are handled by the + * find_a_catch_handler walker reading the eh-table + * entry's `delegate_target_depth` and skipping that + * many nested-try eh-stack entries — DELEGATE itself + * doesn't run in the throw path, only on fall-through. + * + * No immediate to read: the loader skipped emit_br_info + * so the depth lives in the per-function eh-table + * indexed by the eh_idx of *this* try-region (which is + * the eh-stack's top). Cost: one decrement on a cold + * path; CALL / LOAD / STORE untouched. */ + bh_assert(frame->eh_count > 0); + frame->eh_count--; + HANDLE_OP_END(); + } + HANDLE_OP(EXT_OP_TRY) { - /* Still routed through the diagnostic until a follow- - * up commit wires up delegate dispatch and the type- - * index-blocktype variant of TRY. The fast-interp - * loader currently doesn't emit EXT_OP_TRY (CATCH / - * DELEGATE indices are recorded directly on the - * per-function exception_handlers table); hitting - * either here means a future loader change started - * emitting them. */ + /* The fast-interp loader doesn't emit EXT_OP_TRY yet + * (the eh-table records CATCH / CATCH_ALL / DELEGATE + * indices directly on the per-function table; TRY's + * uint32 immediate is the eh_idx, not a type-index + * blocktype). 
Reaching this handler means a future + * loader change started emitting EXT_OP_TRY without + * the runtime catching up — surface that as an + * explicit trap. */ wasm_set_exception(module, "unsupported opcode"); goto got_exception; } diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index 9ef6603d3b..d637b171ce 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -12610,26 +12610,91 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, } case WASM_OP_DELEGATE: { - /* check target block is valid */ - if (!(frame_csp_tmp = check_branch_block_for_delegate( - loader_ctx, &p, p_end, error_buf, error_buf_size))) - goto fail; - + /* Manual depth + label-type validation. Like RETHROW + * (above), we deliberately skip the shared + * `check_branch_block_for_delegate` here because: + * (1) DELEGATE doesn't *branch* to its target at + * runtime — when the try-body throws, the + * find_a_catch_handler walker reads the precomputed + * `delegate_target_depth` off the eh-table entry + * and skips the right number of nested-try entries + * on the per-frame eh-stack. The branch-info bytes + * that `emit_br_info` would write between the + * auto-emitted DELEGATE label and any subsequent + * operand are dead weight (4 bytes arity + 8 bytes + * target ptr, all unread by either the runtime + * DELEGATE handler or the throw walker). + * (2) Worse, leaving them in the IR shifts any + * immediate we *do* want to emit past where the + * runtime reads it — same gotcha that bit + * RETHROW. + * + * `delegate N` targets the (N+1)-th block out from the + * current try-delegate frame. The try-delegate itself + * still sits on the loader's csp stack at this point + * (POP_CSP is called below), so the target is at + * frame_csp - N - 2 + * and the spec rejects `delegate N` whose N+1 would + * climb past the function frame. 
*/ + uint32 delegate_depth = 0; BranchBlock *cur_block = loader_ctx->frame_csp - 1; + BranchBlock *target_block; uint8 label_type = cur_block->label_type; - (void)label_type; + + pb_read_leb_uint32(p, p_end, delegate_depth); + bh_assert(loader_ctx->csp_num > 0); + if (loader_ctx->csp_num - 1 <= delegate_depth) { +#if WASM_ENABLE_SPEC_TEST == 0 + set_error_buf(error_buf, error_buf_size, + "unknown delegate label"); +#else + set_error_buf(error_buf, error_buf_size, "unknown label"); +#endif + goto fail; + } + target_block = loader_ctx->frame_csp - delegate_depth - 2; + (void)target_block; + #if WASM_ENABLE_FAST_INTERP != 0 - /* Second traverse only: a `delegate N` closes the try - * region and forwards uncaught exceptions to an outer - * block. Record end_of_region_pc now — the actual depth - * is wired up by a follow-up commit (the runtime can't - * dispatch through delegate_target_depth until the - * EH-frame stack exists). */ + /* Second traverse only: populate the eh-table entry so + * the runtime walker can dispatch through it. + * + * delegate_target_depth = (count of try / catch / + * catch_all blocks STRICTLY between cur_block and + * target_block on the loader's csp stack) + * + * At runtime those `delta` blocks are exactly the + * eh-stack entries immediately below the delegate's own + * entry that the throw walker must SKIP — the spec + * re-raises the exception "at the target block's + * location", so any try whose body the delegate's try + * is nested inside (but the target is also inside) + * doesn't get to catch it. + * + * end_of_region_pc still gets set to the IR pc just + * after the auto-emitted DELEGATE label. The walker + * never reads it for delegate entries (it forwards via + * delta instead), but a future DELEGATE-end runtime + * handler that wanted to advance frame_ip past the + * region could use it; recording it keeps the + * shape identical to the END(try) capture and the + * field semantics easy to reason about. 
*/ if (loader_ctx->p_code_compiled != NULL) { uint32 eh_idx = cur_block->eh_entry_idx; + uint32 delta = 0; + BranchBlock *b; bh_assert(eh_idx < func->exception_handler_count); bh_assert(func->exception_handlers != NULL); + for (b = cur_block - 1; b > target_block; b--) { + if (b->label_type == LABEL_TYPE_TRY + || b->label_type == LABEL_TYPE_CATCH + || b->label_type == LABEL_TYPE_CATCH_ALL) { + delta++; + } + } + func->exception_handlers[eh_idx].delegate_target_depth = + delta; func->exception_handlers[eh_idx].end_of_region_pc = loader_ctx->p_code_compiled; } From 169f9e757ae4b83f79cf90e11f80c84c609fa8b8 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Sun, 17 May 2026 22:48:18 -0700 Subject: [PATCH 07/16] fast-interp: tag-with-params payload routing (same-function dispatch) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires up the loader + runtime path so a tagged exception with i32 / i64 / v128 parameters delivers its payload to the matching catch body's operand stack — same-function dispatch only. Cross-function dispatch (callee throws, caller catches) still drops the payload; that gap is now surfaced explicitly via the `cross_function_tag_with_params` integration test (#[ignore]'d with the same justification recorded in AGENTS.md). WASMFastEHCatch grows two fields: uint32 param_cell_num; int16 *param_dst_offsets; The dst-slots array is a loader-owned int16[] of length `param_cell_num`, capturing the cell-wise frame_lp slot offsets that the catch body's downstream ops will pop from. NULL for the common tag-without-params case (Porffor's empty-payload tags, all of the spec-test's `tag $err` declarations) — no heap allocation and the runtime walker's copy loop is a trivial zero-iteration no-op. 
Loader (wasm_loader.c) — CATCH case: * Swap `PUSH_TYPE` for `PUSH_OFFSET_TYPE` so the catch body's incoming params get fresh `dynamic_offset` slots allocated + emitted as int16 operands in the IR (right after the eh_idx immediate). The PUSH_OFFSET_TYPE emits are dead bytes on the normal-flow CATCH dispatch (which only reads eh_idx and branches to end_of_region_pc), but they're necessary so the catch body's POP_OFFSET_TYPEs find the right slot offsets in frame_offset[]. * Pass 2 captures handler_pc AFTER the PUSH_OFFSET_TYPEs so the throw walker's `frame_ip = handler_pc` lands at the first byte of the catch body proper (skipping the dead dst-slot bytes). * Pass 2 also bh_memcpy_s's frame_offset[]'s top `param_cell_num` cells into a fresh int16[] on the catch's WASMFastEHCatch — these are the destination offsets the runtime walker will write payload values to. * Free path in wasm_loader_unload extended to free the per-catch dst-offsets array. Loader — THROW case (wasm_loader.c): * Moved the existing `emit_uint32(tag_index)` below the tag-type lookup + validation so `tag_type->param_cell_num` is available. * After tag_index, emit `<param_cell_num: uint32>` plus `<src_offset_i: int16>` for i in 0..param_cell_num. The src offsets are read directly off the top of `loader_ctx->frame_offset[]` — the validation loop that follows the emit pops frame_ref but doesn't touch frame_offset, so they're stable. Both traverses run the same emit to keep pass-1 / pass-2 size accounting balanced. Runtime (wasm_interp_fast.c) — new locals in the dispatch function (cold-path only, same scope as `exception_tag_index`): uint32 throw_param_cell_num = 0; int16 *throw_src_offsets = NULL; These get populated by HANDLE_OP(WASM_OP_THROW), which now reads tag_index + param_cell_num + the src-offsets array off the IR (advancing frame_ip past all three). The pair is consumed by find_a_catch_handler's catch-match dispatch: on a typed-catch match it does the cell-wise copy `frame_lp[dst[c]] = frame_lp[src[c]]`. 
catch_all dispatch explicitly drops the payload (per spec — catch_all binds no exception values). The copy loop is fully cold (only THROW reaches here); CALL / LOAD / STORE handlers untouched. WASM_OP_RETHROW: extended to re-point throw_src_offsets at the matched catch's `param_dst_offsets` before goto find_a_catch_ handler — so rethrow from inside a typed catch carries the same payload outward. The catch body can't mutate the dst slots (they're allocated from `dynamic_offset`, separate from the local-slot range that local.set writes to), so the values are still the original ones at rethrow time. Rethrow from inside a catch_all (whose `param_dst_offsets == NULL`) falls back to zero-cell — documented as a known limitation. return_func hook: the cross-frame branch zeros throw_param_cell_ num and throw_src_offsets before the goto find_a_catch_handler, since the callee's source slots live in a frame that's about to be torn down — same payload-dropping semantics as the existing cross-function-no-payload case, but explicit instead of relying on uninitialized stack. Cost shape preserved: zero new bytes in CALL / LOAD / STORE. EH_ENTRY_CELLS still 2; no extra cells per try-region. The two new locals get spilled by the compiler since the hot loop doesn't reference them. --- core/iwasm/interpreter/wasm.h | 19 ++ core/iwasm/interpreter/wasm_interp_fast.c | 130 ++++++++++++- core/iwasm/interpreter/wasm_loader.c | 215 ++++++++++++++++------ 3 files changed, 303 insertions(+), 61 deletions(-) diff --git a/core/iwasm/interpreter/wasm.h b/core/iwasm/interpreter/wasm.h index e18ab7ae68..879bdc64b1 100644 --- a/core/iwasm/interpreter/wasm.h +++ b/core/iwasm/interpreter/wasm.h @@ -688,6 +688,25 @@ typedef struct WASMImport { typedef struct WASMFastEHCatch { uint32 tag_index; uint8 *handler_pc; + /* Tag-with-params payload routing (same-function dispatch only). 
+ * When this catch matches, the throw walker copies `param_cell_num` + * 32-bit cells from the throw site's *source* slots (encoded as + * `int16` immediates after the THROW opcode in the rewritten IR) + * into these *destination* slots in the catch body's `frame_lp`, + * then sets `frame_ip = handler_pc`. The destination slots are + * allocated by the CATCH loader at preprocess time, mirroring how + * block-with-params allocate fresh `dynamic_offset` slots via + * `PUSH_OFFSET_TYPE`. NULL iff `param_cell_num == 0` (the typical + * tag-without-params shape, e.g. Porffor's empty-payload tags). + * + * Cross-function dispatch (caller's catch fires for a callee's + * throw) does NOT copy the payload: the callee's source slots + * sit in a frame that's about to be torn down by return_func. + * That gap is documented as an ignored integration test — + * `cross_function_tag_with_params` in + * crates/benchmark-core/tests/eh_correctness.rs. */ + uint32 param_cell_num; + int16 *param_dst_offsets; } WASMFastEHCatch; /* One entry per same-function try-region, indexed by the uint32 immediate diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 4db492ca3f..1f587b1032 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -1560,6 +1560,22 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, * Cold path only — the dispatch loop's hot ops never reference * this variable, so the compiler is free to spill it. */ uint32 exception_tag_index = 0; + /* Tag-with-params payload routing for same-function dispatch. + * Read off the IR after THROW's tag_index immediate; + * `throw_src_offsets` points at the first src-slot int16 in the + * rewritten IR, and `throw_param_cell_num` is the total cell + * count across all of the tag's params. find_a_catch_handler + * uses these to copy frame_lp[src[i]] into the matched catch's + * pre-allocated dst slots. 
Both are cold-path-only — like + * exception_tag_index, the dispatch loop's hot ops never + * reference them. RETHROW re-points throw_src_offsets at the + * still-alive catch's `param_dst_offsets` (the original + * payload values, unchanged by the catch body since they live + * in a different slot range from locals) so the re-raised + * exception carries the same payload across outer try-regions + * in this frame. */ + uint32 throw_param_cell_num = 0; + int16 *throw_src_offsets = NULL; #endif #if WASM_ENABLE_INSTRUCTION_METERING != 0 @@ -1861,13 +1877,32 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, #if WASM_ENABLE_EXCE_HANDLING != 0 HANDLE_OP(WASM_OP_THROW) { - /* Loader emits `WASM_OP_THROW `. Read - * the tag, then walk the eh-stack in - * find_a_catch_handler to find a matching catch — - * first in this frame, then via return_func's hook in - * caller frames, and finally falling out to the host - * via got_exception when no match is found anywhere. */ + /* Loader emits + * `WASM_OP_THROW + * + * ... + * `. Read the tag plus payload-source + * metadata, then walk the eh-stack in find_a_catch_handler to + * find a matching catch — first in this frame, then via + * return_func's hook in caller frames, and finally + * falling out to the host via got_exception when no + * match is found anywhere. + * + * Payload routing: when a same-function catch matches, + * find_a_catch_handler copies frame_lp[src[i]] into + * the catch's pre-allocated dst slots (recorded on + * `WASMFastEHCatch.param_dst_offsets` at load time). + * For tag-without-params (the typical Porffor shape), + * `throw_param_cell_num == 0` makes the copy a no-op. + * For cross-function dispatch the source frame is + * torn down before the caller's walker runs, so the + * payload is dropped — this gap is documented in + * AGENTS.md and exercised as + * `cross_function_tag_with_params` (#[ignore]). 
*/ exception_tag_index = read_uint32(frame_ip); + throw_param_cell_num = read_uint32(frame_ip); + throw_src_offsets = (int16 *)frame_ip; + frame_ip += sizeof(int16) * throw_param_cell_num; goto find_a_catch_handler; } @@ -1962,11 +1997,33 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, * raise it. */ cells[0] = packed | EH_TRY_CATCH_STATE_BIT; cells[1] = exception_tag_index; + /* Payload copy (same-function dispatch). + * The loader guaranteed + * `entry->catches[j].param_cell_num == + * throw_param_cell_num` by checking the + * tag type at both THROW and CATCH; the + * runtime just executes the cell-wise + * frame_lp move. Tag-without-params makes + * the loop trivial. */ + if (throw_param_cell_num > 0 + && entry->catches[j].param_dst_offsets) { + uint32 c; + int16 *dst = entry->catches[j].param_dst_offsets; + for (c = 0; c < throw_param_cell_num; c++) { + frame_lp[dst[c]] = + frame_lp[throw_src_offsets[c]]; + } + } frame_ip = entry->catches[j].handler_pc; HANDLE_OP_END(); } } if (entry->catch_all_pc) { + /* catch_all binds no payload (spec: catch_all + * has no exception values), so we drop the + * src cells here. RETHROW from inside a + * catch_all body cannot re-emit a payload — + * documented as a known limitation. */ cells[0] = packed | EH_TRY_CATCH_STATE_BIT; cells[1] = exception_tag_index; frame_ip = entry->catch_all_pc; @@ -2075,7 +2132,51 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, /* Re-raise the caught tag against the *outer* * try-regions. find_a_catch_handler iterates * top-down and skips state=CATCH entries, so - * this same entry won't re-match. */ + * this same entry won't re-match. + * + * Payload routing: the original throw's + * payload values were copied into THIS + * catch's dst slots by the previous + * find_a_catch_handler dispatch. 
The wasm + * spec says the catch body can't mutate + * those exception values directly (they're + * not addressable as locals, and the only + * way to read them is to pop off the + * operand stack at catch entry — which + * advances past the dst slots without + * writing them back). So at RETHROW time + * the dst slots still hold the original + * payload, and we can point throw_src_offsets + * at them so the outer catch's copy lands + * on a fresh set of dst slots with the + * same values. + * + * If the original match was via catch_all + * (no typed catch matched cells[1]), + * `match->param_dst_offsets == NULL` and the + * payload was already dropped at the + * catch_all dispatch. RETHROW from + * catch_all then re-raises with no payload + * — documented as a known limitation. */ + uint32 ent_eh_idx = cells[0] & ~EH_TRY_CATCH_STATE_BIT; + WASMFastEHEntry *ent = + &cur_wasm_func->exception_handlers[ent_eh_idx]; + WASMFastEHCatch *match = NULL; + uint32 mj; + for (mj = 0; mj < ent->catch_count; mj++) { + if (ent->catches[mj].tag_index == cells[1]) { + match = &ent->catches[mj]; + break; + } + } + if (match && match->param_dst_offsets) { + throw_param_cell_num = match->param_cell_num; + throw_src_offsets = match->param_dst_offsets; + } + else { + throw_param_cell_num = 0; + throw_src_offsets = NULL; + } exception_tag_index = cells[1]; goto find_a_catch_handler; } @@ -8145,9 +8246,22 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, * chance to catch. Predicted strongly not-taken — * exceptions are rare, this single check is the entire * CALL-return-side cost of EH; the success path takes the - * HANDLE_OP_END() below. */ + * HANDLE_OP_END() below. + * + * Cross-frame payload routing: the callee's throw site's + * source slots lived in the callee's frame_lp, which has + * already been freed by the time we get here. 
We zero out + * the throw_param_cell_num / throw_src_offsets pair so the + * caller's find_a_catch_handler doesn't try to dereference + * freed memory — the catch (if any matches) will fire with + * a zero-cell payload. This is the same gap documented at + * the WASM_OP_THROW handler and surfaced as + * `cross_function_tag_with_params` in the integration + * suite. */ if (frame->exception_raised) { exception_tag_index = frame->tag_index; + throw_param_cell_num = 0; + throw_src_offsets = NULL; frame->exception_raised = false; goto find_a_catch_handler; } diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index d637b171ce..53e0b6af84 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -7366,12 +7366,22 @@ wasm_loader_unload(WASMModule *module) for (eh_idx = 0; eh_idx < module->functions[i]->exception_handler_count; eh_idx++) { - if (module->functions[i] - ->exception_handlers[eh_idx] - .catches) { - wasm_runtime_free(module->functions[i] - ->exception_handlers[eh_idx] - .catches); + WASMFastEHEntry *eh_entry = + &module->functions[i]->exception_handlers[eh_idx]; + if (eh_entry->catches) { + uint32 cj; + /* Free each catch's tag-with-params dst + * slot array. param_dst_offsets is NULL + * for the (common) tag-without-params + * case, in which case the free is a + * no-op. 
*/ + for (cj = 0; cj < eh_entry->catch_count; cj++) { + if (eh_entry->catches[cj].param_dst_offsets) { + wasm_runtime_free(eh_entry->catches[cj] + .param_dst_offsets); + } + } + wasm_runtime_free(eh_entry->catches); } } wasm_runtime_free(module->functions[i]->exception_handlers); @@ -12445,19 +12455,6 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, uint32 tag_index = 0; pb_read_leb_int32(p, p_end, tag_index); -#if WASM_ENABLE_FAST_INTERP != 0 - /* Fast-interp: the LEB-encoded tag_index from the source - * bytecode is consumed above; re-emit it as a plain - * uint32 immediate after the (auto-emitted) THROW opcode - * so the runtime handler can read it without re-running - * the LEB decoder. The runtime currently treats the tag - * as opaque (it surfaces a generic "wasm exception - * thrown (tag N)" string via wasm_set_exception and - * escapes via got_exception). Same emit shape as - * WASM_OP_CALL's funcidx. */ - emit_uint32(loader_ctx, tag_index); -#endif - /* check validity of tag_index against module->tag_count */ /* check tag index is within the tag index space */ if (tag_index >= module->import_tag_count + module->tag_count) { @@ -12484,6 +12481,59 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, goto fail; } +#if WASM_ENABLE_FAST_INTERP != 0 + /* Fast-interp THROW IR shape (emitted in BOTH traverses + * so pass-1 / pass-2 size accounting stays balanced): + * + * + * + * + * ... + * + * Where `param_cell_num` is the sum across all params' + * cell widths (i32 = 1, i64 = 2, v128 = 4, etc.) and + * src_offset_i is the throw-site's frame_lp slot for + * the i-th payload cell, read directly off the top of + * `loader_ctx->frame_offset[]`. The validation loop + * below pops frame_ref / available_stack_cell but + * doesn't touch frame_offset, so the src offsets are + * stable to read here. 
They get consumed at runtime + * by find_a_catch_handler when a *same-function* + * catch matches: it copies `param_cell_num` cells + * from frame_lp[src_offset_i] into the catch body's + * `param_dst_offsets[i]` slots before jumping to + * handler_pc. + * + * Cross-function dispatch (callee throws, caller's + * catch fires after return_func unwinds) does NOT + * preserve the payload — the source slots live in a + * frame that's about to be torn down. That gap is + * documented as an ignored integration test, in line + * with the cost-model rule that EH must not tax hot + * ops: a per-thread payload buffer would force every + * CALL / RETURN handler to spill scratch state across + * the boundary, which we explicitly refuse. + * + * Tag-without-params is the common case (Porffor + * emits empty payloads; many spec tests use bare + * tags too). param_cell_num=0 makes the for-loop + * trivial and the resulting IR is just the tag_index + * + a single zero — same hot-path cost as the + * pre-tag-with-params shape, since the runtime + * read_uint32 of param_cell_num happens on the cold + * THROW handler. */ + emit_uint32(loader_ctx, tag_index); + emit_uint32(loader_ctx, tag_type->param_cell_num); + { + uint32 ci; + for (ci = 0; ci < tag_type->param_cell_num; ci++) { + int16 src = *(loader_ctx->frame_offset + - tag_type->param_cell_num + ci); + emit_operand(loader_ctx, src); + } + } +#endif + int32 available_stack_cell = (int32)(loader_ctx->stack_cell_num - cur_block->stack_cell_num); @@ -12758,39 +12808,6 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, * loader allocation the heap placed immediately after * (typically func->exception_handlers itself). */ emit_uint32(loader_ctx, cur_block->eh_entry_idx); - - /* Second traverse only: append (tag_index, handler_pc) - * to the parent try-region's catches[]. 
handler_pc is - * the first rewritten-IR byte *after* the eh_idx - * immediate — that's where the runtime throw dispatch - * jumps when a matching tag is caught. */ - if (loader_ctx->p_code_compiled != NULL) { - uint32 eh_idx = cur_block->eh_entry_idx; - WASMFastEHEntry *entry; - WASMFastEHCatch *new_catches; - uint64 new_size; - bh_assert(eh_idx < func->exception_handler_count); - bh_assert(func->exception_handlers != NULL); - entry = &func->exception_handlers[eh_idx]; - new_size = (uint64)sizeof(WASMFastEHCatch) - * (entry->catch_count + 1); - if (!(new_catches = loader_malloc(new_size, error_buf, - error_buf_size))) { - goto fail; - } - if (entry->catches) { - bh_memcpy_s(new_catches, (uint32)new_size, - entry->catches, - (uint32)sizeof(WASMFastEHCatch) - * entry->catch_count); - wasm_runtime_free(entry->catches); - } - new_catches[entry->catch_count].tag_index = tag_index; - new_catches[entry->catch_count].handler_pc = - loader_ctx->p_code_compiled; - entry->catches = new_catches; - entry->catch_count++; - } #endif /* @@ -12807,7 +12824,29 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, uint32 j = 0; #endif - /* push types on the stack according to caught type */ + /* Push the tag's params onto the catch body's operand + * stack. Classic-interp uses PUSH_TYPE (which only + * touches the value-type stack used by validation); + * fast-interp also needs `PUSH_OFFSET_TYPE`, which + * allocates fresh `dynamic_offset` slots for each cell + * (and emits the slot offsets as `int16` operands in + * the IR right after the eh_idx). The catch body's + * downstream ops then `POP_OFFSET_TYPE` to consume + * these slots — same shape the loader uses for + * block-with-params (see `copy_params_to_dynamic_ + * space`). + * + * Note: the emitted dst slots are *unused* by the + * runtime CATCH normal-flow handler (it only reads + * eh_idx and branches to end_of_region_pc) — they + * sit in the IR as dead bytes on the fall-through + * path. 
The throw walker doesn't read them either; + * it consults the pre-decoded copy on + * `WASMFastEHCatch.param_dst_offsets` (populated + * below). They're emitted only so PUSH_OFFSET_TYPE's + * pass-1 / pass-2 size accounting stays balanced and + * the catch body's POP_OFFSET_TYPEs find the right + * slot offsets in `frame_offset[]`. */ for (i = 0; i < func_type->param_count; i++) { #if WASM_ENABLE_GC != 0 if (wasm_is_type_multi_byte_type(func_type->types[i])) { @@ -12819,8 +12858,78 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, j++; } #endif +#if WASM_ENABLE_FAST_INTERP != 0 + PUSH_OFFSET_TYPE(func_type->types[i]); +#else PUSH_TYPE(func_type->types[i]); +#endif } + +#if WASM_ENABLE_FAST_INTERP != 0 + /* Second traverse only: append a fully-populated + * `WASMFastEHCatch` entry to the parent try-region's + * catches[]. handler_pc is captured *after* the + * PUSH_OFFSET_TYPE emits above so it points at the + * first rewritten-IR byte of the catch body proper + * (skipping the dead dst-slot bytes). param_cell_num + * is the sum of cells across all tag params (i32 = 1 + * cell, i64 = 2, v128 = 4); param_dst_offsets is a + * loader-owned copy of the int16 slot offsets just + * pushed onto frame_offset[]. NULL when the tag has + * no params (the typical Porffor shape). 
*/ + if (loader_ctx->p_code_compiled != NULL) { + uint32 eh_idx = cur_block->eh_entry_idx; + WASMFastEHEntry *entry; + WASMFastEHCatch *new_catches; + uint64 new_size; + bh_assert(eh_idx < func->exception_handler_count); + bh_assert(func->exception_handlers != NULL); + entry = &func->exception_handlers[eh_idx]; + new_size = (uint64)sizeof(WASMFastEHCatch) + * (entry->catch_count + 1); + if (!(new_catches = loader_malloc(new_size, error_buf, + error_buf_size))) { + goto fail; + } + if (entry->catches) { + bh_memcpy_s(new_catches, (uint32)new_size, + entry->catches, + (uint32)sizeof(WASMFastEHCatch) + * entry->catch_count); + wasm_runtime_free(entry->catches); + } + new_catches[entry->catch_count].tag_index = tag_index; + new_catches[entry->catch_count].handler_pc = + loader_ctx->p_code_compiled; + new_catches[entry->catch_count].param_cell_num = + func_type->param_cell_num; + new_catches[entry->catch_count].param_dst_offsets = NULL; + if (func_type->param_cell_num > 0) { + uint64 dst_size = + (uint64)sizeof(int16) * func_type->param_cell_num; + int16 *dst; + if (!(dst = loader_malloc(dst_size, error_buf, + error_buf_size))) { + wasm_runtime_free(new_catches); + goto fail; + } + /* The just-completed PUSH_OFFSET_TYPE sequence + * pushed param_cell_num int16 entries onto + * frame_offset[] in source order (param 0's + * cell 0, cell 1, ..., param 1's cell 0, ...). + * Copy them out now while the top of + * frame_offset[] still holds them — the next + * RESET_STACK would wipe them. 
*/ + bh_memcpy_s(dst, (uint32)dst_size, + loader_ctx->frame_offset + - func_type->param_cell_num, + (uint32)dst_size); + new_catches[entry->catch_count].param_dst_offsets = dst; + } + entry->catches = new_catches; + entry->catch_count++; + } +#endif break; } case WASM_OP_CATCH_ALL: From 72db3ca8a2ba2c7c93f373dd489e89125cdb86a5 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Sun, 17 May 2026 23:30:11 -0700 Subject: [PATCH 08/16] fast-interp: tag-with-params loader bug-fix pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs surfaced once same-function tag-with-params actually got exercised by integration tests: 1. **`PUSH_OFFSET_TYPE` is offset-only.** The CATCH loader was bumping `dynamic_offset` + `frame_offset[]` but never `stack_cell_num`, leaving the operand and ref stacks out of sync. The catch body's first consumer (e.g. `global.set $g`) then hit `wasm_loader_pop_frame_offset`'s polymorphic short-circuit — the CATCH block inherits the polymorphic flag from THROW's `SET_CUR_BLOCK_STACK_POLYMORPHIC_STATE` and with `available_stack_cell == 0` the pop silently returned without emitting the source-slot operand bytes. The consumer's runtime read then landed on heap garbage and crashed with SIGBUS / SIGSEGV. Fix: pair `PUSH_OFFSET_TYPE` with `PUSH_TYPE` (ref-only) so both stacks advance in lockstep. 2. **Multi-cell `frame_offset[]` entries are unreliable past the first cell.** `wasm_loader_push_frame_offset` writes a meaningful int16 only for the FIRST cell of a multi-cell value (i64, f64, v128); the subsequent cell entries are left uninitialized (just a pointer increment, no write). My pass-1 THROW src-offset emit and pass-2 CATCH dst-offset capture were reading those uninitialized cells directly, producing garbage offsets for any param wider than 32 bits. Fix: walk params (not cells) and synthesize consecutive cell offsets `(first, first+1, ..., first+N-1)` per param, where `first = frame_offset[cell_so_far]`. 
Matches the runtime invariant that an N-cell value occupies N consecutive frame_lp cells. 3 new integration tests cover the fixes: * `tag_single_i64_param` — 2-cell payload * `tag_mixed_i32_i64_params` — exercises per-param cell synthesis (would fail if cell-walk offset by 1) * `repeated_throw_with_payload` — confirms catch-allocated dst slots get fresh writes every invocation Plus a wat fix in `nested_try_with_params_inner_wins`: the outer catch's body was `i32.const 999 / global.set $g`, leaving the param on the operand stack at `end`. That was a latent bug masked before tag-with-params support (PUSH_TYPE-only didn't let the param "exist" for validation purposes). Now corrected by adding an explicit `drop` so the catch body's stack validates clean. No hot-op cost change: all the new loader work is in the cold CATCH / THROW preprocess paths, and the runtime walker copy loop is unchanged. --- core/iwasm/interpreter/wasm_loader.c | 87 ++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 18 deletions(-) diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index 53e0b6af84..88885a27e6 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -12525,11 +12525,28 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, emit_uint32(loader_ctx, tag_index); emit_uint32(loader_ctx, tag_type->param_cell_num); { - uint32 ci; - for (ci = 0; ci < tag_type->param_cell_num; ci++) { - int16 src = *(loader_ctx->frame_offset - - tag_type->param_cell_num + ci); - emit_operand(loader_ctx, src); + /* Multi-cell types (i64, f64, v128) only have a + * meaningful first-cell offset in + * `frame_offset[]` — subsequent cells of the + * same value are left uninitialized by + * `wasm_loader_push_frame_offset` (it just + * advances the pointer without writing). 
For + * each param walk the per-param first cell out + * of frame_offset and synthesize consecutive + * cell offsets `(first, first+1, ...)`; that + * matches the runtime invariant that an n-cell + * value occupies n consecutive frame_lp cells. */ + uint32 pi, c, cell_so_far = 0; + int16 *base = + loader_ctx->frame_offset - tag_type->param_cell_num; + for (pi = 0; pi < tag_type->param_count; pi++) { + uint32 this_cells = + wasm_value_type_cell_num(tag_type->types[pi]); + int16 first_slot = base[cell_so_far]; + for (c = 0; c < this_cells; c++) { + emit_operand(loader_ctx, (int16)(first_slot + c)); + } + cell_so_far += this_cells; } } #endif @@ -12858,11 +12875,28 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, j++; } #endif + /* Allocate a fresh `dynamic_offset` slot for the + * catch param AND push its type onto `frame_ref` + * (so `stack_cell_num` stays balanced). One + * without the other doesn't work: a bare + * `PUSH_OFFSET_TYPE` leaves the offset side + * ahead of the ref side, so the catch body's + * first consumer (e.g. `global.set $g`) hits + * `wasm_loader_pop_frame_offset`'s polymorphic + * short-circuit — the CATCH block inherits the + * polymorphic flag from THROW's + * `SET_CUR_BLOCK_STACK_POLYMORPHIC_STATE`, and + * with `available_stack_cell == 0` the pop + * silently returns without emitting the source + * slot. The consumer's runtime read then lands + * on heap garbage and crashes with SIGBUS / + * SIGSEGV. PUSH_TYPE rebalances and avoids + * the short-circuit so the catch body's pops + * emit real source-slot operand bytes. 
*/ #if WASM_ENABLE_FAST_INTERP != 0 PUSH_OFFSET_TYPE(func_type->types[i]); -#else - PUSH_TYPE(func_type->types[i]); #endif + PUSH_TYPE(func_type->types[i]); } #if WASM_ENABLE_FAST_INTERP != 0 @@ -12908,22 +12942,39 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, uint64 dst_size = (uint64)sizeof(int16) * func_type->param_cell_num; int16 *dst; + uint32 pi, c, cell_so_far = 0; + int16 *base; if (!(dst = loader_malloc(dst_size, error_buf, error_buf_size))) { wasm_runtime_free(new_catches); goto fail; } - /* The just-completed PUSH_OFFSET_TYPE sequence - * pushed param_cell_num int16 entries onto - * frame_offset[] in source order (param 0's - * cell 0, cell 1, ..., param 1's cell 0, ...). - * Copy them out now while the top of - * frame_offset[] still holds them — the next - * RESET_STACK would wipe them. */ - bh_memcpy_s(dst, (uint32)dst_size, - loader_ctx->frame_offset - - func_type->param_cell_num, - (uint32)dst_size); + /* Synthesize per-cell dst offsets from each + * param's first cell. Same multi-cell shape + * concern as the THROW src emit: + * `wasm_loader_push_frame_offset` writes a + * meaningful int16 only for the first cell + * of a multi-cell value (i64 / f64 / v128); + * subsequent cells of the same value have + * unspecified frame_offset entries. The + * runtime walker copies one frame_lp cell + * per iteration, so its `param_cell_num` + * loop needs an offset array indexed by + * absolute cell number, not by frame_offset + * position. Build that here by walking + * params and synthesizing `(first, first+1, + * ..., first+param_cells-1)` for each one. 
*/ + base = loader_ctx->frame_offset + - func_type->param_cell_num; + for (pi = 0; pi < func_type->param_count; pi++) { + uint32 this_cells = + wasm_value_type_cell_num(func_type->types[pi]); + int16 first_slot = base[cell_so_far]; + for (c = 0; c < this_cells; c++) { + dst[cell_so_far + c] = (int16)(first_slot + c); + } + cell_so_far += this_cells; + } new_catches[entry->catch_count].param_dst_offsets = dst; } entry->catches = new_catches; From d481d9bc9bb0e3a5375df879affcf31b005e9807 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 08:50:15 -0700 Subject: [PATCH 09/16] fast-interp: result-typed try-region COPY-at-CATCH alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `try (result T)` regions now route the try body's normal-flow value into the block's `dynamic_offset` slot the same way `else` routes the if-body's value via `reserve_block_ret`. The throw- dispatch path's catch-body END already handled the catch's COPY via the existing reserve_block_ret call; this patch fills the remaining gap by injecting a COPY before each CATCH/CATCH_ALL label so the normal-flow exit (try body completes without throwing → falls through to CATCH → CATCH runtime handler jumps to end_of_region_pc) also deposits the value at the right slot. Loader (wasm_loader.c): * WASM_OP_CATCH and WASM_OP_CATCH_ALL: before the existing emit_uint32(eh_idx) emit, call `check_block_stack` on the previous body (the try body on the first CATCH; the prior catch body on subsequent ones) and emit an EXT_OP_COPY_STACK_TOP / _I64 / _V128 if the body's last cell isn't already at `cur_block->dynamic_offset`. The `src != dst` predicate runs in both passes; the sign-stable nature of dynamic_offset (≥ 0) vs const-pool slots (≤ -1) keeps pass-1 size accounting and pass-2 writes aligned even though const-pool slots get renumbered by the qsort/dedup at the start of pass 2. 
* Both cases now also `SET_CUR_BLOCK_STACK_POLYMORPHIC_STATE(false)` after `RESET_STACK()`, matching how `WASM_OP_ELSE` resets the if-body's polymorphic flag. Without this reset, a catch body following a throw inherits the polymorphic state and `check_block_stack` at END takes the polymorphic branch (`POP_OFFSET_TYPE` → 2 bytes per return-cell emitted). Those bytes land between the auto-emitted END label and the EH-END branch's `skip_label()`, shifting the re-emitted END label forward and leaving a corrupt handler-ptr at the recorded `handler_pc` — SIGSEGV on the first dispatch. Multi-return-value try-regions get an explicit "not yet supported" error; they need `EXT_OP_COPY_STACK_VALUES` emit support that's not in this commit. Single-return-value covers every shape Porffor / AS / our 51-case integration suite emits. 6 new result-typed integration tests (single i32 / i64, with and without throw, multi-catch picked by tag, catch_all fallback, mixed-with-locals slot allocation). Plus a wat fix in `multiple_catches_with_params_pick_by_tag`: the `catch $a` body left its param on the operand stack before the catch-to-catch transition. The previous loader didn't validate catch transitions, so this latent imbalance was silently accepted; now `check_block_stack` runs at every CATCH, catches the unbalanced stack, and reports the spec-required `type mismatch: block requires [] but stack has [i32]`. Added an explicit `drop` in the catch body so the test's wat validates clean. Verified end-to-end: 51/51 EH integration tests pass (was 45/45 before; +6 new result-typed cases). porf-accurate runs at 15.6 ms median (no regression vs the 17.3 ms baseline; small improvement plausibly from the polymorphic-reset path no longer emitting redundant POP_OFFSET_TYPE operands).
--- core/iwasm/interpreter/wasm_loader.c | 152 +++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index 88885a27e6..ba9122e4f7 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -12813,7 +12813,88 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, "Unexpected block sequence encountered."); goto fail; } + + /* Validate previous body's stack (try body on first + * CATCH, previous catch body on subsequent CATCH) + * matches the block's result type. Without this the + * loader would silently accept stack-shape mismatches + * between the try body and the catch bodies and the + * next op would read garbage. Same pattern as ELSE + * runs `check_block_stack` on the if-body before the + * else body's PUSH_TYPE sequence. */ + if (!check_block_stack(loader_ctx, cur_block, error_buf, + error_buf_size)) + goto fail; + #if WASM_ENABLE_FAST_INTERP != 0 + /* For result-typed try-regions, inject a COPY of the + * previous body's last value(s) into the block's + * `dynamic_offset` slot BEFORE the auto-emitted CATCH + * label. The normal-flow CATCH dispatch jumps from + * here to `end_of_region_pc` — the body's value would + * otherwise be lost. Mirrors how `reserve_block_ret` + * + `case WASM_OP_ELSE` align the if-body's result + * for the else-body's END to read. Layout becomes: + * + * [previous body ops...] + * [EXT_OP_COPY_STACK_TOP src=prev_top dst=dyn_off] + * [CATCH label][eh_idx][dst-slots from PUSH...] + * [catch body ops...] 
+ * + * The `src != dst` check runs in BOTH traverses so + * pass-1 size accounting matches pass-2 writes: + * `dynamic_offset` evolves identically in both + * passes, and although const-pool slots get + * renumbered between passes by the qsort/dedup at + * the start of pass 2, they stay strictly negative + * (offsets `-(count)..-1`) while `dynamic_offset` is + * strictly non-negative (`>= start_dynamic_offset = + * param_cell_num + local_cell_num`). So the + * predicate is sign-stable across passes. + * + * Multi-return-value try-regions need + * `EXT_OP_COPY_STACK_VALUES`; we error out + * explicitly until a follow-up commit lifts that + * restriction. Single-return covers every shape + * Porffor / AS / our integration tests emit. */ + { + uint8 *return_types = NULL; +#if WASM_ENABLE_GC == 0 + uint32 return_count = block_type_get_result_types( + &cur_block->block_type, &return_types); +#else + WASMRefTypeMap *return_reftype_maps = NULL; + uint32 return_reftype_map_count = 0; + uint32 return_count = block_type_get_result_types( + &cur_block->block_type, &return_types, + &return_reftype_maps, &return_reftype_map_count); +#endif + if (return_count == 1) { + uint8 cell = + (uint8)wasm_value_type_cell_num(return_types[0]); + int16 src = *(loader_ctx->frame_offset - cell); + int16 dst = cur_block->dynamic_offset; + if (src != dst) { + skip_label(); + if (cell == 4) + emit_label(EXT_OP_COPY_STACK_TOP_V128); + else if (cell == 2) + emit_label(EXT_OP_COPY_STACK_TOP_I64); + else + emit_label(EXT_OP_COPY_STACK_TOP); + emit_operand(loader_ctx, src); + emit_operand(loader_ctx, dst); + emit_label(opcode); + } + } + else if (return_count > 1) { + set_error_buf(error_buf, error_buf_size, + "multi-return try-region not " + "supported in fast interpreter"); + goto fail; + } + } + /* Emit `` after the auto-emitted CATCH * opcode. 
The runtime CATCH handler reads it to find * end_of_region_pc when the catch is reached via @@ -12835,6 +12916,23 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, /* RESET_STACK removes the values pushed in TRY or previous * CATCH Blocks */ RESET_STACK(); + /* Reset the polymorphic flag the way `WASM_OP_ELSE` + * does: the catch body is a freshly-reachable region, + * not a continuation of the (dead) try body after a + * throw. Without this reset, the catch body's END + * runs `check_block_stack` in polymorphic mode, which + * emits a `POP_OFFSET_TYPE` operand byte for each + * return-cell — those bytes land between the auto- + * emitted END label and the case body's + * `skip_label()`, shifting the re-emitted END label + * forward by `2 * return_cell_num` bytes and leaving + * a corrupt handler-ptr at the originally-recorded + * `handler_pc`. (The same bug latent in non-EH + * polymorphic blocks doesn't bite because their END + * gets stripped from the IR entirely; the EH path's + * runtime needs the END opcode to actually exist for + * the eh-stack pop.) */ + SET_CUR_BLOCK_STACK_POLYMORPHIC_STATE(false); #if WASM_ENABLE_GC != 0 WASMRefType *ref_type; @@ -12996,7 +13094,58 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, goto fail; } + /* Same previous-body-stack validation as in CATCH. */ + if (!check_block_stack(loader_ctx, cur_block, error_buf, + error_buf_size)) + goto fail; + #if WASM_ENABLE_FAST_INTERP != 0 + /* Same COPY-to-block-dynamic_offset shape as CATCH + * (see the long comment in the CATCH case for the + * rationale and pass-1/pass-2 alignment argument). + * catch_all is the only place the body-COPY can run + * for a try with a result-type and only a catch_all, + * so without this emit a result-typed + * `try (result T) ... catch_all` would lose the try + * body's value on the normal-flow path. 
*/ + { + uint8 *return_types = NULL; +#if WASM_ENABLE_GC == 0 + uint32 return_count = block_type_get_result_types( + &cur_block->block_type, &return_types); +#else + WASMRefTypeMap *return_reftype_maps = NULL; + uint32 return_reftype_map_count = 0; + uint32 return_count = block_type_get_result_types( + &cur_block->block_type, &return_types, + &return_reftype_maps, &return_reftype_map_count); +#endif + if (return_count == 1) { + uint8 cell = + (uint8)wasm_value_type_cell_num(return_types[0]); + int16 src = *(loader_ctx->frame_offset - cell); + int16 dst = cur_block->dynamic_offset; + if (src != dst) { + skip_label(); + if (cell == 4) + emit_label(EXT_OP_COPY_STACK_TOP_V128); + else if (cell == 2) + emit_label(EXT_OP_COPY_STACK_TOP_I64); + else + emit_label(EXT_OP_COPY_STACK_TOP); + emit_operand(loader_ctx, src); + emit_operand(loader_ctx, dst); + emit_label(opcode); + } + } + else if (return_count > 1) { + set_error_buf(error_buf, error_buf_size, + "multi-return try-region not " + "supported in fast interpreter"); + goto fail; + } + } + /* Emit `` after the auto-emitted CATCH_ALL * opcode in BOTH traverses (pass 1's size accounting * must include this or pass 2 overruns @@ -13023,6 +13172,9 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, /* RESET_STACK removes the values pushed in TRY or previous * CATCH Blocks */ RESET_STACK(); + /* Same polymorphic reset as `WASM_OP_CATCH` — see the + * matching comment there for the rationale. 
*/ + SET_CUR_BLOCK_STACK_POLYMORPHIC_STATE(false); /* catch_all has no tagtype and therefore no parameters */ break; From 924217e63fbd299e0c5c800cc94f23c1d906e66b Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 10:29:47 -0700 Subject: [PATCH 10/16] fast-interp: warn on br/br_if/br_table across try-region boundary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a load-time warning when a br / br_if / br_table opcode crosses one or more LABEL_TYPE_TRY / _CATCH / _CATCH_ALL frames, because the runtime br doesn't pop the eh-stack — each crossed try-region leaks one eh-stack entry that survives until frame teardown. The simple case (single br out of a try; e.g. the `br_out_of_try_pops_eh_stack` integration test) is benign: the per-frame eh-stack reservation (`exception_handler_count * EH_ENTRY_CELLS` cells, covering every static try-block in the function) leaves room for one stale entry alongside any subsequent sibling try's push, and the top-down walker iterates from `eh_count` down so sibling-try throws still match the most recent push first. The stale entry dies when the frame is freed at function return. The pathological case — `loop { try { br_to_loop_top } catch }` — leaks one entry per iteration and eventually overflows the static reservation. `bh_assert(eh_count < exception_handler_count)` would catch this, but `bh_assert` is a no-op in release builds (`BH_DEBUG` is unset there), so the out-of-bounds writes go through silently. The warning surfaces the shape in load-time diagnostics so a real embedder sees it before the hard-to-diagnose runtime corruption. `count_try_blocks_crossed(cur_block, target_block)` walks csp positions from cur_block down to target_block inclusive (target included because br to a non-LOOP target lands AFTER target's end, skipping it; LOOP targets aren't try-typed so the inclusive vs exclusive distinction doesn't change the count).
The check fires only in pass 1 (`loader_ctx->p_code_compiled == NULL`) so each br site logs once even though wasm_loader_prepare_bytecode runs the bytecode twice. No hot-op cost — this is loader-time only. Verified: porf-accurate doesn't trigger the warning (no br-across-try patterns in the Porffor emit shape, consistent with the PMU profile showing zero hot-op overhead from EH). `br_out_of_try_pops_eh_stack` integration test triggers the warning once and still passes. --- core/iwasm/interpreter/wasm_loader.c | 85 ++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index ba9122e4f7..a20cf80012 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -11307,6 +11307,39 @@ check_branch_block(WASMLoaderContext *loader_ctx, uint8 **p_buf, uint8 *buf_end, } #if WASM_ENABLE_EXCE_HANDLING != 0 +/* Returns the number of LABEL_TYPE_TRY / _CATCH / _CATCH_ALL + * frames whose END the runtime br will SKIP — i.e. the count of + * such frames at csp positions `cur_block` down to `target_block` + * inclusive (target_block included because br to a non-LOOP + * target lands AFTER target's end, skipping it; LOOP targets + * aren't try-typed so the inclusive vs exclusive distinction + * doesn't matter for them). The runtime br jumps directly to the + * target's resolved pc without decrementing `frame->eh_count`, + * so each such frame represents one stale eh-stack entry that + * survives the br. A single leaked entry is benign — frame + * allocation reserves `exception_handler_count * EH_ENTRY_CELLS` + * cells, the walker iterates top-down so sibling-try throws + * still match correctly, and the stale entry dies at frame + * teardown. 
But a br to a surrounding LOOP re-pushes one entry + * every iteration, eventually overflowing the static reservation; + * the resulting out-of-bounds writes go through silently in + * release builds (`bh_assert` is a no-op without `BH_DEBUG`). + * Caller logs a warning so the shape shows up in load-time + * diagnostics. */ +static uint32 +count_try_blocks_crossed(BranchBlock *cur_block, BranchBlock *target_block) +{ + BranchBlock *b; + uint32 count = 0; + for (b = cur_block; b >= target_block; b--) { + if (b->label_type == LABEL_TYPE_TRY || b->label_type == LABEL_TYPE_CATCH + || b->label_type == LABEL_TYPE_CATCH_ALL) { + count++; + } + } + return count; +} + static BranchBlock * check_branch_block_for_delegate(WASMLoaderContext *loader_ctx, uint8 **p_buf, uint8 *buf_end, char *error_buf, @@ -13399,6 +13432,29 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, error_buf, error_buf_size))) goto fail; +#if WASM_ENABLE_EXCE_HANDLING != 0 && WASM_ENABLE_FAST_INTERP != 0 + /* Warn (pass 1 only — once per br site) when a br + * skips over a try-region's END. The runtime br + * doesn't pop eh-stack entries, so each leaked entry + * relies on the static + * `exception_handler_count * EH_ENTRY_CELLS` cell + * reservation per frame to absorb it. Pathological + * shape: `loop { try { br_to_loop_top } catch }` + * leaks one entry per iteration and eventually writes + * past that reservation. See `count_try_blocks_ + * crossed` for the full mechanism. 
*/ + if (loader_ctx->p_code_compiled == NULL) { + uint32 leaked = count_try_blocks_crossed( + loader_ctx->frame_csp - 1, frame_csp_tmp); + if (leaked > 0) { + LOG_WARNING("wasm fast-interp: br at func[%u] crosses " + "%u try-region(s); each leaks one " + "eh-stack entry until frame teardown", + cur_func_idx, leaked); + } + } +#endif + RESET_STACK(); SET_CUR_BLOCK_STACK_POLYMORPHIC_STATE(true); break; @@ -13413,6 +13469,20 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, error_buf, error_buf_size))) goto fail; +#if WASM_ENABLE_EXCE_HANDLING != 0 && WASM_ENABLE_FAST_INTERP != 0 + if (loader_ctx->p_code_compiled == NULL) { + uint32 leaked = count_try_blocks_crossed( + loader_ctx->frame_csp - 1, frame_csp_tmp); + if (leaked > 0) { + LOG_WARNING( + "wasm fast-interp: br_if at func[%u] crosses " + "%u try-region(s); each leaks one " + "eh-stack entry until frame teardown", + cur_func_idx, leaked); + } + } +#endif + break; } @@ -13479,6 +13549,21 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, goto fail; } +#if WASM_ENABLE_EXCE_HANDLING != 0 && WASM_ENABLE_FAST_INTERP != 0 + if (loader_ctx->p_code_compiled == NULL) { + uint32 leaked = count_try_blocks_crossed( + loader_ctx->frame_csp - 1, frame_csp_tmp); + if (leaked > 0) { + LOG_WARNING( + "wasm fast-interp: br_table[%u] at " + "func[%u] crosses %u try-region(s); each " + "leaks one eh-stack entry until frame " + "teardown", + i, cur_func_idx, leaked); + } + } +#endif + #if WASM_ENABLE_FAST_INTERP == 0 if (br_table_cache) { br_table_cache->br_depths[i] = depth; From 5b122fe8deb9c71bb610f99dd834b7bbf1c69efd Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 11:14:47 -0700 Subject: [PATCH 11/16] fast-interp: __builtin_expect cold-path hints on CALL_INDIRECT bounds checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marks the four structurally-cold paths in WASM_OP_CALL_INDIRECT — out-of-bounds table index, 
uninitialized element, unknown function (post-table lookup), indirect-call type mismatch — with `__builtin_expect(cond, 0)`. Well-formed wasm modules pass all four on every dispatched CALL_INDIRECT; the hint lets the compiler: (a) provide a static-bias fallback for the branch predictor on unseen call sites (first-iteration impact only — Apple Silicon's predictor learns the bias dynamically after a few hits anyway); (b) lay out the error-handling tail away from the hot path so each pass-through case stays in straight-line I-cache. Measured on iPhone 12 (A14, Icestorm E-cores) with the graphql-validation workloads — bucket-share deltas are within run-to-run noise on both Porffor and AS, but the Porffor bottleneck is `Processing` (56.78%, backend / load-store saturation) not branch prediction (4.19% Discarded). AS's E-core shows the structural opportunity (27.22% Discarded) but that's the goto-indirect-branch in FETCH_OPCODE_AND_DISPATCH, not the direct branches inside CALL_INDIRECT. Kept as documentation-as-code: the cold-path semantic is real (spec-required traps that ~never fire on validated modules), and the compiler-time cost is zero. Full PMU writeup in out/eh-pmu-iphone12-2026-05-18.md (gitignored). No correctness change. No hot-op runtime cost. Doesn't affect EH code paths. --- core/iwasm/interpreter/wasm_interp_fast.c | 34 +++++++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 1f587b1032..f9195befbe 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -1813,7 +1813,27 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, val = GET_OPERAND(uint32, I32, 0); frame_ip += 2; - if ((uint32)val >= tbl_inst->cur_size) { + /* Bounds / null / type-mismatch checks below are + * structurally cold paths — well-formed wasm modules + * pass them on every dispatched CALL_INDIRECT. 
Marking + * them `__builtin_expect(cond, 0)` lets the compiler + * (a) hint the branch predictor with a static-bias + * fallback for unseen call sites, and (b) lay out the + * error-handling tail away from the hot path so each + * fall-through case stays in one straight-line I-cache + * line. Apple Silicon E-cores (Icestorm, iPhone 12) + * showed ~27 % `Discarded` (bad-spec / mispredict) + * on the AS variant of graphql-validation under + * fast-interp, where megamorphic vtable dispatch + * hits CALL_INDIRECT thousands of times; the layout + * hint matters more than the branch hint on Apple's + * sophisticated predictor. PMU bucket shares stay + * within run-to-run noise on both Porffor and AS + * graphql-validation workloads, so the change is + * documentation-as-code more than a speedup — + * keep it because the cold-path semantic is real + * and the cost is zero. */ + if (__builtin_expect((uint32)val >= tbl_inst->cur_size, 0)) { wasm_set_exception(module, "undefined element"); goto got_exception; } @@ -1821,13 +1841,13 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, /* clang-format off */ #if WASM_ENABLE_GC == 0 fidx = (uint32)tbl_inst->elems[val]; - if (fidx == (uint32)-1) { + if (__builtin_expect(fidx == (uint32)-1, 0)) { wasm_set_exception(module, "uninitialized element"); goto got_exception; } #else func_obj = (WASMFuncObjectRef)tbl_inst->elems[val]; - if (!func_obj) { + if (__builtin_expect(!func_obj, 0)) { wasm_set_exception(module, "uninitialized element"); goto got_exception; } @@ -1840,7 +1860,7 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, * another module. 
in that case, we don't validate * the elem value while loading */ - if (fidx >= module->e->function_count) { + if (__builtin_expect(fidx >= module->e->function_count, 0)) { wasm_set_exception(module, "unknown function"); goto got_exception; } @@ -1855,12 +1875,14 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, /* clang-format off */ #if WASM_ENABLE_GC == 0 - if (cur_type != cur_func_type) { + if (__builtin_expect(cur_type != cur_func_type, 0)) { wasm_set_exception(module, "indirect call type mismatch"); goto got_exception; } #else - if (!wasm_func_type_is_super_of(cur_type, cur_func_type)) { + if (__builtin_expect( + !wasm_func_type_is_super_of(cur_type, cur_func_type), + 0)) { wasm_set_exception(module, "indirect call type mismatch"); goto got_exception; } From 04625e71590708636318fe06d91ad1586eb6ed5d Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 16:09:03 -0700 Subject: [PATCH 12/16] test_wamr.sh: enable wasm spec EH suite for fast-interp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The legacy exception-handling spec test suite was previously hardcoded to skip every running mode except classic-interp: if [[ "${RUNNING_MODE}" != "classic-interp" ]]; then echo "support exception handling in classic-interp" return 0 fi Now that fast-interp supports the full legacy-EH proposal (TRY / CATCH / CATCH_ALL / RETHROW / DELEGATE / tag-with-params), the gate should allow both modes. This matches the parallel `ENABLE_GC` block a few lines down that already lists `classic-interp` AND `fast-interp` as acceptable. After this change, `./test_wamr.sh -t fast-interp -m exception-handling` runs the upstream WebAssembly spec EH suite against the fast interpreter — the same suite already validated against classic interp. 
--- tests/wamr-test-suites/test_wamr.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/wamr-test-suites/test_wamr.sh b/tests/wamr-test-suites/test_wamr.sh index 97dc84d548..c4b6efa903 100755 --- a/tests/wamr-test-suites/test_wamr.sh +++ b/tests/wamr-test-suites/test_wamr.sh @@ -919,8 +919,9 @@ function do_execute_in_running_mode() # keep alpha order if [[ ${ENABLE_EH} -eq 1 ]]; then - if [[ "${RUNNING_MODE}" != "classic-interp" ]]; then - echo "support exception handling in classic-interp" + if [[ "${RUNNING_MODE}" != "classic-interp" \ + && "${RUNNING_MODE}" != "fast-interp" ]]; then + echo "support exception handling in classic-interp and fast-interp" return 0; fi fi From 57f4169ee17a336192e9f6bdf74a526aae792db0 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 22:12:52 -0700 Subject: [PATCH 13/16] fast-interp: unwind skipped EH entries on outer-catch dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a throw from a nested try is caught by an OUTER handler, the walker previously left the inner-try entries between the throw site and the matched outer entry on the eh-stack. The matched entry got its `EH_TRY_CATCH_STATE_BIT` set, but `frame->eh_count` stayed unchanged. After the outer catch body's END decremented eh_count by one, the inner-try slot remained at the top of the eh-stack with the matched outer entry now sitting *under* it (in-progress bit set). A subsequent throw inside (or after) the outer catch body would walk that stale state. The walker SKIPs entries with the state bit set, so the outer entry was correctly ignored — but the inner-try entry (no state bit) was treated as live. If the inner try's typed catch happened to match the new tag, the walker dispatched against that stale entry — an out-of-scope catch. Worse, in a tight loop of `outer try { inner try { throw } catch_other catch_outer { ... } }`, every iteration leaked one inner-try entry. 
After more iterations than the function's `exception_handler_count`, the next TRY push wrote past the static eh-stack reservation (silently in release builds since `bh_assert` is a no-op without `BH_DEBUG`). Fix: at each match-and-dispatch site in `find_a_catch_handler` — both the typed-catch branch and the catch_all branch — set `frame->eh_count = i;` before jumping to the handler. `i` is the loop counter, which equals the index of the matched entry plus one. This pops the nested-try entries above the match in a single indexed store. The matched entry stays at index i-1 with its state bit set; the catch body's END pops it normally when the body completes. Cost shape: one extra indexed store on the cold throw path, only when a typed catch or catch_all matches. CALL / LOAD / STORE handlers are untouched. Test added in the external integration suite at `crates/benchmark-core/tests/eh_correctness.rs::outer_catch_unwinds_inner_eh_entries`. The test pattern is: outer try catches `$err`; inner try has a catch for `$err2`. Inner throw of `$err` is caught by outer. Outer catch body re-throws `$err2`, which must propagate UNCAUGHT (inner try is out of scope). Pre-fix walker found the stale inner catch and dispatched to it, producing an Ok(99) instead of the trap; post-fix the walker has no in-scope entries and the throw escapes correctly. Codex P1 review feedback on rebeckerspecialties/wasm-micro-runtime PR #2: "Unwind skipped EH entries before dispatching catches". 
--- core/iwasm/interpreter/wasm_interp_fast.c | 25 +++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index f9195befbe..ee9cd4bd13 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -2036,6 +2036,25 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, frame_lp[throw_src_offsets[c]]; } } + /* Pop the inner eh-stack entries that the + * throw is jumping past. When the match is + * at the topmost entry this is a no-op + * (i == frame->eh_count). When the match is + * an outer entry, the nested-try entries + * above it (indices i .. eh_count-1) are + * out of scope after the catch-dispatch; + * leaving them counted would let a + * subsequent throw inside the catch body + * see stale in-scope entries (and a tight + * loop of throw → outer-catch → throw + * would eventually overflow the fixed + * reservation). The matched entry stays + * at index i-1 with its state bit set; the + * catch body's END pops it when it + * completes. Cost: one indexed store on + * the cold throw path; CALL / LOAD / STORE + * untouched. */ + frame->eh_count = i; frame_ip = entry->catches[j].handler_pc; HANDLE_OP_END(); } @@ -2048,6 +2067,12 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, * documented as a known limitation. */ cells[0] = packed | EH_TRY_CATCH_STATE_BIT; cells[1] = exception_tag_index; + /* Same unwind as the typed-catch path above — + * pop any nested-try entries the throw is + * jumping past so a subsequent throw inside + * this catch_all body doesn't dispatch + * against stale inner entries. 
*/ + frame->eh_count = i; frame_ip = entry->catch_all_pc; HANDLE_OP_END(); } From 54bac97f9dde4205d937f4c163d25d08230b1fc3 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 22:13:08 -0700 Subject: [PATCH 14/16] fast-interp: trap on cross-function exception payload propagation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The walker's "no handler in this frame" path previously set `prev_frame->exception_raised = true` and let `return_func` forward the throw to the caller, regardless of payload size. This silently lost the payload: the source cells (`throw_src_offsets`) live in *this* frame's `frame_lp`, which return_func is about to tear down. The caller's `find_a_catch_handler` then ran with `throw_param_cell_num = 0`, which made any typed catch in the caller bind uninitialized destination slots — the catch body would either see garbage in its payload locals or, if the typed catch's slots were used as struct-of-pointers, dereference freed memory. Cross-function payload preservation would require a per-thread scratch buffer to ferry the payload across the frame boundary (callee's frame_lp → buffer → caller's frame_lp), plus a small change to return_func to populate it before tearing down the callee. That's a meaningful design lift and out of scope for this commit. Safe action for now: when a payload-bearing throw escapes its callee (i.e. `throw_param_cell_num > 0` and we're about to return to a caller frame), trap to the host with the diagnostic `"cross-function exception payload not supported by fast-interp"`. Same-function payload routing (the common Porffor / AS shape, where a JS throw is caught by an in-function catch the JS-to-wasm compiler emitted) is unaffected — that path dispatches via the same-function match in the walker before this branch runs. 
A `catch_all` in the caller would technically tolerate a zero-payload bind, but the typed-vs-catch_all choice happens in the caller's walker, which we can't peek into here without coupling the frames. Trap unconditionally for payload-bearing cross-frame throws. Tests: * `cross_function_tag_with_params` stays `#[ignore]` — that's the eventual-success-case for when cross-frame payload routing is implemented. * `cross_function_tag_with_params_traps` (new) asserts the current trap-with-expected-message contract on the same module shape. Codex P1 review feedback on rebeckerspecialties/wasm-benchmark PR #3 (patch 0007 line 306): "Preserve cross-frame exception payloads". --- core/iwasm/interpreter/wasm_interp_fast.c | 30 ++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index ee9cd4bd13..d44e21ed40 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -2083,8 +2083,36 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, * re-enters this label with the caller's frame in * scope. If we're already at the top of the wasm * stack, the existing got_exception path lets the - * host observe the trap via wasm_runtime_get_exception. */ + * host observe the trap via wasm_runtime_get_exception. + * + * Tag-with-params payload is intentionally NOT + * preserved across the frame boundary: the source + * cells (throw_src_offsets) live in *this* frame's + * frame_lp, which return_func is about to tear down. + * A caller-side typed catch would then bind + * uninitialized destination slots, producing wrong + * results in the catch body (or, if the typed catch + * uses the slots as a struct-of-pointers, memory + * corruption). The safe action when a payload- + * bearing throw escapes its callee is to trap to the + * host with a clear diagnostic. 
Same-function + * payload routing (the common Porffor / AS shape) + * is unaffected — it dispatches via the loop above + * before this branch runs. catch_all in the caller + * would technically tolerate a zero-payload bind, + * but the typed-vs-catch_all choice happens in the + * caller's walker, which we can't peek into here + * without coupling the frames; trap unconditionally + * for payload-bearing throws and let the test + * `cross_function_tag_with_params` document the + * shape. */ if (prev_frame && prev_frame->ip) { + if (throw_param_cell_num > 0) { + wasm_set_exception(module, + "cross-function exception payload " + "not supported by fast-interp"); + goto got_exception; + } prev_frame->tag_index = exception_tag_index; prev_frame->exception_raised = true; goto return_func; From 231862aa6455dc4142e4701759e7150b6589516f Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Mon, 18 May 2026 22:13:24 -0700 Subject: [PATCH 15/16] fast-interp: reject br/br_if/br_table to loop entry from inside try-region MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a br skips over a try-region's END, the runtime br doesn't pop eh-stack entries. For a one-shot br to a block / function-end / catch, the leaked entry is absorbed by the static `exception_handler_count * EH_ENTRY_CELLS` reservation and dies at frame teardown — a load-time `LOG_WARNING` surfaces the shape for embedders. If the br target is a LOOP entry, however, every iteration's TRY push adds one more entry to the eh-stack. After more iterations than the function's `exception_handler_count`, the next TRY push writes past the static reservation. `bh_assert(eh_count < count)` catches this in debug builds, but is a no-op without `BH_DEBUG` — release builds silently corrupt whatever sat past the reservation in the frame allocation. This commit changes that pathological shape from "log a warning and accept" to "fail load with an explicit error". 
The check sits next to the existing `count_try_blocks_crossed > 0` warning at all three branch sites (BR, BR_IF, BR_TABLE) and only fires when `frame_csp_tmp->label_type == LABEL_TYPE_LOOP`. The error message is identical at each site modulo opcode name: "br[_if|_table] to loop entry from inside try-region not supported in fast interpreter (would leak eh-stack entries per iteration)" Emitting a synthetic eh-stack pop at the br site would be the other fix and would let valid modules with this shape run, but it complicates the rewritten IR's br-info layout (the br dispatch currently emits a single uint32 depth; a pop-count immediate would need a per-target lookup) and the shape is rare in practice. Rejecting at load is the conservative, App-Store-safe choice — embedders see a deterministic error rather than silent memory corruption. Test added in the external integration suite: the previously- ignored `br_out_of_try_inside_loop` became `br_out_of_try_inside_loop_rejected`, which asserts the loader fails with the expected error string. Codex P1 review feedback on both PRs ("Reject branches that leak EH entries" / "Reject branches that leak EH stack entries"). --- core/iwasm/interpreter/wasm_loader.c | 67 +++++++++++++++++++++------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index a20cf80012..6f83a3715a 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -13433,20 +13433,37 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, goto fail; #if WASM_ENABLE_EXCE_HANDLING != 0 && WASM_ENABLE_FAST_INTERP != 0 - /* Warn (pass 1 only — once per br site) when a br - * skips over a try-region's END. The runtime br - * doesn't pop eh-stack entries, so each leaked entry - * relies on the static - * `exception_handler_count * EH_ENTRY_CELLS` cell - * reservation per frame to absorb it. 
Pathological - * shape: `loop { try { br_to_loop_top } catch }` - * leaks one entry per iteration and eventually writes - * past that reservation. See `count_try_blocks_ - * crossed` for the full mechanism. */ - if (loader_ctx->p_code_compiled == NULL) { + /* When a br skips over a try-region's END, the + * runtime br doesn't pop eh-stack entries. For a + * one-shot br to a block / function-end / catch, + * the leaked entry is absorbed by the static + * `exception_handler_count * EH_ENTRY_CELLS` + * reservation and dies at frame teardown — log + * a warning so the shape shows up in load-time + * diagnostics, but accept the module. + * + * If the br target is a LOOP entry, however, + * every iteration's TRY push adds one more entry + * to the eh-stack and eventually overwrites past + * the static reservation (silently in release + * builds since `bh_assert` is a no-op without + * `BH_DEBUG`). Reject those modules at load time + * — emitting cleanup at the br site would be the + * other fix, but it complicates the hot dispatch + * loop and the shape is rare in practice. 
*/ + { uint32 leaked = count_try_blocks_crossed( loader_ctx->frame_csp - 1, frame_csp_tmp); - if (leaked > 0) { + if (leaked > 0 + && frame_csp_tmp->label_type == LABEL_TYPE_LOOP) { + set_error_buf(error_buf, error_buf_size, + "br to loop entry from inside " + "try-region not supported in fast " + "interpreter (would leak eh-stack " + "entries per iteration)"); + goto fail; + } + if (leaked > 0 && loader_ctx->p_code_compiled == NULL) { LOG_WARNING("wasm fast-interp: br at func[%u] crosses " "%u try-region(s); each leaks one " "eh-stack entry until frame teardown", @@ -13470,10 +13487,19 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, goto fail; #if WASM_ENABLE_EXCE_HANDLING != 0 && WASM_ENABLE_FAST_INTERP != 0 - if (loader_ctx->p_code_compiled == NULL) { + { uint32 leaked = count_try_blocks_crossed( loader_ctx->frame_csp - 1, frame_csp_tmp); - if (leaked > 0) { + if (leaked > 0 + && frame_csp_tmp->label_type == LABEL_TYPE_LOOP) { + set_error_buf(error_buf, error_buf_size, + "br_if to loop entry from inside " + "try-region not supported in fast " + "interpreter (would leak eh-stack " + "entries per iteration)"); + goto fail; + } + if (leaked > 0 && loader_ctx->p_code_compiled == NULL) { LOG_WARNING( "wasm fast-interp: br_if at func[%u] crosses " "%u try-region(s); each leaks one " @@ -13550,10 +13576,19 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func, } #if WASM_ENABLE_EXCE_HANDLING != 0 && WASM_ENABLE_FAST_INTERP != 0 - if (loader_ctx->p_code_compiled == NULL) { + { uint32 leaked = count_try_blocks_crossed( loader_ctx->frame_csp - 1, frame_csp_tmp); - if (leaked > 0) { + if (leaked > 0 + && frame_csp_tmp->label_type == LABEL_TYPE_LOOP) { + set_error_buf(error_buf, error_buf_size, + "br_table to loop entry from inside " + "try-region not supported in fast " + "interpreter (would leak eh-stack " + "entries per iteration)"); + goto fail; + } + if (leaked > 0 && loader_ctx->p_code_compiled == NULL) { LOG_WARNING( "wasm 
fast-interp: br_table[%u] at " "func[%u] crosses %u try-region(s); each " From 0411662d4c2f52625a356ff0583c9d2d970cb00f Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 12:57:46 -0700 Subject: [PATCH 16/16] fixup: MSVC `__builtin_expect` shim for the cold-path hints Windows MSVC build of upstream PR #4949 failed with `LNK2019: unresolved external symbol __builtin_expect` because `__builtin_expect` is a GCC/Clang builtin and MSVC has nothing equivalent. The branch-predictor hints are an optimization, not correctness, so the simplest portable fix is a no-op fallback gated on `!defined(__GNUC__) && !defined(__clang__)`. Lives at the top of `wasm_interp_fast.c` rather than in `bh_platform.h` to avoid touching the shared header for a local cold-path concern. --- core/iwasm/interpreter/wasm_interp_fast.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index d44e21ed40..31343f5723 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -25,6 +25,15 @@ #include "simde/wasm/simd128.h" #endif +/* MSVC has no `__builtin_expect`; the cold-path hints below are + * GCC/Clang only. Provide a no-op fallback so the loop still + * compiles on the Windows MSVC build. Branch-predictor hints are + * an optimization, not correctness, so dropping them on MSVC is + * fine. */ +#if !defined(__GNUC__) && !defined(__clang__) +#define __builtin_expect(expr, expected) (expr) +#endif + typedef int32 CellType_I32; typedef int64 CellType_I64; typedef float32 CellType_F32;