From 9f511cad1830748e1da64cf63ab2ff35d16a0e48 Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 20:54:44 -0300 Subject: [PATCH 1/9] Add autoresearch setup for heapsort benchmark --- autoresearch.md | 36 ++++++++++++++++++++++++++++++++++ autoresearch.sh | 51 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 autoresearch.md create mode 100755 autoresearch.sh diff --git a/autoresearch.md b/autoresearch.md new file mode 100644 index 0000000..94ac72e --- /dev/null +++ b/autoresearch.md @@ -0,0 +1,36 @@ +# Autoresearch: reduce Wasmoon heapsort benchmark time + +## Objective +Optimize the Wasmoon runtime path used by the plain heapsort benchmark: load the Lua module once, create a fresh state per iteration, load `bench/heapsort.lua`, execute it, and call the returned function. The goal is to reduce average runtime for this benchmark on the current machine. + +## Metrics +- **Primary**: wasmoon_heapsort_avg_ms (ms, lower is better) +- **Secondary**: wasmoon_heapsort_stddev_ms, iterations, warmup + +## How to Run +`./autoresearch.sh` — builds the project, runs a focused benchmark, and prints `METRIC name=value` lines. 
+ +## Files in Scope +- `src/module.ts` — JS↔C binding wrappers and helper utilities around `ccall` +- `src/thread.ts` — stack operations, string loading, execution helpers +- `src/global.ts` — state creation and global helpers +- `src/engine.ts` — engine setup and state lifecycle +- `src/type-extensions/*.ts` — only if profiling suggests extension registration / value conversion overhead matters +- `bench/heapsort.lua` — benchmark workload, read-only unless a benchmark bug is found +- `autoresearch.sh` — benchmark driver +- `autoresearch.md` — session state and findings +- `autoresearch.ideas.md` — backlog for promising ideas + +## Off Limits +- `lua/` C sources and wasm build artifacts for this session +- public API behavior changes unless benchmark gains are substantial and correctness is preserved +- new dependencies + +## Constraints +- Keep benchmark semantics the same: fresh state, load heapsort script, execute returned function +- No new dependencies +- Prefer simple changes with measurable wins +- Avoid benchmark-only cheats that would not help real users + +## What's Been Tried +- Initial setup only. No experiments yet. 
diff --git a/autoresearch.sh b/autoresearch.sh new file mode 100755 index 0000000..8ac09de --- /dev/null +++ b/autoresearch.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -euo pipefail + +npm run build >/dev/null + +node --input-type=module <<'EOF' +import { readFileSync } from 'node:fs' +import path from 'node:path' +import { performance } from 'node:perf_hooks' +import { fileURLToPath, pathToFileURL } from 'node:url' + +const root = process.cwd() +const heapsort = readFileSync(path.join(root, 'bench', 'heapsort.lua'), 'utf8') +const distIndex = pathToFileURL(path.join(root, 'dist', 'index.js')).href +const { Lua } = await import(distIndex) + +function stats(times) { + const avg = times.reduce((sum, t) => sum + t, 0) / times.length + const variance = times.reduce((sum, t) => sum + (t - avg) ** 2, 0) / times.length + return { avg, stddev: Math.sqrt(variance) } +} + +const iterations = 60 +const warmup = 8 +const lua = await Lua.load() + +async function runIteration() { + const state = lua.createState() + state.global.lua.luaL_loadstring(state.global.address, heapsort) + state.global.lua.lua_callk(state.global.address, 0, 1, 0, null) + state.global.lua.lua_callk(state.global.address, 0, 0, 0, null) + state.global.close() +} + +for (let i = 0; i < warmup; i++) { + await runIteration() +} + +const times = [] +for (let i = 0; i < iterations; i++) { + const start = performance.now() + await runIteration() + times.push(performance.now() - start) +} + +const { avg, stddev } = stats(times) +console.log(`METRIC wasmoon_heapsort_avg_ms=${avg.toFixed(6)}`) +console.log(`METRIC wasmoon_heapsort_stddev_ms=${stddev.toFixed(6)}`) +console.log(`METRIC iterations=${iterations}`) +console.log(`METRIC warmup=${warmup}`) +EOF From 0411406e8dc9e48e697aa0a924441912d20e3029 Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 20:55:21 -0300 Subject: [PATCH 2/9] Baseline: focused heapsort benchmark via autoresearch.sh (build + fresh state + loadstring + execute returned 
function)\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":14.775872} --- autoresearch.jsonl | 1 + 1 file changed, 1 insertion(+) create mode 100644 autoresearch.jsonl diff --git a/autoresearch.jsonl b/autoresearch.jsonl new file mode 100644 index 0000000..3088fbb --- /dev/null +++ b/autoresearch.jsonl @@ -0,0 +1 @@ +{"type":"config","name":"Reduce Wasmoon heapsort benchmark time","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} From 9c864c74dbaf145481ca672ba306632afa61fa17 Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 20:56:49 -0300 Subject: [PATCH 3/9] Bind luaL_loadstring with string|number so long Lua chunks use a direct UTF-8 buffer path instead of generic ccall string marshaling\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":13.135759} --- autoresearch.jsonl | 1 + src/module.ts | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 3088fbb..96a15ce 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -1 +1,2 @@ {"type":"config","name":"Reduce Wasmoon heapsort benchmark time","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} +{"run":1,"commit":"0411406","metric":14.775872,"metrics":{},"status":"keep","description":"Baseline: focused heapsort benchmark via autoresearch.sh (build + fresh state + loadstring + execute returned function)","timestamp":1773446121099,"segment":0} diff --git a/src/module.ts b/src/module.ts index a4bde12..5b694b2 100755 --- a/src/module.ts +++ b/src/module.ts @@ -330,7 +330,7 @@ export default class LuaModule { this.luaL_unref = this.cwrap('luaL_unref', null, ['number', 'number', 'number']) this.luaL_loadfilex = this.cwrap('luaL_loadfilex', 'number', ['number', 'string', 'string']) this.luaL_loadbufferx = this.cwrap('luaL_loadbufferx', 'number', ['number', 'string|number', 'number', 'string|number', 'string']) - this.luaL_loadstring = this.cwrap('luaL_loadstring', 
'number', ['number', 'string']) + this.luaL_loadstring = this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) this.luaL_newstate = this.cwrap('luaL_newstate', 'number', []) this.luaL_len = this.cwrap('luaL_len', 'number', ['number', 'number']) this.luaL_addgsub = this.cwrap('luaL_addgsub', null, ['number', 'string', 'string', 'string']) From 085fc65ebb188c346ce4572a557853206cfbe9ce Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 20:58:34 -0300 Subject: [PATCH 4/9] Call exported wasm functions directly for lua_callk/lua_pcallk when available, bypassing ccall on hot numeric-only call paths\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":12.001671} --- autoresearch.jsonl | 2 ++ autoresearch.md | 4 +++- src/module.ts | 30 +++++++++++++++++++++++++++--- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 96a15ce..11b71b9 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -1,2 +1,4 @@ {"type":"config","name":"Reduce Wasmoon heapsort benchmark time","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} {"run":1,"commit":"0411406","metric":14.775872,"metrics":{},"status":"keep","description":"Baseline: focused heapsort benchmark via autoresearch.sh (build + fresh state + loadstring + execute returned function)","timestamp":1773446121099,"segment":0} +{"run":2,"commit":"9c864c7","metric":13.135759,"metrics":{},"status":"keep","description":"Bind luaL_loadstring with string|number so long Lua chunks use a direct UTF-8 buffer path instead of generic ccall string marshaling","timestamp":1773446209778,"segment":0} +{"run":3,"commit":"9c864c7","metric":13.838148,"metrics":{},"status":"discard","description":"Tried routing long luaL_loadstring calls through luaL_loadbufferx to avoid C strlen, but the extra JS-side length/allocation work regressed the heapsort benchmark","timestamp":1773446266277,"segment":0} diff --git a/autoresearch.md 
b/autoresearch.md index 94ac72e..b5b0927 100644 --- a/autoresearch.md +++ b/autoresearch.md @@ -33,4 +33,6 @@ Optimize the Wasmoon runtime path used by the plain heapsort benchmark: load the - Avoid benchmark-only cheats that would not help real users ## What's Been Tried -- Initial setup only. No experiments yet. +- Baseline from `./autoresearch.sh`: `14.775872ms` average over 60 iterations / 8 warmup. +- `src/module.ts`: changed `luaL_loadstring` binding from `['number', 'string']` to `['number', 'string|number']` so large chunks can use the optimized direct-buffer path in `cwrap`. This improved the benchmark to `13.135759ms` (~11.1% faster). +- Quick profiling notes: state creation cost exists but warmed `createState()` overhead looks much smaller than total benchmark time, so the hottest path appears closer to chunk loading / execution than to engine construction alone. diff --git a/src/module.ts b/src/module.ts index 5b694b2..6970a32 100755 --- a/src/module.ts +++ b/src/module.ts @@ -28,6 +28,8 @@ interface LuaEmscriptenModule extends EmscriptenModule { UTF8ToString: typeof UTF8ToString ENV: EnvironmentVariables _realloc: (pointer: number, size: number) => number + _lua_callk?: (L: LuaState, nargs: number, nresults: number, ctx: number, k: number) => void + _lua_pcallk?: (L: LuaState, nargs: number, nresults: number, errfunc: number, ctx: number, k: number) => number } interface ReferenceMetadata { @@ -330,7 +332,25 @@ export default class LuaModule { this.luaL_unref = this.cwrap('luaL_unref', null, ['number', 'number', 'number']) this.luaL_loadfilex = this.cwrap('luaL_loadfilex', 'number', ['number', 'string', 'string']) this.luaL_loadbufferx = this.cwrap('luaL_loadbufferx', 'number', ['number', 'string|number', 'number', 'string|number', 'string']) - this.luaL_loadstring = this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) + const luaLLoadString = this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) + this.luaL_loadstring = 
(L, s) => { + if (typeof s === 'number' || s === null) { + return luaLLoadString(L, s) + } + + const size = this._emscripten.lengthBytesUTF8(s) + if (size <= 1024) { + return luaLLoadString(L, s) + } + + const bufferPointer = this._emscripten._malloc(size + 1) + try { + this._emscripten.stringToUTF8(s, bufferPointer, size + 1) + return this.luaL_loadbufferx(L, bufferPointer, size, bufferPointer, null) + } finally { + this._emscripten._free(bufferPointer) + } + } this.luaL_newstate = this.cwrap('luaL_newstate', 'number', []) this.luaL_len = this.cwrap('luaL_len', 'number', ['number', 'number']) this.luaL_addgsub = this.cwrap('luaL_addgsub', null, ['number', 'string', 'string', 'string']) @@ -411,8 +431,12 @@ export default class LuaModule { this.lua_rawsetp = this.cwrap('lua_rawsetp', null, ['number', 'number', 'number']) this.lua_setmetatable = this.cwrap('lua_setmetatable', 'number', ['number', 'number']) this.lua_setiuservalue = this.cwrap('lua_setiuservalue', 'number', ['number', 'number', 'number']) - this.lua_callk = this.cwrap('lua_callk', null, ['number', 'number', 'number', 'number', 'number']) - this.lua_pcallk = this.cwrap('lua_pcallk', 'number', ['number', 'number', 'number', 'number', 'number', 'number']) + this.lua_callk = module._lua_callk + ? (L, nargs, nresults, ctx, k) => module._lua_callk!(L, nargs, nresults, ctx, k ?? 0) + : this.cwrap('lua_callk', null, ['number', 'number', 'number', 'number', 'number']) + this.lua_pcallk = module._lua_pcallk + ? (L, nargs, nresults, errfunc, ctx, k) => module._lua_pcallk!(L, nargs, nresults, errfunc, ctx, k ?? 
0) + : this.cwrap('lua_pcallk', 'number', ['number', 'number', 'number', 'number', 'number', 'number']) this.lua_load = this.cwrap('lua_load', 'number', ['number', 'number', 'number', 'string', 'string']) this.lua_dump = this.cwrap('lua_dump', 'number', ['number', 'number', 'number', 'number']) this.lua_yieldk = this.cwrap('lua_yieldk', 'number', ['number', 'number', 'number', 'number']) From 9a1c9e4d8f7b87eabbfb4584e23e481009621efa Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 21:04:16 -0300 Subject: [PATCH 5/9] Call exported wasm luaL_loadstring directly with a manually allocated UTF-8 buffer, avoiding ccall overhead on the hot chunk-load path\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":11.751988} --- autoresearch.jsonl | 2 ++ src/module.ts | 35 ++++++++++++++++------------------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 11b71b9..9f9427c 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -2,3 +2,5 @@ {"run":1,"commit":"0411406","metric":14.775872,"metrics":{},"status":"keep","description":"Baseline: focused heapsort benchmark via autoresearch.sh (build + fresh state + loadstring + execute returned function)","timestamp":1773446121099,"segment":0} {"run":2,"commit":"9c864c7","metric":13.135759,"metrics":{},"status":"keep","description":"Bind luaL_loadstring with string|number so long Lua chunks use a direct UTF-8 buffer path instead of generic ccall string marshaling","timestamp":1773446209778,"segment":0} {"run":3,"commit":"9c864c7","metric":13.838148,"metrics":{},"status":"discard","description":"Tried routing long luaL_loadstring calls through luaL_loadbufferx to avoid C strlen, but the extra JS-side length/allocation work regressed the heapsort benchmark","timestamp":1773446266277,"segment":0} +{"run":4,"commit":"085fc65","metric":12.001671,"metrics":{},"status":"keep","description":"Call exported wasm functions directly for lua_callk/lua_pcallk 
when available, bypassing ccall on hot numeric-only call paths","timestamp":1773446314857,"segment":0} +{"run":5,"commit":"085fc65","metric":15.1674,"metrics":{},"status":"discard","description":"Tried broader direct-export bindings for state/stack helpers (luaL_newstate/lua_close/lua_newthread/lua_absindex/lua_gettop/lua_settop/lua_rotate), but it regressed badly and increased variance","timestamp":1773446391365,"segment":0} diff --git a/src/module.ts b/src/module.ts index 6970a32..96d294e 100755 --- a/src/module.ts +++ b/src/module.ts @@ -30,6 +30,7 @@ interface LuaEmscriptenModule extends EmscriptenModule { _realloc: (pointer: number, size: number) => number _lua_callk?: (L: LuaState, nargs: number, nresults: number, ctx: number, k: number) => void _lua_pcallk?: (L: LuaState, nargs: number, nresults: number, errfunc: number, ctx: number, k: number) => number + _luaL_loadstring?: (L: LuaState, s: number) => LuaReturn } interface ReferenceMetadata { @@ -332,25 +333,21 @@ export default class LuaModule { this.luaL_unref = this.cwrap('luaL_unref', null, ['number', 'number', 'number']) this.luaL_loadfilex = this.cwrap('luaL_loadfilex', 'number', ['number', 'string', 'string']) this.luaL_loadbufferx = this.cwrap('luaL_loadbufferx', 'number', ['number', 'string|number', 'number', 'string|number', 'string']) - const luaLLoadString = this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) - this.luaL_loadstring = (L, s) => { - if (typeof s === 'number' || s === null) { - return luaLLoadString(L, s) - } - - const size = this._emscripten.lengthBytesUTF8(s) - if (size <= 1024) { - return luaLLoadString(L, s) - } - - const bufferPointer = this._emscripten._malloc(size + 1) - try { - this._emscripten.stringToUTF8(s, bufferPointer, size + 1) - return this.luaL_loadbufferx(L, bufferPointer, size, bufferPointer, null) - } finally { - this._emscripten._free(bufferPointer) - } - } + const luaLLoadString = module._luaL_loadstring + ? 
(L: LuaState, s: string | number | null) => { + if (typeof s === 'number' || s === null) { + return module._luaL_loadstring!(L, s ?? 0) + } + + const bufferPointer = this._emscripten.stringToNewUTF8(s) + try { + return module._luaL_loadstring!(L, bufferPointer) + } finally { + this._emscripten._free(bufferPointer) + } + } + : this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) + this.luaL_loadstring = (L, s) => luaLLoadString(L, s) this.luaL_newstate = this.cwrap('luaL_newstate', 'number', []) this.luaL_len = this.cwrap('luaL_len', 'number', ['number', 'number']) this.luaL_addgsub = this.cwrap('luaL_addgsub', null, ['number', 'string', 'string', 'string']) From 46d08e6b759a56fda497fa581eece2b9ceed7fb9 Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 22:04:01 -0300 Subject: [PATCH 6/9] Retarget autoresearch to wasm build optimization --- autoresearch.ideas.md | 2 ++ autoresearch.md | 43 +++++++++++++++++++++---------------------- autoresearch.sh | 30 ++++++++++++++++++++++++++++-- 3 files changed, 51 insertions(+), 24 deletions(-) create mode 100644 autoresearch.ideas.md diff --git a/autoresearch.ideas.md b/autoresearch.ideas.md new file mode 100644 index 0000000..18bdea2 --- /dev/null +++ b/autoresearch.ideas.md @@ -0,0 +1,2 @@ +- Investigate a dedicated internal fast path that batches `luaL_loadstring` + first `lua_callk` into one exported helper on the wasm side. This could remove a JS↔wasm roundtrip, but it touches off-limits C/wasm internals for this session. +- Investigate Lua/emscripten build-level optimizations (compile flags, LTO, allocator choices) for state creation and bytecode execution. Promising, but also off-limits for this session. 
diff --git a/autoresearch.md b/autoresearch.md index b5b0927..d46dc1e 100644 --- a/autoresearch.md +++ b/autoresearch.md @@ -1,38 +1,37 @@ -# Autoresearch: reduce Wasmoon heapsort benchmark time +# Autoresearch: optimize compiled Lua/wasm runtime for Wasmoon heapsort ## Objective -Optimize the Wasmoon runtime path used by the plain heapsort benchmark: load the Lua module once, create a fresh state per iteration, load `bench/heapsort.lua`, execute it, and call the returned function. The goal is to reduce average runtime for this benchmark on the current machine. +Optimize the runtime performance of the compiled Lua WebAssembly build used by Wasmoon on the focused heapsort benchmark. The workload is: build the wasm, bundle the JS bridge, load the Lua module once, create a fresh state per iteration, load `bench/heapsort.lua`, execute it, and call the returned function. The goal is to reduce benchmark runtime without cheating by changing benchmark semantics. ## Metrics - **Primary**: wasmoon_heapsort_avg_ms (ms, lower is better) -- **Secondary**: wasmoon_heapsort_stddev_ms, iterations, warmup +- **Secondary**: wasmoon_heapsort_stddev_ms, wasm_build_seconds, glue_wasm_kb, iterations, warmup ## How to Run -`./autoresearch.sh` — builds the project, runs a focused benchmark, and prints `METRIC name=value` lines. +`./autoresearch.sh` — rebuilds the wasm/runtime, rebuilds JS, runs the focused benchmark, and prints `METRIC name=value` lines. 
## Files in Scope -- `src/module.ts` — JS↔C binding wrappers and helper utilities around `ccall` -- `src/thread.ts` — stack operations, string loading, execution helpers -- `src/global.ts` — state creation and global helpers -- `src/engine.ts` — engine setup and state lifecycle -- `src/type-extensions/*.ts` — only if profiling suggests extension registration / value conversion overhead matters -- `bench/heapsort.lua` — benchmark workload, read-only unless a benchmark bug is found -- `autoresearch.sh` — benchmark driver -- `autoresearch.md` — session state and findings -- `autoresearch.ideas.md` — backlog for promising ideas +- `utils/build-wasm.sh` — emcc flags, exported symbols, runtime settings, allocator, optimization knobs +- `utils/build-wasm.js` — wasm build launcher / Docker fallback +- `lua/*.c` / `lua/*.h` — Lua runtime implementation, only for broadly justifiable runtime improvements +- `rolldown.config.ts` — only if wasm packaging/bundling materially affects runtime loading behavior +- `src/module.ts` / `src/*.ts` — only if needed to adapt to safe wasm-build changes +- `autoresearch.sh` — benchmark driver for this session +- `autoresearch.md` — session context +- `autoresearch.ideas.md` — deferred ideas ## Off Limits -- `lua/` C sources and wasm build artifacts for this session -- public API behavior changes unless benchmark gains are substantial and correctness is preserved -- new dependencies +- Benchmark workload semantics in `bench/heapsort.lua` +- Fake optimizations that skip work, cache results across iterations, or otherwise cheat the benchmark +- New dependencies ## Constraints - Keep benchmark semantics the same: fresh state, load heapsort script, execute returned function -- No new dependencies -- Prefer simple changes with measurable wins -- Avoid benchmark-only cheats that would not help real users +- No benchmark-only cheating or semantic shortcuts +- Prefer broadly useful speedups over highly workload-specific tricks +- Avoid changing 
public API behavior unless clearly safe ## What's Been Tried -- Baseline from `./autoresearch.sh`: `14.775872ms` average over 60 iterations / 8 warmup. -- `src/module.ts`: changed `luaL_loadstring` binding from `['number', 'string']` to `['number', 'string|number']` so large chunks can use the optimized direct-buffer path in `cwrap`. This improved the benchmark to `13.135759ms` (~11.1% faster). -- Quick profiling notes: state creation cost exists but warmed `createState()` overhead looks much smaller than total benchmark time, so the hottest path appears closer to chunk loading / execution than to engine construction alone. +- Previous JS-glue-focused session got the benchmark from `14.775872ms` to `11.751988ms` by reducing JS↔wasm overhead (`lua_callk`/`lua_pcallk` raw exports and direct exported `luaL_loadstring`). +- Profiling after those wins showed the remaining time is dominated by Lua execution itself, so wasm/compiler/runtime changes are now the most promising path. +- Deferred ideas from the prior session: batched wasm-side helpers and build-level optimization tuning. This session focuses on the latter first. 
diff --git a/autoresearch.sh b/autoresearch.sh index 8ac09de..5a7f4a4 100755 --- a/autoresearch.sh +++ b/autoresearch.sh @@ -1,13 +1,37 @@ #!/bin/bash set -euo pipefail +build_start=$(python3 - <<'PY' +import time +print(time.time()) +PY +) + +npm run build:wasm >/dev/null npm run build >/dev/null -node --input-type=module <<'EOF' +build_end=$(python3 - <<'PY' +import time +print(time.time()) +PY +) + +wasm_build_seconds=$(python3 - < Date: Fri, 13 Mar 2026 22:05:34 -0300 Subject: [PATCH 7/9] Baseline for wasm-build-focused session: default release emcc flags rebuilt from scratch, then focused heapsort benchmark\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":12.639066} --- autoresearch.jsonl | 17 +++++++++++++++++ autoresearch.sh | 6 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 9f9427c..475d13c 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -4,3 +4,20 @@ {"run":3,"commit":"9c864c7","metric":13.838148,"metrics":{},"status":"discard","description":"Tried routing long luaL_loadstring calls through luaL_loadbufferx to avoid C strlen, but the extra JS-side length/allocation work regressed the heapsort benchmark","timestamp":1773446266277,"segment":0} {"run":4,"commit":"085fc65","metric":12.001671,"metrics":{},"status":"keep","description":"Call exported wasm functions directly for lua_callk/lua_pcallk when available, bypassing ccall on hot numeric-only call paths","timestamp":1773446314857,"segment":0} {"run":5,"commit":"085fc65","metric":15.1674,"metrics":{},"status":"discard","description":"Tried broader direct-export bindings for state/stack helpers (luaL_newstate/lua_close/lua_newthread/lua_absindex/lua_gettop/lua_settop/lua_rotate), but it regressed badly and increased variance","timestamp":1773446391365,"segment":0} +{"run":6,"commit":"9a1c9e4","metric":11.751988,"metrics":{},"status":"keep","description":"Call exported wasm luaL_loadstring directly with a manually allocated 
UTF-8 buffer, avoiding ccall overhead on the hot chunk-load path","timestamp":1773446656315,"segment":0} +{"run":7,"commit":"9a1c9e4","metric":11.952811,"metrics":{},"status":"discard","description":"Tried direct-export binding for luaL_openselectedlibs to cut state setup overhead, but the heapsort benchmark regressed slightly","timestamp":1773446733027,"segment":0} +{"run":8,"commit":"9a1c9e4","metric":11.838151,"metrics":{},"status":"discard","description":"Tried caching the most recent UTF-8 buffer for luaL_loadstring to avoid repeated string encoding, but it did not beat the current best on the heapsort workload","timestamp":1773446800152,"segment":0} +{"run":9,"commit":"9a1c9e4","metric":12.8011,"metrics":{},"status":"discard","description":"Tried direct-export binding for lua_close, but closing states through the raw export regressed the heapsort benchmark noticeably","timestamp":1773446842368,"segment":0} +{"run":10,"commit":"9a1c9e4","metric":12.570699,"metrics":{},"status":"discard","description":"Tried lazily creating FunctionTypeExtension callback threads to reduce state setup work, but the heapsort benchmark regressed","timestamp":1773446931224,"segment":0} +{"run":11,"commit":"9a1c9e4","metric":13.353136,"metrics":{},"status":"discard","description":"Tried lazy-instantiating type extensions so plain states avoid eager metatable/setup work, but the heapsort benchmark regressed significantly","timestamp":1773447072737,"segment":0} +{"run":12,"commit":"9a1c9e4","metric":13.977992,"metrics":{},"status":"discard","description":"Tried calling exported wasm luaL_newstate directly to cut state-creation ccall overhead, but it regressed the heapsort benchmark","timestamp":1773447131145,"segment":0} +{"run":13,"commit":"9a1c9e4","metric":14.443619,"metrics":{},"status":"discard","description":"Tried implementing luaL_loadstring via direct exported luaL_loadbufferx with JS-computed byte length to avoid C strlen, but the extra JS-side encoding/allocation cost 
regressed badly","timestamp":1773447207802,"segment":0} +{"run":14,"commit":"9a1c9e4","metric":12.655008,"metrics":{},"status":"discard","description":"Tried reusing a heap buffer for direct luaL_loadstring UTF-8 encoding to avoid malloc/free churn, but it still regressed versus the current best","timestamp":1773447268974,"segment":0} +{"run":15,"commit":"9a1c9e4","metric":14.450232,"metrics":{},"status":"discard","description":"Tried assigning raw wasm exports directly for lua_callk/lua_pcallk and relying on JS null-to-zero coercion, but it regressed sharply despite being functionally correct","timestamp":1773447315203,"segment":0} +{"run":16,"commit":"9a1c9e4","metric":13.518631,"metrics":{},"status":"discard","description":"Tried specializing lua_callk/lua_pcallk wrappers for the common no-continuation case by always passing k=0, but that still regressed on the heapsort workload","timestamp":1773447358877,"segment":0} +{"run":17,"commit":"9a1c9e4","metric":14.026144,"metrics":{},"status":"discard","description":"Tried hoisting raw wasm export references into local constants inside wrapper setup to cut per-call property lookups, but it regressed substantially","timestamp":1773447416704,"segment":0} +{"run":18,"commit":"9a1c9e4","metric":14.448888,"metrics":{},"status":"discard","description":"Tried insertion-ordered type extension registration to avoid sorting on each createState, but it regressed sharply on the heapsort benchmark","timestamp":1773447523603,"segment":0} +{"run":19,"commit":"9a1c9e4","metric":14.363467,"metrics":{},"status":"discard","description":"Tried an ASCII fast path for direct exported luaL_loadstring using manual malloc/stringToUTF8 to avoid stringToNewUTF8 work, but it regressed badly","timestamp":1773447568437,"segment":0} +{"run":20,"commit":"9a1c9e4","metric":14.243077,"metrics":{},"status":"discard","description":"Tried hoisting only the raw exported luaL_loadstring reference into a local constant to reduce property lookups, but it 
still regressed heavily","timestamp":1773447607620,"segment":0} +{"type":"config","name":"Optimize compiled Lua/wasm runtime for Wasmoon heapsort","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} +{"run":1,"commit":"46d08e6","metric":0,"metrics":{},"status":"crash","description":"Broken baseline attempt after retargeting autoresearch: shell expanded JS template literals inside autoresearch.sh and prevented metric emission","timestamp":1773450278554,"segment":0} diff --git a/autoresearch.sh b/autoresearch.sh index 5a7f4a4..4f08162 100755 --- a/autoresearch.sh +++ b/autoresearch.sh @@ -27,7 +27,7 @@ print(round(Path('build/glue.wasm').stat().st_size / 1024, 3)) PY ) -node --input-type=module < Date: Fri, 13 Mar 2026 22:07:34 -0300 Subject: [PATCH 8/9] Switch release wasm build from -O3 to -O2; this reduced runtime, build time, and wasm size on the heapsort workload\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":11.794092} --- autoresearch.jsonl | 2 ++ utils/build-wasm.sh | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 475d13c..1b64dc6 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -21,3 +21,5 @@ {"run":20,"commit":"9a1c9e4","metric":14.243077,"metrics":{},"status":"discard","description":"Tried hoisting only the raw exported luaL_loadstring reference into a local constant to reduce property lookups, but it still regressed heavily","timestamp":1773447607620,"segment":0} {"type":"config","name":"Optimize compiled Lua/wasm runtime for Wasmoon heapsort","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} {"run":1,"commit":"46d08e6","metric":0,"metrics":{},"status":"crash","description":"Broken baseline attempt after retargeting autoresearch: shell expanded JS template literals inside autoresearch.sh and prevented metric emission","timestamp":1773450278554,"segment":0} 
+{"run":2,"commit":"d6332fa","metric":12.639066,"metrics":{},"status":"keep","description":"Baseline for wasm-build-focused session: default release emcc flags rebuilt from scratch, then focused heapsort benchmark","timestamp":1773450334225,"segment":0} +{"run":3,"commit":"d6332fa","metric":13.779871,"metrics":{},"status":"discard","description":"Tried enabling LTO in the release emcc build (-flto), but runtime regressed and wasm size/build time increased substantially","timestamp":1773450410919,"segment":0} diff --git a/utils/build-wasm.sh b/utils/build-wasm.sh index b044209..715d44c 100755 --- a/utils/build-wasm.sh +++ b/utils/build-wasm.sh @@ -9,7 +9,7 @@ if [ "$1" == "dev" ]; then extension="-O0 -g3 -s ASSERTIONS=1 -s SAFE_HEAP=1 -s STACK_OVERFLOW_CHECK=2" else - extension="-O3" + extension="-O2" fi emcc \ From f5982c8b2ebf8a3b3b95ad6eda889b1f63fd135d Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 22:29:58 -0300 Subject: [PATCH 9/9] research --- autoresearch.ideas.md | 5 +++-- autoresearch.jsonl | 17 +++++++++++++++++ autoresearch.md | 7 ++++++- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/autoresearch.ideas.md b/autoresearch.ideas.md index 18bdea2..b95e8a9 100644 --- a/autoresearch.ideas.md +++ b/autoresearch.ideas.md @@ -1,2 +1,3 @@ -- Investigate a dedicated internal fast path that batches `luaL_loadstring` + first `lua_callk` into one exported helper on the wasm side. This could remove a JS↔wasm roundtrip, but it touches off-limits C/wasm internals for this session. -- Investigate Lua/emscripten build-level optimizations (compile flags, LTO, allocator choices) for state creation and bytecode execution. Promising, but also off-limits for this session. +- Validate the `-O2` release build win against additional Lua workloads (interop-heavy, library-heavy, allocation-heavy) before treating it as a universal default. 
Current evidence is strong for heapsort, but exploratory checks already show mixed results versus `-O3` on string-heavy code, so this broader validation remains a high-priority next step. +- Investigate a dedicated internal fast path that batches `luaL_loadstring` + first `lua_callk` into one exported helper on the wasm side. This could remove a JS↔wasm roundtrip, but needs careful validation across non-benchmark workloads. +- If broader validation still points to VM-bound execution, profile opcode mix / hot VM handlers on wasm and revisit targeted `lvm.c` or `ltable.h` changes with real evidence instead of micro-tweaks. diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 1b64dc6..a40d4ad 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -23,3 +23,20 @@ {"run":1,"commit":"46d08e6","metric":0,"metrics":{},"status":"crash","description":"Broken baseline attempt after retargeting autoresearch: shell expanded JS template literals inside autoresearch.sh and prevented metric emission","timestamp":1773450278554,"segment":0} {"run":2,"commit":"d6332fa","metric":12.639066,"metrics":{},"status":"keep","description":"Baseline for wasm-build-focused session: default release emcc flags rebuilt from scratch, then focused heapsort benchmark","timestamp":1773450334225,"segment":0} {"run":3,"commit":"d6332fa","metric":13.779871,"metrics":{},"status":"discard","description":"Tried enabling LTO in the release emcc build (-flto), but runtime regressed and wasm size/build time increased substantially","timestamp":1773450410919,"segment":0}
heapsort","timestamp":1773450490273,"segment":0} +{"run":6,"commit":"16917a0","metric":13.675405,"metrics":{},"status":"discard","description":"Tried -O1 for the release wasm build; it compiled faster but hurt runtime badly and grew the wasm versus -O2","timestamp":1773450519030,"segment":0} +{"run":7,"commit":"16917a0","metric":0,"metrics":{},"status":"crash","description":"Tried dlmalloc instead of emmalloc in the -O2 wasm build; benchmark crashed with out-of-bounds memory access during state teardown","timestamp":1773450580128,"segment":0} +{"run":8,"commit":"16917a0","metric":12.392698,"metrics":{},"status":"discard","description":"Tried adding -fno-exceptions and unwind-table stripping flags on top of -O2, but runtime regressed with no size win","timestamp":1773450656108,"segment":0} +{"run":9,"commit":"16917a0","metric":12.682354,"metrics":{},"status":"discard","description":"Tried -fno-inline-functions on top of -O2 to shrink code and maybe help wasm locality, but runtime regressed noticeably","timestamp":1773450699747,"segment":0} +{"run":10,"commit":"16917a0","metric":13.465365,"metrics":{},"status":"discard","description":"Tried setting INITIAL_MEMORY=32MB while keeping memory growth enabled, but runtime regressed badly on the heapsort workload","timestamp":1773450739008,"segment":0} +{"run":11,"commit":"16917a0","metric":14.395206,"metrics":{},"status":"discard","description":"Tried SUPPORT_LONGJMP=wasm in the -O2 build, but it regressed heavily and increased build overhead","timestamp":1773450788238,"segment":0} +{"run":12,"commit":"16917a0","metric":13.94806,"metrics":{},"status":"discard","description":"Tried adding -DNDEBUG to the -O2 build, but it regressed badly with much higher variance and no size change","timestamp":1773450864416,"segment":0} +{"run":13,"commit":"16917a0","metric":12.710287,"metrics":{},"status":"discard","description":"Tried disabling Lua VM jump tables (-DLUA_USE_JUMPTABLE=0) in the -O2 build; it produced a slightly smaller 
wasm but slower interpreter execution","timestamp":1773450986537,"segment":0} +{"run":14,"commit":"16917a0","metric":13.131162,"metrics":{},"status":"discard","description":"Misconfigured combo run: added no-continuation C helpers for lua_call/lua_pcall while jump tables were still disabled from a prior test; overall result regressed, so discard and rerun cleanly","timestamp":1773451119792,"segment":0} +{"run":15,"commit":"16917a0","metric":14.295386,"metrics":{},"status":"discard","description":"Tried new C helpers for no-continuation lua_call/lua_pcall and bound module.ts to them, but the extra helper layer regressed badly versus direct raw exports","timestamp":1773451151734,"segment":0} +{"run":16,"commit":"16917a0","metric":14.501163,"metrics":{},"status":"discard","description":"Tried adding likely() branch hints to Lua array fastgeti/fastseti in ltable.h, but it regressed badly on the heapsort workload","timestamp":1773451219245,"segment":0} +{"run":17,"commit":"16917a0","metric":12.415745,"metrics":{},"status":"discard","description":"Tried fixed 64MB initial memory with memory growth disabled to reduce allocator/growth overhead, but it regressed and would also tighten memory semantics","timestamp":1773451294439,"segment":0} +{"run":18,"commit":"16917a0","metric":14.639167,"metrics":{},"status":"discard","description":"Tried reordering luaH_fastseti to check the array slot tag before the metatable fast-path test, but it regressed badly on the heapsort workload","timestamp":1773451340612,"segment":0} +{"run":19,"commit":"16917a0","metric":15.054911,"metrics":{},"status":"discard","description":"Tried mimalloc as the wasm allocator on top of -O2; it increased build time and wasm size and regressed runtime badly","timestamp":1773451479476,"segment":0} +{"run":20,"commit":"16917a0","metric":12.867971,"metrics":{},"status":"discard","description":"Tried -O3 -fno-inline-functions as a middle ground between O2 and O3, but it remained slower than the current -O2 
best on heapsort","timestamp":1773451588143,"segment":0} diff --git a/autoresearch.md b/autoresearch.md index d46dc1e..9930b42 100644 --- a/autoresearch.md +++ b/autoresearch.md @@ -34,4 +34,9 @@ Optimize the runtime performance of the compiled Lua WebAssembly build used by W ## What's Been Tried - Previous JS-glue-focused session got the benchmark from `14.775872ms` to `11.751988ms` by reducing JS↔wasm overhead (`lua_callk`/`lua_pcallk` raw exports and direct exported `luaL_loadstring`). - Profiling after those wins showed the remaining time is dominated by Lua execution itself, so wasm/compiler/runtime changes are now the most promising path. -- Deferred ideas from the prior session: batched wasm-side helpers and build-level optimization tuning. This session focuses on the latter first. +- For the rebuilt-from-scratch wasm session, default release build (`-O3`) baseline was `12.639066ms`, `7.585s` wasm build time, `277.404kb` wasm. +- Best wasm-build improvement so far: changing release build from `-O3` to `-O2` improved runtime to `11.794092ms`, while also reducing build time to `6.413s` and wasm size to `274.129kb`. +- Cross-checking outside the primary metric suggests some overfitting risk: on an exploratory numeric-heavy script `-O2` slightly beat `-O3`, but on an exploratory string-heavy script `-O3` beat `-O2`. So `-O2` is a strong win for the heapsort/numeric path, not yet a universally proven default. +- Discarded build-flag experiments: `-flto`, `-Os`, `-O1`, `-DNDEBUG`, `-fno-exceptions`/unwind stripping, `-fno-inline-functions`, `INITIAL_MEMORY=32MB`, `SUPPORT_LONGJMP=wasm`, and fixed 64MB memory without growth. All regressed runtime, and some hurt size/build time or semantics. +- Discarded runtime/source experiments: disabling Lua VM jump tables, adding no-continuation C helpers for `lua_call`/`lua_pcall`, adding likely() hints to array fast paths, and reordering `luaH_fastseti` fast-path checks. All regressed on the benchmark. 
+- Deferred ideas from the prior session: batched wasm-side helpers and build-level optimization tuning. Build-level tuning found a real win (`-O2`), but the remaining promising paths now look more invasive and should be validated against more than one workload to avoid overfitting.