diff --git a/autoresearch.ideas.md b/autoresearch.ideas.md new file mode 100644 index 0000000..b95e8a9 --- /dev/null +++ b/autoresearch.ideas.md @@ -0,0 +1,3 @@ +- Validate the `-O2` release build win against additional Lua workloads (interop-heavy, library-heavy, allocation-heavy) before treating it as a universal default. Current evidence is strong for heapsort, but exploratory checks already show mixed results versus `-O3` on string-heavy code, so this remains highly relevant. +- Investigate a dedicated internal fast path that batches `luaL_loadstring` + first `lua_callk` into one exported helper on the wasm side. This could remove a JS↔wasm roundtrip, but needs careful validation across non-benchmark workloads. +- If broader validation still points to VM-bound execution, profile opcode mix / hot VM handlers on wasm and revisit targeted `lvm.c` or `ltable.h` changes with real evidence instead of micro-tweaks. diff --git a/autoresearch.jsonl b/autoresearch.jsonl new file mode 100644 index 0000000..a40d4ad --- /dev/null +++ b/autoresearch.jsonl @@ -0,0 +1,42 @@ +{"type":"config","name":"Reduce Wasmoon heapsort benchmark time","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} +{"run":1,"commit":"0411406","metric":14.775872,"metrics":{},"status":"keep","description":"Baseline: focused heapsort benchmark via autoresearch.sh (build + fresh state + loadstring + execute returned function)","timestamp":1773446121099,"segment":0} +{"run":2,"commit":"9c864c7","metric":13.135759,"metrics":{},"status":"keep","description":"Bind luaL_loadstring with string|number so long Lua chunks use a direct UTF-8 buffer path instead of generic ccall string marshaling","timestamp":1773446209778,"segment":0} +{"run":3,"commit":"9c864c7","metric":13.838148,"metrics":{},"status":"discard","description":"Tried routing long luaL_loadstring calls through luaL_loadbufferx to avoid C strlen, but the extra JS-side length/allocation work regressed the heapsort benchmark","timestamp":1773446266277,"segment":0} +{"run":4,"commit":"085fc65","metric":12.001671,"metrics":{},"status":"keep","description":"Call exported wasm functions directly for lua_callk/lua_pcallk when available, bypassing ccall on hot numeric-only call paths","timestamp":1773446314857,"segment":0} +{"run":5,"commit":"085fc65","metric":15.1674,"metrics":{},"status":"discard","description":"Tried broader direct-export bindings for state/stack helpers (luaL_newstate/lua_close/lua_newthread/lua_absindex/lua_gettop/lua_settop/lua_rotate), but it regressed badly and increased variance","timestamp":1773446391365,"segment":0} +{"run":6,"commit":"9a1c9e4","metric":11.751988,"metrics":{},"status":"keep","description":"Call exported wasm luaL_loadstring directly with a manually allocated UTF-8 buffer, avoiding ccall overhead on the hot chunk-load path","timestamp":1773446656315,"segment":0} +{"run":7,"commit":"9a1c9e4","metric":11.952811,"metrics":{},"status":"discard","description":"Tried direct-export binding for luaL_openselectedlibs to cut state setup overhead, but the heapsort benchmark regressed slightly","timestamp":1773446733027,"segment":0} +{"run":8,"commit":"9a1c9e4","metric":11.838151,"metrics":{},"status":"discard","description":"Tried caching the most recent UTF-8 buffer for luaL_loadstring to avoid repeated string encoding, but it did not beat the current best on the heapsort workload","timestamp":1773446800152,"segment":0} +{"run":9,"commit":"9a1c9e4","metric":12.8011,"metrics":{},"status":"discard","description":"Tried direct-export binding for lua_close, but closing states through the raw export regressed the heapsort benchmark noticeably","timestamp":1773446842368,"segment":0} +{"run":10,"commit":"9a1c9e4","metric":12.570699,"metrics":{},"status":"discard","description":"Tried lazily creating FunctionTypeExtension callback threads to reduce state setup work, but the heapsort benchmark regressed","timestamp":1773446931224,"segment":0} +{"run":11,"commit":"9a1c9e4","metric":13.353136,"metrics":{},"status":"discard","description":"Tried lazy-instantiating type extensions so plain states avoid eager metatable/setup work, but the heapsort benchmark regressed significantly","timestamp":1773447072737,"segment":0} +{"run":12,"commit":"9a1c9e4","metric":13.977992,"metrics":{},"status":"discard","description":"Tried calling exported wasm luaL_newstate directly to cut state-creation ccall overhead, but it regressed the heapsort benchmark","timestamp":1773447131145,"segment":0} +{"run":13,"commit":"9a1c9e4","metric":14.443619,"metrics":{},"status":"discard","description":"Tried implementing luaL_loadstring via direct exported luaL_loadbufferx with JS-computed byte length to avoid C strlen, but the extra JS-side encoding/allocation cost regressed badly","timestamp":1773447207802,"segment":0} +{"run":14,"commit":"9a1c9e4","metric":12.655008,"metrics":{},"status":"discard","description":"Tried reusing a heap buffer for direct luaL_loadstring UTF-8 encoding to avoid malloc/free churn, but it still regressed versus the current best","timestamp":1773447268974,"segment":0} +{"run":15,"commit":"9a1c9e4","metric":14.450232,"metrics":{},"status":"discard","description":"Tried assigning raw wasm exports directly for lua_callk/lua_pcallk and relying on JS null-to-zero coercion, but it regressed sharply despite being functionally correct","timestamp":1773447315203,"segment":0} +{"run":16,"commit":"9a1c9e4","metric":13.518631,"metrics":{},"status":"discard","description":"Tried specializing lua_callk/lua_pcallk wrappers for the common no-continuation case by always passing k=0, but that still regressed on the heapsort workload","timestamp":1773447358877,"segment":0} +{"run":17,"commit":"9a1c9e4","metric":14.026144,"metrics":{},"status":"discard","description":"Tried hoisting raw wasm export references into local constants inside wrapper setup to cut per-call property lookups, but it regressed substantially","timestamp":1773447416704,"segment":0} +{"run":18,"commit":"9a1c9e4","metric":14.448888,"metrics":{},"status":"discard","description":"Tried insertion-ordered type extension registration to avoid sorting on each createState, but it regressed sharply on the heapsort benchmark","timestamp":1773447523603,"segment":0} +{"run":19,"commit":"9a1c9e4","metric":14.363467,"metrics":{},"status":"discard","description":"Tried an ASCII fast path for direct exported luaL_loadstring using manual malloc/stringToUTF8 to avoid stringToNewUTF8 work, but it regressed badly","timestamp":1773447568437,"segment":0} +{"run":20,"commit":"9a1c9e4","metric":14.243077,"metrics":{},"status":"discard","description":"Tried hoisting only the raw exported luaL_loadstring reference into a local constant to reduce property lookups, but it still regressed heavily","timestamp":1773447607620,"segment":0} +{"type":"config","name":"Optimize compiled Lua/wasm runtime for Wasmoon heapsort","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} +{"run":1,"commit":"46d08e6","metric":0,"metrics":{},"status":"crash","description":"Broken baseline attempt after retargeting autoresearch: shell expanded JS template literals inside autoresearch.sh and prevented metric emission","timestamp":1773450278554,"segment":0} +{"run":2,"commit":"d6332fa","metric":12.639066,"metrics":{},"status":"keep","description":"Baseline for wasm-build-focused session: default release emcc flags rebuilt from scratch, then focused heapsort benchmark","timestamp":1773450334225,"segment":0} +{"run":3,"commit":"d6332fa","metric":13.779871,"metrics":{},"status":"discard","description":"Tried enabling LTO in the release emcc build (-flto), but runtime regressed and wasm size/build time increased substantially","timestamp":1773450410919,"segment":0} +{"run":4,"commit":"16917a0","metric":11.794092,"metrics":{},"status":"keep","description":"Switch release wasm build from -O3 to -O2; this reduced runtime, build time, and wasm size on the heapsort workload","timestamp":1773450454889,"segment":0} +{"run":5,"commit":"16917a0","metric":12.071492,"metrics":{},"status":"discard","description":"Tried -Os for the release wasm build; it shrank the binary a lot but was slower than the -O2 build on heapsort","timestamp":1773450490273,"segment":0} +{"run":6,"commit":"16917a0","metric":13.675405,"metrics":{},"status":"discard","description":"Tried -O1 for the release wasm build; it compiled faster but hurt runtime badly and grew the wasm versus -O2","timestamp":1773450519030,"segment":0} +{"run":7,"commit":"16917a0","metric":0,"metrics":{},"status":"crash","description":"Tried dlmalloc instead of emmalloc in the -O2 wasm build; benchmark crashed with out-of-bounds memory access during state teardown","timestamp":1773450580128,"segment":0} +{"run":8,"commit":"16917a0","metric":12.392698,"metrics":{},"status":"discard","description":"Tried adding -fno-exceptions and unwind-table stripping flags on top of -O2, but runtime regressed with no size win","timestamp":1773450656108,"segment":0} +{"run":9,"commit":"16917a0","metric":12.682354,"metrics":{},"status":"discard","description":"Tried -fno-inline-functions on top of -O2 to shrink code and maybe help wasm locality, but runtime regressed noticeably","timestamp":1773450699747,"segment":0} +{"run":10,"commit":"16917a0","metric":13.465365,"metrics":{},"status":"discard","description":"Tried setting INITIAL_MEMORY=32MB while keeping memory growth enabled, but runtime regressed badly on the heapsort workload","timestamp":1773450739008,"segment":0} +{"run":11,"commit":"16917a0","metric":14.395206,"metrics":{},"status":"discard","description":"Tried SUPPORT_LONGJMP=wasm in the -O2 build, but it regressed heavily and increased build overhead","timestamp":1773450788238,"segment":0} +{"run":12,"commit":"16917a0","metric":13.94806,"metrics":{},"status":"discard","description":"Tried adding -DNDEBUG to the -O2 build, but it regressed badly with much higher variance and no size change","timestamp":1773450864416,"segment":0} +{"run":13,"commit":"16917a0","metric":12.710287,"metrics":{},"status":"discard","description":"Tried disabling Lua VM jump tables (-DLUA_USE_JUMPTABLE=0) in the -O2 build; it produced a slightly smaller wasm but slower interpreter execution","timestamp":1773450986537,"segment":0} +{"run":14,"commit":"16917a0","metric":13.131162,"metrics":{},"status":"discard","description":"Misconfigured combo run: added no-continuation C helpers for lua_call/lua_pcall while jump tables were still disabled from a prior test; overall result regressed, so discard and rerun cleanly","timestamp":1773451119792,"segment":0} +{"run":15,"commit":"16917a0","metric":14.295386,"metrics":{},"status":"discard","description":"Tried new C helpers for no-continuation lua_call/lua_pcall and bound module.ts to them, but the extra helper layer regressed badly versus direct raw exports","timestamp":1773451151734,"segment":0} +{"run":16,"commit":"16917a0","metric":14.501163,"metrics":{},"status":"discard","description":"Tried adding likely() branch hints to Lua array fastgeti/fastseti in ltable.h, but it regressed badly on the heapsort workload","timestamp":1773451219245,"segment":0} +{"run":17,"commit":"16917a0","metric":12.415745,"metrics":{},"status":"discard","description":"Tried fixed 64MB initial memory with memory growth disabled to reduce allocator/growth overhead, but it regressed and would also tighten memory semantics","timestamp":1773451294439,"segment":0} +{"run":18,"commit":"16917a0","metric":14.639167,"metrics":{},"status":"discard","description":"Tried reordering luaH_fastseti to check the array slot tag before the metatable fast-path test, but it regressed badly on the heapsort workload","timestamp":1773451340612,"segment":0} +{"run":19,"commit":"16917a0","metric":15.054911,"metrics":{},"status":"discard","description":"Tried mimalloc as the wasm allocator on top of -O2; it increased build time and wasm size and regressed runtime badly","timestamp":1773451479476,"segment":0} +{"run":20,"commit":"16917a0","metric":12.867971,"metrics":{},"status":"discard","description":"Tried -O3 -fno-inline-functions as a middle ground between O2 and O3, but it remained slower than the current -O2 best on heapsort","timestamp":1773451588143,"segment":0} diff --git a/autoresearch.md b/autoresearch.md new file mode 100644 index 0000000..9930b42 --- /dev/null +++ b/autoresearch.md @@ -0,0 +1,42 @@ +# Autoresearch: optimize compiled Lua/wasm runtime for Wasmoon heapsort + +## Objective +Optimize the runtime performance of the compiled Lua WebAssembly build used by Wasmoon on the focused heapsort benchmark. The workload is: build the wasm, bundle the JS bridge, load the Lua module once, create a fresh state per iteration, load `bench/heapsort.lua`, execute it, and call the returned function. The goal is to reduce benchmark runtime without cheating by changing benchmark semantics. + +## Metrics +- **Primary**: wasmoon_heapsort_avg_ms (ms, lower is better) +- **Secondary**: wasmoon_heapsort_stddev_ms, wasm_build_seconds, glue_wasm_kb, iterations, warmup + +## How to Run +`./autoresearch.sh` — rebuilds the wasm/runtime, rebuilds JS, runs the focused benchmark, and prints `METRIC name=value` lines. + +## Files in Scope +- `utils/build-wasm.sh` — emcc flags, exported symbols, runtime settings, allocator, optimization knobs +- `utils/build-wasm.js` — wasm build launcher / Docker fallback +- `lua/*.c` / `lua/*.h` — Lua runtime implementation, only for broadly justifiable runtime improvements +- `rolldown.config.ts` — only if wasm packaging/bundling materially affects runtime loading behavior +- `src/module.ts` / `src/*.ts` — only if needed to adapt to safe wasm-build changes +- `autoresearch.sh` — benchmark driver for this session +- `autoresearch.md` — session context +- `autoresearch.ideas.md` — deferred ideas + +## Off Limits +- Benchmark workload semantics in `bench/heapsort.lua` +- Fake optimizations that skip work, cache results across iterations, or otherwise cheat the benchmark +- New dependencies + +## Constraints +- Keep benchmark semantics the same: fresh state, load heapsort script, execute returned function +- No benchmark-only cheating or semantic shortcuts +- Prefer broadly useful speedups over highly workload-specific tricks +- Avoid changing public API behavior unless clearly safe + +## What's Been Tried +- Previous JS-glue-focused session got the benchmark from `14.775872ms` to `11.751988ms` by reducing JS↔wasm overhead (`lua_callk`/`lua_pcallk` raw exports and direct exported `luaL_loadstring`). +- Profiling after those wins showed the remaining time is dominated by Lua execution itself, so wasm/compiler/runtime changes are now the most promising path. +- For the rebuilt-from-scratch wasm session, default release build (`-O3`) baseline was `12.639066ms`, `7.585s` wasm build time, `277.404kb` wasm. +- Best wasm-build improvement so far: changing release build from `-O3` to `-O2` improved runtime to `11.794092ms`, while also reducing build time to `6.413s` and wasm size to `274.129kb`. +- Cross-checking outside the primary metric suggests some overfitting risk: on an exploratory numeric-heavy script `-O2` slightly beat `-O3`, but on an exploratory string-heavy script `-O3` beat `-O2`. So `-O2` is a strong win for the heapsort/numeric path, not yet a universally proven default. +- Discarded build-flag experiments: `-flto`, `-Os`, `-O1`, `-DNDEBUG`, `-fno-exceptions`/unwind stripping, `-fno-inline-functions`, `INITIAL_MEMORY=32MB`, `SUPPORT_LONGJMP=wasm`, and fixed 64MB memory without growth. All regressed runtime, and some hurt size/build time or semantics. +- Discarded runtime/source experiments: disabling Lua VM jump tables, adding no-continuation C helpers for `lua_call`/`lua_pcall`, adding likely() hints to array fast paths, and reordering `luaH_fastseti` fast-path checks. All regressed on the benchmark. +- Deferred ideas from the prior session: batched wasm-side helpers and build-level optimization tuning. Build-level tuning found a real win (`-O2`), but the remaining promising paths now look more invasive and should be validated against more than one workload to avoid overfitting. diff --git a/autoresearch.sh b/autoresearch.sh new file mode 100755 index 0000000..4f08162 --- /dev/null +++ b/autoresearch.sh @@ -0,0 +1,77 @@ +#!/bin/bash +set -euo pipefail + +build_start=$(python3 - <<'PY' +import time +print(time.time()) +PY +) + +npm run build:wasm >/dev/null +npm run build >/dev/null + +build_end=$(python3 - <<'PY' +import time +print(time.time()) +PY +) + +wasm_build_seconds=$(python3 - < sum + t, 0) / times.length + const variance = times.reduce((sum, t) => sum + (t - avg) ** 2, 0) / times.length + return { avg, stddev: Math.sqrt(variance) } +} + +const iterations = 60 +const warmup = 8 +const lua = await Lua.load() + +async function runIteration() { + const state = lua.createState() + state.global.lua.luaL_loadstring(state.global.address, heapsort) + state.global.lua.lua_callk(state.global.address, 0, 1, 0, null) + state.global.lua.lua_callk(state.global.address, 0, 0, 0, null) + state.global.close() +} + +for (let i = 0; i < warmup; i++) { + await runIteration() +} + +const times = [] +for (let i = 0; i < iterations; i++) { + const start = performance.now() + await runIteration() + times.push(performance.now() - start) +} + +const { avg, stddev } = stats(times) +console.log(`METRIC wasmoon_heapsort_avg_ms=${avg.toFixed(6)}`) +console.log(`METRIC wasmoon_heapsort_stddev_ms=${stddev.toFixed(6)}`) +console.log(`METRIC wasm_build_seconds=${Number(process.env.WASMOON_WASM_BUILD_SECONDS).toFixed(6)}`) +console.log(`METRIC glue_wasm_kb=${Number(process.env.WASMOON_GLUE_WASM_KB).toFixed(3)}`) +console.log(`METRIC iterations=${iterations}`) +console.log(`METRIC warmup=${warmup}`) +EOF diff --git a/src/module.ts b/src/module.ts index a4bde12..96d294e 100755 --- a/src/module.ts +++ b/src/module.ts @@ -28,6 +28,9 @@ interface LuaEmscriptenModule extends EmscriptenModule { UTF8ToString: typeof UTF8ToString ENV: EnvironmentVariables _realloc: (pointer: number, size: number) => number + _lua_callk?: (L: LuaState, nargs: number, nresults: number, ctx: number, k: number) => void + _lua_pcallk?: (L: LuaState, nargs: number, nresults: number, errfunc: number, ctx: number, k: number) => number + _luaL_loadstring?: (L: LuaState, s: number) => LuaReturn } interface ReferenceMetadata { @@ -330,7 +333,21 @@ export default class LuaModule { this.luaL_unref = this.cwrap('luaL_unref', null, ['number', 'number', 'number']) this.luaL_loadfilex = this.cwrap('luaL_loadfilex', 'number', ['number', 'string', 'string']) this.luaL_loadbufferx = this.cwrap('luaL_loadbufferx', 'number', ['number', 'string|number', 'number', 'string|number', 'string']) - this.luaL_loadstring = this.cwrap('luaL_loadstring', 'number', ['number', 'string']) + const luaLLoadString = module._luaL_loadstring + ? (L: LuaState, s: string | number | null) => { + if (typeof s === 'number' || s === null) { + return module._luaL_loadstring!(L, s ?? 0) + } + + const bufferPointer = this._emscripten.stringToNewUTF8(s) + try { + return module._luaL_loadstring!(L, bufferPointer) + } finally { + this._emscripten._free(bufferPointer) + } + } + : this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) + this.luaL_loadstring = (L, s) => luaLLoadString(L, s) this.luaL_newstate = this.cwrap('luaL_newstate', 'number', []) this.luaL_len = this.cwrap('luaL_len', 'number', ['number', 'number']) this.luaL_addgsub = this.cwrap('luaL_addgsub', null, ['number', 'string', 'string', 'string']) @@ -411,8 +428,12 @@ export default class LuaModule { this.lua_rawsetp = this.cwrap('lua_rawsetp', null, ['number', 'number', 'number']) this.lua_setmetatable = this.cwrap('lua_setmetatable', 'number', ['number', 'number']) this.lua_setiuservalue = this.cwrap('lua_setiuservalue', 'number', ['number', 'number', 'number']) - this.lua_callk = this.cwrap('lua_callk', null, ['number', 'number', 'number', 'number', 'number']) - this.lua_pcallk = this.cwrap('lua_pcallk', 'number', ['number', 'number', 'number', 'number', 'number', 'number']) + this.lua_callk = module._lua_callk + ? (L, nargs, nresults, ctx, k) => module._lua_callk!(L, nargs, nresults, ctx, k ?? 0) + : this.cwrap('lua_callk', null, ['number', 'number', 'number', 'number', 'number']) + this.lua_pcallk = module._lua_pcallk + ? (L, nargs, nresults, errfunc, ctx, k) => module._lua_pcallk!(L, nargs, nresults, errfunc, ctx, k ?? 0) + : this.cwrap('lua_pcallk', 'number', ['number', 'number', 'number', 'number', 'number', 'number']) this.lua_load = this.cwrap('lua_load', 'number', ['number', 'number', 'number', 'string', 'string']) this.lua_dump = this.cwrap('lua_dump', 'number', ['number', 'number', 'number', 'number']) this.lua_yieldk = this.cwrap('lua_yieldk', 'number', ['number', 'number', 'number', 'number']) diff --git a/utils/build-wasm.sh b/utils/build-wasm.sh index b044209..715d44c 100755 --- a/utils/build-wasm.sh +++ b/utils/build-wasm.sh @@ -9,7 +9,7 @@ if [ "$1" == "dev" ]; then extension="-O0 -g3 -s ASSERTIONS=1 -s SAFE_HEAP=1 -s STACK_OVERFLOW_CHECK=2" else - extension="-O3" + extension="-O2" fi emcc \