Skip to content
3 changes: 3 additions & 0 deletions autoresearch.ideas.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- Validate the `-O2` release build win against additional Lua workloads (interop-heavy, library-heavy, allocation-heavy) before treating it as a universal default. Current evidence is strong for heapsort, but exploratory checks already show mixed results versus `-O3` on string-heavy code, so this remains highly relevant.
- Investigate a dedicated internal fast path that batches `luaL_loadstring` + first `lua_callk` into one exported helper on the wasm side. This could remove a JS↔wasm roundtrip, but needs careful validation across non-benchmark workloads.
- If broader validation still points to VM-bound execution, profile opcode mix / hot VM handlers on wasm and revisit targeted `lvm.c` or `ltable.h` changes with real evidence instead of micro-tweaks.
42 changes: 42 additions & 0 deletions autoresearch.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{"type":"config","name":"Reduce Wasmoon heapsort benchmark time","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"}
{"run":1,"commit":"0411406","metric":14.775872,"metrics":{},"status":"keep","description":"Baseline: focused heapsort benchmark via autoresearch.sh (build + fresh state + loadstring + execute returned function)","timestamp":1773446121099,"segment":0}
{"run":2,"commit":"9c864c7","metric":13.135759,"metrics":{},"status":"keep","description":"Bind luaL_loadstring with string|number so long Lua chunks use a direct UTF-8 buffer path instead of generic ccall string marshaling","timestamp":1773446209778,"segment":0}
{"run":3,"commit":"9c864c7","metric":13.838148,"metrics":{},"status":"discard","description":"Tried routing long luaL_loadstring calls through luaL_loadbufferx to avoid C strlen, but the extra JS-side length/allocation work regressed the heapsort benchmark","timestamp":1773446266277,"segment":0}
{"run":4,"commit":"085fc65","metric":12.001671,"metrics":{},"status":"keep","description":"Call exported wasm functions directly for lua_callk/lua_pcallk when available, bypassing ccall on hot numeric-only call paths","timestamp":1773446314857,"segment":0}
{"run":5,"commit":"085fc65","metric":15.1674,"metrics":{},"status":"discard","description":"Tried broader direct-export bindings for state/stack helpers (luaL_newstate/lua_close/lua_newthread/lua_absindex/lua_gettop/lua_settop/lua_rotate), but it regressed badly and increased variance","timestamp":1773446391365,"segment":0}
{"run":6,"commit":"9a1c9e4","metric":11.751988,"metrics":{},"status":"keep","description":"Call exported wasm luaL_loadstring directly with a manually allocated UTF-8 buffer, avoiding ccall overhead on the hot chunk-load path","timestamp":1773446656315,"segment":0}
{"run":7,"commit":"9a1c9e4","metric":11.952811,"metrics":{},"status":"discard","description":"Tried direct-export binding for luaL_openselectedlibs to cut state setup overhead, but the heapsort benchmark regressed slightly","timestamp":1773446733027,"segment":0}
{"run":8,"commit":"9a1c9e4","metric":11.838151,"metrics":{},"status":"discard","description":"Tried caching the most recent UTF-8 buffer for luaL_loadstring to avoid repeated string encoding, but it did not beat the current best on the heapsort workload","timestamp":1773446800152,"segment":0}
{"run":9,"commit":"9a1c9e4","metric":12.8011,"metrics":{},"status":"discard","description":"Tried direct-export binding for lua_close, but closing states through the raw export regressed the heapsort benchmark noticeably","timestamp":1773446842368,"segment":0}
{"run":10,"commit":"9a1c9e4","metric":12.570699,"metrics":{},"status":"discard","description":"Tried lazily creating FunctionTypeExtension callback threads to reduce state setup work, but the heapsort benchmark regressed","timestamp":1773446931224,"segment":0}
{"run":11,"commit":"9a1c9e4","metric":13.353136,"metrics":{},"status":"discard","description":"Tried lazy-instantiating type extensions so plain states avoid eager metatable/setup work, but the heapsort benchmark regressed significantly","timestamp":1773447072737,"segment":0}
{"run":12,"commit":"9a1c9e4","metric":13.977992,"metrics":{},"status":"discard","description":"Tried calling exported wasm luaL_newstate directly to cut state-creation ccall overhead, but it regressed the heapsort benchmark","timestamp":1773447131145,"segment":0}
{"run":13,"commit":"9a1c9e4","metric":14.443619,"metrics":{},"status":"discard","description":"Tried implementing luaL_loadstring via direct exported luaL_loadbufferx with JS-computed byte length to avoid C strlen, but the extra JS-side encoding/allocation cost regressed badly","timestamp":1773447207802,"segment":0}
{"run":14,"commit":"9a1c9e4","metric":12.655008,"metrics":{},"status":"discard","description":"Tried reusing a heap buffer for direct luaL_loadstring UTF-8 encoding to avoid malloc/free churn, but it still regressed versus the current best","timestamp":1773447268974,"segment":0}
{"run":15,"commit":"9a1c9e4","metric":14.450232,"metrics":{},"status":"discard","description":"Tried assigning raw wasm exports directly for lua_callk/lua_pcallk and relying on JS null-to-zero coercion, but it regressed sharply despite being functionally correct","timestamp":1773447315203,"segment":0}
{"run":16,"commit":"9a1c9e4","metric":13.518631,"metrics":{},"status":"discard","description":"Tried specializing lua_callk/lua_pcallk wrappers for the common no-continuation case by always passing k=0, but that still regressed on the heapsort workload","timestamp":1773447358877,"segment":0}
{"run":17,"commit":"9a1c9e4","metric":14.026144,"metrics":{},"status":"discard","description":"Tried hoisting raw wasm export references into local constants inside wrapper setup to cut per-call property lookups, but it regressed substantially","timestamp":1773447416704,"segment":0}
{"run":18,"commit":"9a1c9e4","metric":14.448888,"metrics":{},"status":"discard","description":"Tried insertion-ordered type extension registration to avoid sorting on each createState, but it regressed sharply on the heapsort benchmark","timestamp":1773447523603,"segment":0}
{"run":19,"commit":"9a1c9e4","metric":14.363467,"metrics":{},"status":"discard","description":"Tried an ASCII fast path for direct exported luaL_loadstring using manual malloc/stringToUTF8 to avoid stringToNewUTF8 work, but it regressed badly","timestamp":1773447568437,"segment":0}
{"run":20,"commit":"9a1c9e4","metric":14.243077,"metrics":{},"status":"discard","description":"Tried hoisting only the raw exported luaL_loadstring reference into a local constant to reduce property lookups, but it still regressed heavily","timestamp":1773447607620,"segment":0}
{"type":"config","name":"Optimize compiled Lua/wasm runtime for Wasmoon heapsort","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"}
{"run":1,"commit":"46d08e6","metric":0,"metrics":{},"status":"crash","description":"Broken baseline attempt after retargeting autoresearch: shell expanded JS template literals inside autoresearch.sh and prevented metric emission","timestamp":1773450278554,"segment":0}
{"run":2,"commit":"d6332fa","metric":12.639066,"metrics":{},"status":"keep","description":"Baseline for wasm-build-focused session: default release emcc flags rebuilt from scratch, then focused heapsort benchmark","timestamp":1773450334225,"segment":0}
{"run":3,"commit":"d6332fa","metric":13.779871,"metrics":{},"status":"discard","description":"Tried enabling LTO in the release emcc build (-flto), but runtime regressed and wasm size/build time increased substantially","timestamp":1773450410919,"segment":0}
{"run":4,"commit":"16917a0","metric":11.794092,"metrics":{},"status":"keep","description":"Switch release wasm build from -O3 to -O2; this reduced runtime, build time, and wasm size on the heapsort workload","timestamp":1773450454889,"segment":0}
{"run":5,"commit":"16917a0","metric":12.071492,"metrics":{},"status":"discard","description":"Tried -Os for the release wasm build; it shrank the binary a lot but was slower than the -O2 build on heapsort","timestamp":1773450490273,"segment":0}
{"run":6,"commit":"16917a0","metric":13.675405,"metrics":{},"status":"discard","description":"Tried -O1 for the release wasm build; it compiled faster but hurt runtime badly and grew the wasm versus -O2","timestamp":1773450519030,"segment":0}
{"run":7,"commit":"16917a0","metric":0,"metrics":{},"status":"crash","description":"Tried dlmalloc instead of emmalloc in the -O2 wasm build; benchmark crashed with out-of-bounds memory access during state teardown","timestamp":1773450580128,"segment":0}
{"run":8,"commit":"16917a0","metric":12.392698,"metrics":{},"status":"discard","description":"Tried adding -fno-exceptions and unwind-table stripping flags on top of -O2, but runtime regressed with no size win","timestamp":1773450656108,"segment":0}
{"run":9,"commit":"16917a0","metric":12.682354,"metrics":{},"status":"discard","description":"Tried -fno-inline-functions on top of -O2 to shrink code and maybe help wasm locality, but runtime regressed noticeably","timestamp":1773450699747,"segment":0}
{"run":10,"commit":"16917a0","metric":13.465365,"metrics":{},"status":"discard","description":"Tried setting INITIAL_MEMORY=32MB while keeping memory growth enabled, but runtime regressed badly on the heapsort workload","timestamp":1773450739008,"segment":0}
{"run":11,"commit":"16917a0","metric":14.395206,"metrics":{},"status":"discard","description":"Tried SUPPORT_LONGJMP=wasm in the -O2 build, but it regressed heavily and increased build overhead","timestamp":1773450788238,"segment":0}
{"run":12,"commit":"16917a0","metric":13.94806,"metrics":{},"status":"discard","description":"Tried adding -DNDEBUG to the -O2 build, but it regressed badly with much higher variance and no size change","timestamp":1773450864416,"segment":0}
{"run":13,"commit":"16917a0","metric":12.710287,"metrics":{},"status":"discard","description":"Tried disabling Lua VM jump tables (-DLUA_USE_JUMPTABLE=0) in the -O2 build; it produced a slightly smaller wasm but slower interpreter execution","timestamp":1773450986537,"segment":0}
{"run":14,"commit":"16917a0","metric":13.131162,"metrics":{},"status":"discard","description":"Misconfigured combo run: added no-continuation C helpers for lua_call/lua_pcall while jump tables were still disabled from a prior test; overall result regressed, so discard and rerun cleanly","timestamp":1773451119792,"segment":0}
{"run":15,"commit":"16917a0","metric":14.295386,"metrics":{},"status":"discard","description":"Tried new C helpers for no-continuation lua_call/lua_pcall and bound module.ts to them, but the extra helper layer regressed badly versus direct raw exports","timestamp":1773451151734,"segment":0}
{"run":16,"commit":"16917a0","metric":14.501163,"metrics":{},"status":"discard","description":"Tried adding likely() branch hints to Lua array fastgeti/fastseti in ltable.h, but it regressed badly on the heapsort workload","timestamp":1773451219245,"segment":0}
{"run":17,"commit":"16917a0","metric":12.415745,"metrics":{},"status":"discard","description":"Tried fixed 64MB initial memory with memory growth disabled to reduce allocator/growth overhead, but it regressed and would also tighten memory semantics","timestamp":1773451294439,"segment":0}
{"run":18,"commit":"16917a0","metric":14.639167,"metrics":{},"status":"discard","description":"Tried reordering luaH_fastseti to check the array slot tag before the metatable fast-path test, but it regressed badly on the heapsort workload","timestamp":1773451340612,"segment":0}
{"run":19,"commit":"16917a0","metric":15.054911,"metrics":{},"status":"discard","description":"Tried mimalloc as the wasm allocator on top of -O2; it increased build time and wasm size and regressed runtime badly","timestamp":1773451479476,"segment":0}
{"run":20,"commit":"16917a0","metric":12.867971,"metrics":{},"status":"discard","description":"Tried -O3 -fno-inline-functions as a middle ground between O2 and O3, but it remained slower than the current -O2 best on heapsort","timestamp":1773451588143,"segment":0}
42 changes: 42 additions & 0 deletions autoresearch.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Autoresearch: optimize compiled Lua/wasm runtime for Wasmoon heapsort

## Objective
Optimize the runtime performance of the compiled Lua WebAssembly build used by Wasmoon on the focused heapsort benchmark. The workload is: build the wasm, bundle the JS bridge, load the Lua module once, create a fresh state per iteration, load `bench/heapsort.lua`, execute it, and call the returned function. The goal is to reduce benchmark runtime without cheating (for example, by changing the benchmark's semantics).

## Metrics
- **Primary**: wasmoon_heapsort_avg_ms (ms, lower is better)
- **Secondary**: wasmoon_heapsort_stddev_ms, wasm_build_seconds, glue_wasm_kb, iterations, warmup

## How to Run
`./autoresearch.sh` — rebuilds the wasm/runtime, rebuilds JS, runs the focused benchmark, and prints `METRIC name=value` lines.

## Files in Scope
- `utils/build-wasm.sh` — emcc flags, exported symbols, runtime settings, allocator, optimization knobs
- `utils/build-wasm.js` — wasm build launcher / Docker fallback
- `lua/*.c` / `lua/*.h` — Lua runtime implementation, only for broadly justifiable runtime improvements
- `rolldown.config.ts` — only if wasm packaging/bundling materially affects runtime loading behavior
- `src/module.ts` / `src/*.ts` — only if needed to adapt to safe wasm-build changes
- `autoresearch.sh` — benchmark driver for this session
- `autoresearch.md` — session context
- `autoresearch.ideas.md` — deferred ideas

## Off Limits
- Benchmark workload semantics in `bench/heapsort.lua`
- Fake optimizations that skip work, cache results across iterations, or otherwise cheat the benchmark
- New dependencies

## Constraints
- Keep benchmark semantics the same: fresh state, load heapsort script, execute returned function
- No benchmark-only cheating or semantic shortcuts
- Prefer broadly useful speedups over highly workload-specific tricks
- Avoid changing public API behavior unless clearly safe

## What's Been Tried
- Previous JS-glue-focused session got the benchmark from `14.775872ms` to `11.751988ms` by reducing JS↔wasm overhead (`lua_callk`/`lua_pcallk` raw exports and direct exported `luaL_loadstring`).
- Profiling after those wins showed the remaining time is dominated by Lua execution itself, so wasm/compiler/runtime changes are now the most promising path.
- For the rebuilt-from-scratch wasm session, default release build (`-O3`) baseline was `12.639066ms`, `7.585s` wasm build time, `277.404kb` wasm.
- Best wasm-build improvement so far: changing release build from `-O3` to `-O2` improved runtime to `11.794092ms`, while also reducing build time to `6.413s` and wasm size to `274.129kb`.
- Cross-checking outside the primary metric suggests some overfitting risk: on an exploratory numeric-heavy script `-O2` slightly beat `-O3`, but on an exploratory string-heavy script `-O3` beat `-O2`. So `-O2` is a strong win for the heapsort/numeric path, not yet a universally proven default.
- Discarded build-flag experiments: `-flto`, `-Os`, `-O1`, `-DNDEBUG`, `-fno-exceptions`/unwind stripping, `-fno-inline-functions`, `INITIAL_MEMORY=32MB`, `SUPPORT_LONGJMP=wasm`, and fixed 64MB memory without growth. All regressed runtime, and some hurt size/build time or semantics.
- Discarded runtime/source experiments: disabling Lua VM jump tables, adding no-continuation C helpers for `lua_call`/`lua_pcall`, adding likely() hints to array fast paths, and reordering `luaH_fastseti` fast-path checks. All regressed on the benchmark.
- Deferred ideas from the prior session: batched wasm-side helpers and build-level optimization tuning. Build-level tuning found a real win (`-O2`), but the remaining promising paths now look more invasive and should be validated against more than one workload to avoid overfitting.
77 changes: 77 additions & 0 deletions autoresearch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/bin/bash
set -euo pipefail

# Record the build start time. python3 is used for a portable sub-second
# clock (`date +%s.%N` is not available on all platforms).
build_start=$(python3 - <<'PY'
import time
print(time.time())
PY
)

# Rebuild the wasm module and the JS bundle; output is suppressed so that
# only the METRIC lines below reach stdout for the harness to parse.
npm run build:wasm >/dev/null
npm run build >/dev/null

build_end=$(python3 - <<'PY'
import time
print(time.time())
PY
)

# Total build duration in seconds, rounded to microsecond precision.
# NOTE(review): this span covers BOTH `build:wasm` and `build` (JS bundling),
# although the metric name suggests wasm-only build time — confirm intended.
# Unquoted heredoc delimiter is deliberate: the shell must expand
# ${build_end}/${build_start} into the Python source.
wasm_build_seconds=$(python3 - <<PY
print(round(${build_end} - ${build_start}, 6))
PY
)

# Size of the produced wasm binary in kilobytes (3 decimal places).
glue_wasm_kb=$(python3 - <<'PY'
from pathlib import Path
print(round(Path('build/glue.wasm').stat().st_size / 1024, 3))
PY
)
# Run the focused heapsort benchmark in Node. Build metrics are passed via
# environment variables because the heredoc delimiter is quoted ('EOF'),
# which disables shell expansion inside the embedded JavaScript.
WASMOON_WASM_BUILD_SECONDS="$wasm_build_seconds" WASMOON_GLUE_WASM_KB="$glue_wasm_kb" node --input-type=module <<'EOF'
import { readFileSync } from 'node:fs'
import path from 'node:path'
import { performance } from 'node:perf_hooks'
import { pathToFileURL } from 'node:url'

// Read the benchmark chunk source and import the freshly built JS bridge
// from dist/ (pathToFileURL is required for ESM dynamic import of a path).
const root = process.cwd()
const heapsort = readFileSync(path.join(root, 'bench', 'heapsort.lua'), 'utf8')
const distIndex = pathToFileURL(path.join(root, 'dist', 'index.js')).href
const { Lua } = await import(distIndex)

// Compute the mean and population standard deviation (divides by N, not
// N-1) of an array of timing samples, in the samples' unit (milliseconds).
function stats(times) {
  let total = 0
  for (const sample of times) {
    total += sample
  }
  const avg = total / times.length

  let squaredError = 0
  for (const sample of times) {
    squaredError += (sample - avg) ** 2
  }
  const stddev = Math.sqrt(squaredError / times.length)

  return { avg, stddev }
}

// Benchmark configuration: 60 measured iterations after 8 warmup runs.
const iterations = 60
const warmup = 8
// Load the wasm module exactly once; per-iteration state creation is part
// of the measured workload, module loading is not.
const lua = await Lua.load()

// One benchmark iteration: create a fresh Lua state, load the heapsort
// chunk, call it (it leaves one function on the stack), call that returned
// function, then tear the state down.
// NOTE(review): the luaL_loadstring/lua_callk return codes are not checked,
// so a load or runtime error would pass silently — presumably acceptable
// for a trusted benchmark script, but confirm.
async function runIteration() {
  const state = lua.createState()
  state.global.lua.luaL_loadstring(state.global.address, heapsort)
  state.global.lua.lua_callk(state.global.address, 0, 1, 0, null)
  state.global.lua.lua_callk(state.global.address, 0, 0, 0, null)
  state.global.close()
}

// Warmup runs: let JIT / wasm instantiation effects settle before measuring.
for (let i = 0; i < warmup; i++) {
  await runIteration()
}

// Measured runs: wall-clock time per iteration via performance.now().
const times = []
for (let i = 0; i < iterations; i++) {
  const start = performance.now()
  await runIteration()
  times.push(performance.now() - start)
}

// Emit `METRIC name=value` lines — the exact format the autoresearch
// harness parses from stdout. Build metrics come from the shell via env.
const { avg, stddev } = stats(times)
console.log(`METRIC wasmoon_heapsort_avg_ms=${avg.toFixed(6)}`)
console.log(`METRIC wasmoon_heapsort_stddev_ms=${stddev.toFixed(6)}`)
console.log(`METRIC wasm_build_seconds=${Number(process.env.WASMOON_WASM_BUILD_SECONDS).toFixed(6)}`)
console.log(`METRIC glue_wasm_kb=${Number(process.env.WASMOON_GLUE_WASM_KB).toFixed(3)}`)
console.log(`METRIC iterations=${iterations}`)
console.log(`METRIC warmup=${warmup}`)
EOF
Loading
Loading