From 9f511cad1830748e1da64cf63ab2ff35d16a0e48 Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 20:54:44 -0300 Subject: [PATCH 1/9] Add autoresearch setup for heapsort benchmark --- autoresearch.md | 36 ++++++++++++++++++++++++++++++++++ autoresearch.sh | 51 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 autoresearch.md create mode 100755 autoresearch.sh diff --git a/autoresearch.md b/autoresearch.md new file mode 100644 index 0000000..94ac72e --- /dev/null +++ b/autoresearch.md @@ -0,0 +1,36 @@ +# Autoresearch: reduce Wasmoon heapsort benchmark time + +## Objective +Optimize the Wasmoon runtime path used by the plain heapsort benchmark: load the Lua module once, create a fresh state per iteration, load `bench/heapsort.lua`, execute it, and call the returned function. The goal is to reduce average runtime for this benchmark on the current machine. + +## Metrics +- **Primary**: wasmoon_heapsort_avg_ms (ms, lower is better) +- **Secondary**: wasmoon_heapsort_stddev_ms, iterations, warmup + +## How to Run +`./autoresearch.sh` — builds the project, runs a focused benchmark, and prints `METRIC name=value` lines. 
+ +## Files in Scope +- `src/module.ts` — JS↔C binding wrappers and helper utilities around `ccall` +- `src/thread.ts` — stack operations, string loading, execution helpers +- `src/global.ts` — state creation and global helpers +- `src/engine.ts` — engine setup and state lifecycle +- `src/type-extensions/*.ts` — only if profiling suggests extension registration / value conversion overhead matters +- `bench/heapsort.lua` — benchmark workload, read-only unless a benchmark bug is found +- `autoresearch.sh` — benchmark driver +- `autoresearch.md` — session state and findings +- `autoresearch.ideas.md` — backlog for promising ideas + +## Off Limits +- `lua/` C sources and wasm build artifacts for this session +- public API behavior changes unless benchmark gains are substantial and correctness is preserved +- new dependencies + +## Constraints +- Keep benchmark semantics the same: fresh state, load heapsort script, execute returned function +- No new dependencies +- Prefer simple changes with measurable wins +- Avoid benchmark-only cheats that would not help real users + +## What's Been Tried +- Initial setup only. No experiments yet. 
diff --git a/autoresearch.sh b/autoresearch.sh new file mode 100755 index 0000000..8ac09de --- /dev/null +++ b/autoresearch.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -euo pipefail + +npm run build >/dev/null + +node --input-type=module <<'EOF' +import { readFileSync } from 'node:fs' +import path from 'node:path' +import { performance } from 'node:perf_hooks' +import { fileURLToPath, pathToFileURL } from 'node:url' + +const root = process.cwd() +const heapsort = readFileSync(path.join(root, 'bench', 'heapsort.lua'), 'utf8') +const distIndex = pathToFileURL(path.join(root, 'dist', 'index.js')).href +const { Lua } = await import(distIndex) + +function stats(times) { + const avg = times.reduce((sum, t) => sum + t, 0) / times.length + const variance = times.reduce((sum, t) => sum + (t - avg) ** 2, 0) / times.length + return { avg, stddev: Math.sqrt(variance) } +} + +const iterations = 60 +const warmup = 8 +const lua = await Lua.load() + +async function runIteration() { + const state = lua.createState() + state.global.lua.luaL_loadstring(state.global.address, heapsort) + state.global.lua.lua_callk(state.global.address, 0, 1, 0, null) + state.global.lua.lua_callk(state.global.address, 0, 0, 0, null) + state.global.close() +} + +for (let i = 0; i < warmup; i++) { + await runIteration() +} + +const times = [] +for (let i = 0; i < iterations; i++) { + const start = performance.now() + await runIteration() + times.push(performance.now() - start) +} + +const { avg, stddev } = stats(times) +console.log(`METRIC wasmoon_heapsort_avg_ms=${avg.toFixed(6)}`) +console.log(`METRIC wasmoon_heapsort_stddev_ms=${stddev.toFixed(6)}`) +console.log(`METRIC iterations=${iterations}`) +console.log(`METRIC warmup=${warmup}`) +EOF From 0411406e8dc9e48e697aa0a924441912d20e3029 Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 20:55:21 -0300 Subject: [PATCH 2/9] Baseline: focused heapsort benchmark via autoresearch.sh (build + fresh state + loadstring + execute returned 
function)\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":14.775872} --- autoresearch.jsonl | 1 + 1 file changed, 1 insertion(+) create mode 100644 autoresearch.jsonl diff --git a/autoresearch.jsonl b/autoresearch.jsonl new file mode 100644 index 0000000..3088fbb --- /dev/null +++ b/autoresearch.jsonl @@ -0,0 +1 @@ +{"type":"config","name":"Reduce Wasmoon heapsort benchmark time","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} From 9c864c74dbaf145481ca672ba306632afa61fa17 Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 20:56:49 -0300 Subject: [PATCH 3/9] Bind luaL_loadstring with string|number so long Lua chunks use a direct UTF-8 buffer path instead of generic ccall string marshaling\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":13.135759} --- autoresearch.jsonl | 1 + src/module.ts | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 3088fbb..96a15ce 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -1 +1,2 @@ {"type":"config","name":"Reduce Wasmoon heapsort benchmark time","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} +{"run":1,"commit":"0411406","metric":14.775872,"metrics":{},"status":"keep","description":"Baseline: focused heapsort benchmark via autoresearch.sh (build + fresh state + loadstring + execute returned function)","timestamp":1773446121099,"segment":0} diff --git a/src/module.ts b/src/module.ts index a4bde12..5b694b2 100755 --- a/src/module.ts +++ b/src/module.ts @@ -330,7 +330,7 @@ export default class LuaModule { this.luaL_unref = this.cwrap('luaL_unref', null, ['number', 'number', 'number']) this.luaL_loadfilex = this.cwrap('luaL_loadfilex', 'number', ['number', 'string', 'string']) this.luaL_loadbufferx = this.cwrap('luaL_loadbufferx', 'number', ['number', 'string|number', 'number', 'string|number', 'string']) - this.luaL_loadstring = this.cwrap('luaL_loadstring', 
'number', ['number', 'string']) + this.luaL_loadstring = this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) this.luaL_newstate = this.cwrap('luaL_newstate', 'number', []) this.luaL_len = this.cwrap('luaL_len', 'number', ['number', 'number']) this.luaL_addgsub = this.cwrap('luaL_addgsub', null, ['number', 'string', 'string', 'string']) From 085fc65ebb188c346ce4572a557853206cfbe9ce Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 20:58:34 -0300 Subject: [PATCH 4/9] Call exported wasm functions directly for lua_callk/lua_pcallk when available, bypassing ccall on hot numeric-only call paths\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":12.001671} --- autoresearch.jsonl | 2 ++ autoresearch.md | 4 +++- src/module.ts | 30 +++++++++++++++++++++++++++--- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 96a15ce..11b71b9 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -1,2 +1,4 @@ {"type":"config","name":"Reduce Wasmoon heapsort benchmark time","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} {"run":1,"commit":"0411406","metric":14.775872,"metrics":{},"status":"keep","description":"Baseline: focused heapsort benchmark via autoresearch.sh (build + fresh state + loadstring + execute returned function)","timestamp":1773446121099,"segment":0} +{"run":2,"commit":"9c864c7","metric":13.135759,"metrics":{},"status":"keep","description":"Bind luaL_loadstring with string|number so long Lua chunks use a direct UTF-8 buffer path instead of generic ccall string marshaling","timestamp":1773446209778,"segment":0} +{"run":3,"commit":"9c864c7","metric":13.838148,"metrics":{},"status":"discard","description":"Tried routing long luaL_loadstring calls through luaL_loadbufferx to avoid C strlen, but the extra JS-side length/allocation work regressed the heapsort benchmark","timestamp":1773446266277,"segment":0} diff --git a/autoresearch.md 
b/autoresearch.md index 94ac72e..b5b0927 100644 --- a/autoresearch.md +++ b/autoresearch.md @@ -33,4 +33,6 @@ Optimize the Wasmoon runtime path used by the plain heapsort benchmark: load the - Avoid benchmark-only cheats that would not help real users ## What's Been Tried -- Initial setup only. No experiments yet. +- Baseline from `./autoresearch.sh`: `14.775872ms` average over 60 iterations / 8 warmup. +- `src/module.ts`: changed `luaL_loadstring` binding from `['number', 'string']` to `['number', 'string|number']` so large chunks can use the optimized direct-buffer path in `cwrap`. This improved the benchmark to `13.135759ms` (~11.1% faster). +- Quick profiling notes: state creation cost exists but warmed `createState()` overhead looks much smaller than total benchmark time, so the hottest path appears closer to chunk loading / execution than to engine construction alone. diff --git a/src/module.ts b/src/module.ts index 5b694b2..6970a32 100755 --- a/src/module.ts +++ b/src/module.ts @@ -28,6 +28,8 @@ interface LuaEmscriptenModule extends EmscriptenModule { UTF8ToString: typeof UTF8ToString ENV: EnvironmentVariables _realloc: (pointer: number, size: number) => number + _lua_callk?: (L: LuaState, nargs: number, nresults: number, ctx: number, k: number) => void + _lua_pcallk?: (L: LuaState, nargs: number, nresults: number, errfunc: number, ctx: number, k: number) => number } interface ReferenceMetadata { @@ -330,7 +332,25 @@ export default class LuaModule { this.luaL_unref = this.cwrap('luaL_unref', null, ['number', 'number', 'number']) this.luaL_loadfilex = this.cwrap('luaL_loadfilex', 'number', ['number', 'string', 'string']) this.luaL_loadbufferx = this.cwrap('luaL_loadbufferx', 'number', ['number', 'string|number', 'number', 'string|number', 'string']) - this.luaL_loadstring = this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) + const luaLLoadString = this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) + this.luaL_loadstring = 
(L, s) => { + if (typeof s === 'number' || s === null) { + return luaLLoadString(L, s) + } + + const size = this._emscripten.lengthBytesUTF8(s) + if (size <= 1024) { + return luaLLoadString(L, s) + } + + const bufferPointer = this._emscripten._malloc(size + 1) + try { + this._emscripten.stringToUTF8(s, bufferPointer, size + 1) + return this.luaL_loadbufferx(L, bufferPointer, size, bufferPointer, null) + } finally { + this._emscripten._free(bufferPointer) + } + } this.luaL_newstate = this.cwrap('luaL_newstate', 'number', []) this.luaL_len = this.cwrap('luaL_len', 'number', ['number', 'number']) this.luaL_addgsub = this.cwrap('luaL_addgsub', null, ['number', 'string', 'string', 'string']) @@ -411,8 +431,12 @@ export default class LuaModule { this.lua_rawsetp = this.cwrap('lua_rawsetp', null, ['number', 'number', 'number']) this.lua_setmetatable = this.cwrap('lua_setmetatable', 'number', ['number', 'number']) this.lua_setiuservalue = this.cwrap('lua_setiuservalue', 'number', ['number', 'number', 'number']) - this.lua_callk = this.cwrap('lua_callk', null, ['number', 'number', 'number', 'number', 'number']) - this.lua_pcallk = this.cwrap('lua_pcallk', 'number', ['number', 'number', 'number', 'number', 'number', 'number']) + this.lua_callk = module._lua_callk + ? (L, nargs, nresults, ctx, k) => module._lua_callk!(L, nargs, nresults, ctx, k ?? 0) + : this.cwrap('lua_callk', null, ['number', 'number', 'number', 'number', 'number']) + this.lua_pcallk = module._lua_pcallk + ? (L, nargs, nresults, errfunc, ctx, k) => module._lua_pcallk!(L, nargs, nresults, errfunc, ctx, k ?? 
0) + : this.cwrap('lua_pcallk', 'number', ['number', 'number', 'number', 'number', 'number', 'number']) this.lua_load = this.cwrap('lua_load', 'number', ['number', 'number', 'number', 'string', 'string']) this.lua_dump = this.cwrap('lua_dump', 'number', ['number', 'number', 'number', 'number']) this.lua_yieldk = this.cwrap('lua_yieldk', 'number', ['number', 'number', 'number', 'number']) From 9a1c9e4d8f7b87eabbfb4584e23e481009621efa Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 21:04:16 -0300 Subject: [PATCH 5/9] Call exported wasm luaL_loadstring directly with a manually allocated UTF-8 buffer, avoiding ccall overhead on the hot chunk-load path\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":11.751988} --- autoresearch.jsonl | 2 ++ src/module.ts | 35 ++++++++++++++++------------------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 11b71b9..9f9427c 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -2,3 +2,5 @@ {"run":1,"commit":"0411406","metric":14.775872,"metrics":{},"status":"keep","description":"Baseline: focused heapsort benchmark via autoresearch.sh (build + fresh state + loadstring + execute returned function)","timestamp":1773446121099,"segment":0} {"run":2,"commit":"9c864c7","metric":13.135759,"metrics":{},"status":"keep","description":"Bind luaL_loadstring with string|number so long Lua chunks use a direct UTF-8 buffer path instead of generic ccall string marshaling","timestamp":1773446209778,"segment":0} {"run":3,"commit":"9c864c7","metric":13.838148,"metrics":{},"status":"discard","description":"Tried routing long luaL_loadstring calls through luaL_loadbufferx to avoid C strlen, but the extra JS-side length/allocation work regressed the heapsort benchmark","timestamp":1773446266277,"segment":0} +{"run":4,"commit":"085fc65","metric":12.001671,"metrics":{},"status":"keep","description":"Call exported wasm functions directly for lua_callk/lua_pcallk 
when available, bypassing ccall on hot numeric-only call paths","timestamp":1773446314857,"segment":0} +{"run":5,"commit":"085fc65","metric":15.1674,"metrics":{},"status":"discard","description":"Tried broader direct-export bindings for state/stack helpers (luaL_newstate/lua_close/lua_newthread/lua_absindex/lua_gettop/lua_settop/lua_rotate), but it regressed badly and increased variance","timestamp":1773446391365,"segment":0} diff --git a/src/module.ts b/src/module.ts index 6970a32..96d294e 100755 --- a/src/module.ts +++ b/src/module.ts @@ -30,6 +30,7 @@ interface LuaEmscriptenModule extends EmscriptenModule { _realloc: (pointer: number, size: number) => number _lua_callk?: (L: LuaState, nargs: number, nresults: number, ctx: number, k: number) => void _lua_pcallk?: (L: LuaState, nargs: number, nresults: number, errfunc: number, ctx: number, k: number) => number + _luaL_loadstring?: (L: LuaState, s: number) => LuaReturn } interface ReferenceMetadata { @@ -332,25 +333,21 @@ export default class LuaModule { this.luaL_unref = this.cwrap('luaL_unref', null, ['number', 'number', 'number']) this.luaL_loadfilex = this.cwrap('luaL_loadfilex', 'number', ['number', 'string', 'string']) this.luaL_loadbufferx = this.cwrap('luaL_loadbufferx', 'number', ['number', 'string|number', 'number', 'string|number', 'string']) - const luaLLoadString = this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) - this.luaL_loadstring = (L, s) => { - if (typeof s === 'number' || s === null) { - return luaLLoadString(L, s) - } - - const size = this._emscripten.lengthBytesUTF8(s) - if (size <= 1024) { - return luaLLoadString(L, s) - } - - const bufferPointer = this._emscripten._malloc(size + 1) - try { - this._emscripten.stringToUTF8(s, bufferPointer, size + 1) - return this.luaL_loadbufferx(L, bufferPointer, size, bufferPointer, null) - } finally { - this._emscripten._free(bufferPointer) - } - } + const luaLLoadString = module._luaL_loadstring + ? 
(L: LuaState, s: string | number | null) => { + if (typeof s === 'number' || s === null) { + return module._luaL_loadstring!(L, s ?? 0) + } + + const bufferPointer = this._emscripten.stringToNewUTF8(s) + try { + return module._luaL_loadstring!(L, bufferPointer) + } finally { + this._emscripten._free(bufferPointer) + } + } + : this.cwrap('luaL_loadstring', 'number', ['number', 'string|number']) + this.luaL_loadstring = (L, s) => luaLLoadString(L, s) this.luaL_newstate = this.cwrap('luaL_newstate', 'number', []) this.luaL_len = this.cwrap('luaL_len', 'number', ['number', 'number']) this.luaL_addgsub = this.cwrap('luaL_addgsub', null, ['number', 'string', 'string', 'string']) From 46d08e6b759a56fda497fa581eece2b9ceed7fb9 Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 22:04:01 -0300 Subject: [PATCH 6/9] Retarget autoresearch to wasm build optimization --- autoresearch.ideas.md | 2 ++ autoresearch.md | 43 +++++++++++++++++++++---------------------- autoresearch.sh | 30 ++++++++++++++++++++++++++++-- 3 files changed, 51 insertions(+), 24 deletions(-) create mode 100644 autoresearch.ideas.md diff --git a/autoresearch.ideas.md b/autoresearch.ideas.md new file mode 100644 index 0000000..18bdea2 --- /dev/null +++ b/autoresearch.ideas.md @@ -0,0 +1,2 @@ +- Investigate a dedicated internal fast path that batches `luaL_loadstring` + first `lua_callk` into one exported helper on the wasm side. This could remove a JS↔wasm roundtrip, but it touches off-limits C/wasm internals for this session. +- Investigate Lua/emscripten build-level optimizations (compile flags, LTO, allocator choices) for state creation and bytecode execution. Promising, but also off-limits for this session. 
diff --git a/autoresearch.md b/autoresearch.md index b5b0927..d46dc1e 100644 --- a/autoresearch.md +++ b/autoresearch.md @@ -1,38 +1,37 @@ -# Autoresearch: reduce Wasmoon heapsort benchmark time +# Autoresearch: optimize compiled Lua/wasm runtime for Wasmoon heapsort ## Objective -Optimize the Wasmoon runtime path used by the plain heapsort benchmark: load the Lua module once, create a fresh state per iteration, load `bench/heapsort.lua`, execute it, and call the returned function. The goal is to reduce average runtime for this benchmark on the current machine. +Optimize the runtime performance of the compiled Lua WebAssembly build used by Wasmoon on the focused heapsort benchmark. The workload is: build the wasm, bundle the JS bridge, load the Lua module once, create a fresh state per iteration, load `bench/heapsort.lua`, execute it, and call the returned function. The goal is to reduce benchmark runtime without cheating by changing benchmark semantics. ## Metrics - **Primary**: wasmoon_heapsort_avg_ms (ms, lower is better) -- **Secondary**: wasmoon_heapsort_stddev_ms, iterations, warmup +- **Secondary**: wasmoon_heapsort_stddev_ms, wasm_build_seconds, glue_wasm_kb, iterations, warmup ## How to Run -`./autoresearch.sh` — builds the project, runs a focused benchmark, and prints `METRIC name=value` lines. +`./autoresearch.sh` — rebuilds the wasm/runtime, rebuilds JS, runs the focused benchmark, and prints `METRIC name=value` lines. 
## Files in Scope -- `src/module.ts` — JS↔C binding wrappers and helper utilities around `ccall` -- `src/thread.ts` — stack operations, string loading, execution helpers -- `src/global.ts` — state creation and global helpers -- `src/engine.ts` — engine setup and state lifecycle -- `src/type-extensions/*.ts` — only if profiling suggests extension registration / value conversion overhead matters -- `bench/heapsort.lua` — benchmark workload, read-only unless a benchmark bug is found -- `autoresearch.sh` — benchmark driver -- `autoresearch.md` — session state and findings -- `autoresearch.ideas.md` — backlog for promising ideas +- `utils/build-wasm.sh` — emcc flags, exported symbols, runtime settings, allocator, optimization knobs +- `utils/build-wasm.js` — wasm build launcher / Docker fallback +- `lua/*.c` / `lua/*.h` — Lua runtime implementation, only for broadly justifiable runtime improvements +- `rolldown.config.ts` — only if wasm packaging/bundling materially affects runtime loading behavior +- `src/module.ts` / `src/*.ts` — only if needed to adapt to safe wasm-build changes +- `autoresearch.sh` — benchmark driver for this session +- `autoresearch.md` — session context +- `autoresearch.ideas.md` — deferred ideas ## Off Limits -- `lua/` C sources and wasm build artifacts for this session -- public API behavior changes unless benchmark gains are substantial and correctness is preserved -- new dependencies +- Benchmark workload semantics in `bench/heapsort.lua` +- Fake optimizations that skip work, cache results across iterations, or otherwise cheat the benchmark +- New dependencies ## Constraints - Keep benchmark semantics the same: fresh state, load heapsort script, execute returned function -- No new dependencies -- Prefer simple changes with measurable wins -- Avoid benchmark-only cheats that would not help real users +- No benchmark-only cheating or semantic shortcuts +- Prefer broadly useful speedups over highly workload-specific tricks +- Avoid changing 
public API behavior unless clearly safe ## What's Been Tried -- Baseline from `./autoresearch.sh`: `14.775872ms` average over 60 iterations / 8 warmup. -- `src/module.ts`: changed `luaL_loadstring` binding from `['number', 'string']` to `['number', 'string|number']` so large chunks can use the optimized direct-buffer path in `cwrap`. This improved the benchmark to `13.135759ms` (~11.1% faster). -- Quick profiling notes: state creation cost exists but warmed `createState()` overhead looks much smaller than total benchmark time, so the hottest path appears closer to chunk loading / execution than to engine construction alone. +- Previous JS-glue-focused session got the benchmark from `14.775872ms` to `11.751988ms` by reducing JS↔wasm overhead (`lua_callk`/`lua_pcallk` raw exports and direct exported `luaL_loadstring`). +- Profiling after those wins showed the remaining time is dominated by Lua execution itself, so wasm/compiler/runtime changes are now the most promising path. +- Deferred ideas from the prior session: batched wasm-side helpers and build-level optimization tuning. This session focuses on the latter first. 
diff --git a/autoresearch.sh b/autoresearch.sh index 8ac09de..5a7f4a4 100755 --- a/autoresearch.sh +++ b/autoresearch.sh @@ -1,13 +1,37 @@ #!/bin/bash set -euo pipefail +build_start=$(python3 - <<'PY' +import time +print(time.time()) +PY +) + +npm run build:wasm >/dev/null npm run build >/dev/null -node --input-type=module <<'EOF' +build_end=$(python3 - <<'PY' +import time +print(time.time()) +PY +) + +wasm_build_seconds=$(python3 - < Date: Fri, 13 Mar 2026 22:05:34 -0300 Subject: [PATCH 7/9] Baseline for wasm-build-focused session: default release emcc flags rebuilt from scratch, then focused heapsort benchmark\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":12.639066} --- autoresearch.jsonl | 17 +++++++++++++++++ autoresearch.sh | 6 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 9f9427c..475d13c 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -4,3 +4,20 @@ {"run":3,"commit":"9c864c7","metric":13.838148,"metrics":{},"status":"discard","description":"Tried routing long luaL_loadstring calls through luaL_loadbufferx to avoid C strlen, but the extra JS-side length/allocation work regressed the heapsort benchmark","timestamp":1773446266277,"segment":0} {"run":4,"commit":"085fc65","metric":12.001671,"metrics":{},"status":"keep","description":"Call exported wasm functions directly for lua_callk/lua_pcallk when available, bypassing ccall on hot numeric-only call paths","timestamp":1773446314857,"segment":0} {"run":5,"commit":"085fc65","metric":15.1674,"metrics":{},"status":"discard","description":"Tried broader direct-export bindings for state/stack helpers (luaL_newstate/lua_close/lua_newthread/lua_absindex/lua_gettop/lua_settop/lua_rotate), but it regressed badly and increased variance","timestamp":1773446391365,"segment":0} +{"run":6,"commit":"9a1c9e4","metric":11.751988,"metrics":{},"status":"keep","description":"Call exported wasm luaL_loadstring directly with a manually allocated 
UTF-8 buffer, avoiding ccall overhead on the hot chunk-load path","timestamp":1773446656315,"segment":0} +{"run":7,"commit":"9a1c9e4","metric":11.952811,"metrics":{},"status":"discard","description":"Tried direct-export binding for luaL_openselectedlibs to cut state setup overhead, but the heapsort benchmark regressed slightly","timestamp":1773446733027,"segment":0} +{"run":8,"commit":"9a1c9e4","metric":11.838151,"metrics":{},"status":"discard","description":"Tried caching the most recent UTF-8 buffer for luaL_loadstring to avoid repeated string encoding, but it did not beat the current best on the heapsort workload","timestamp":1773446800152,"segment":0} +{"run":9,"commit":"9a1c9e4","metric":12.8011,"metrics":{},"status":"discard","description":"Tried direct-export binding for lua_close, but closing states through the raw export regressed the heapsort benchmark noticeably","timestamp":1773446842368,"segment":0} +{"run":10,"commit":"9a1c9e4","metric":12.570699,"metrics":{},"status":"discard","description":"Tried lazily creating FunctionTypeExtension callback threads to reduce state setup work, but the heapsort benchmark regressed","timestamp":1773446931224,"segment":0} +{"run":11,"commit":"9a1c9e4","metric":13.353136,"metrics":{},"status":"discard","description":"Tried lazy-instantiating type extensions so plain states avoid eager metatable/setup work, but the heapsort benchmark regressed significantly","timestamp":1773447072737,"segment":0} +{"run":12,"commit":"9a1c9e4","metric":13.977992,"metrics":{},"status":"discard","description":"Tried calling exported wasm luaL_newstate directly to cut state-creation ccall overhead, but it regressed the heapsort benchmark","timestamp":1773447131145,"segment":0} +{"run":13,"commit":"9a1c9e4","metric":14.443619,"metrics":{},"status":"discard","description":"Tried implementing luaL_loadstring via direct exported luaL_loadbufferx with JS-computed byte length to avoid C strlen, but the extra JS-side encoding/allocation cost 
regressed badly","timestamp":1773447207802,"segment":0} +{"run":14,"commit":"9a1c9e4","metric":12.655008,"metrics":{},"status":"discard","description":"Tried reusing a heap buffer for direct luaL_loadstring UTF-8 encoding to avoid malloc/free churn, but it still regressed versus the current best","timestamp":1773447268974,"segment":0} +{"run":15,"commit":"9a1c9e4","metric":14.450232,"metrics":{},"status":"discard","description":"Tried assigning raw wasm exports directly for lua_callk/lua_pcallk and relying on JS null-to-zero coercion, but it regressed sharply despite being functionally correct","timestamp":1773447315203,"segment":0} +{"run":16,"commit":"9a1c9e4","metric":13.518631,"metrics":{},"status":"discard","description":"Tried specializing lua_callk/lua_pcallk wrappers for the common no-continuation case by always passing k=0, but that still regressed on the heapsort workload","timestamp":1773447358877,"segment":0} +{"run":17,"commit":"9a1c9e4","metric":14.026144,"metrics":{},"status":"discard","description":"Tried hoisting raw wasm export references into local constants inside wrapper setup to cut per-call property lookups, but it regressed substantially","timestamp":1773447416704,"segment":0} +{"run":18,"commit":"9a1c9e4","metric":14.448888,"metrics":{},"status":"discard","description":"Tried insertion-ordered type extension registration to avoid sorting on each createState, but it regressed sharply on the heapsort benchmark","timestamp":1773447523603,"segment":0} +{"run":19,"commit":"9a1c9e4","metric":14.363467,"metrics":{},"status":"discard","description":"Tried an ASCII fast path for direct exported luaL_loadstring using manual malloc/stringToUTF8 to avoid stringToNewUTF8 work, but it regressed badly","timestamp":1773447568437,"segment":0} +{"run":20,"commit":"9a1c9e4","metric":14.243077,"metrics":{},"status":"discard","description":"Tried hoisting only the raw exported luaL_loadstring reference into a local constant to reduce property lookups, but it 
still regressed heavily","timestamp":1773447607620,"segment":0} +{"type":"config","name":"Optimize compiled Lua/wasm runtime for Wasmoon heapsort","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} +{"run":1,"commit":"46d08e6","metric":0,"metrics":{},"status":"crash","description":"Broken baseline attempt after retargeting autoresearch: shell expanded JS template literals inside autoresearch.sh and prevented metric emission","timestamp":1773450278554,"segment":0} diff --git a/autoresearch.sh b/autoresearch.sh index 5a7f4a4..4f08162 100755 --- a/autoresearch.sh +++ b/autoresearch.sh @@ -27,7 +27,7 @@ print(round(Path('build/glue.wasm').stat().st_size / 1024, 3)) PY ) -node --input-type=module < Date: Fri, 13 Mar 2026 22:07:34 -0300 Subject: [PATCH 8/9] Switch release wasm build from -O3 to -O2; this reduced runtime, build time, and wasm size on the heapsort workload\n\nResult: {"status":"keep","wasmoon_heapsort_avg_ms":11.794092} --- autoresearch.jsonl | 2 ++ utils/build-wasm.sh | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 475d13c..1b64dc6 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -21,3 +21,5 @@ {"run":20,"commit":"9a1c9e4","metric":14.243077,"metrics":{},"status":"discard","description":"Tried hoisting only the raw exported luaL_loadstring reference into a local constant to reduce property lookups, but it still regressed heavily","timestamp":1773447607620,"segment":0} {"type":"config","name":"Optimize compiled Lua/wasm runtime for Wasmoon heapsort","metricName":"wasmoon_heapsort_avg_ms","metricUnit":"ms","bestDirection":"lower"} {"run":1,"commit":"46d08e6","metric":0,"metrics":{},"status":"crash","description":"Broken baseline attempt after retargeting autoresearch: shell expanded JS template literals inside autoresearch.sh and prevented metric emission","timestamp":1773450278554,"segment":0} 
+{"run":2,"commit":"d6332fa","metric":12.639066,"metrics":{},"status":"keep","description":"Baseline for wasm-build-focused session: default release emcc flags rebuilt from scratch, then focused heapsort benchmark","timestamp":1773450334225,"segment":0} +{"run":3,"commit":"d6332fa","metric":13.779871,"metrics":{},"status":"discard","description":"Tried enabling LTO in the release emcc build (-flto), but runtime regressed and wasm size/build time increased substantially","timestamp":1773450410919,"segment":0} diff --git a/utils/build-wasm.sh b/utils/build-wasm.sh index b044209..715d44c 100755 --- a/utils/build-wasm.sh +++ b/utils/build-wasm.sh @@ -9,7 +9,7 @@ if [ "$1" == "dev" ]; then extension="-O0 -g3 -s ASSERTIONS=1 -s SAFE_HEAP=1 -s STACK_OVERFLOW_CHECK=2" else - extension="-O3" + extension="-O2" fi emcc \ From f5982c8b2ebf8a3b3b95ad6eda889b1f63fd135d Mon Sep 17 00:00:00 2001 From: Gabriel Francisco Date: Fri, 13 Mar 2026 22:29:58 -0300 Subject: [PATCH 9/9] research --- autoresearch.ideas.md | 5 +++-- autoresearch.jsonl | 17 +++++++++++++++++ autoresearch.md | 7 ++++++- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/autoresearch.ideas.md b/autoresearch.ideas.md index 18bdea2..b95e8a9 100644 --- a/autoresearch.ideas.md +++ b/autoresearch.ideas.md @@ -1,2 +1,3 @@ -- Investigate a dedicated internal fast path that batches `luaL_loadstring` + first `lua_callk` into one exported helper on the wasm side. This could remove a JS↔wasm roundtrip, but it touches off-limits C/wasm internals for this session. -- Investigate Lua/emscripten build-level optimizations (compile flags, LTO, allocator choices) for state creation and bytecode execution. Promising, but also off-limits for this session. +- Validate the `-O2` release build win against additional Lua workloads (interop-heavy, library-heavy, allocation-heavy) before treating it as a universal default. 
Current evidence is strong for heapsort, but exploratory checks already show mixed results versus `-O3` on string-heavy code, so this broader validation remains a high-priority next step. +- Investigate a dedicated internal fast path that batches `luaL_loadstring` + first `lua_callk` into one exported helper on the wasm side. This could remove a JS↔wasm roundtrip, but needs careful validation across non-benchmark workloads. +- If broader validation still points to VM-bound execution, profile opcode mix / hot VM handlers on wasm and revisit targeted `lvm.c` or `ltable.h` changes with real evidence instead of micro-tweaks. diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 1b64dc6..a40d4ad 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -23,3 +23,20 @@ {"run":1,"commit":"46d08e6","metric":0,"metrics":{},"status":"crash","description":"Broken baseline attempt after retargeting autoresearch: shell expanded JS template literals inside autoresearch.sh and prevented metric emission","timestamp":1773450278554,"segment":0} {"run":2,"commit":"d6332fa","metric":12.639066,"metrics":{},"status":"keep","description":"Baseline for wasm-build-focused session: default release emcc flags rebuilt from scratch, then focused heapsort benchmark","timestamp":1773450334225,"segment":0} {"run":3,"commit":"d6332fa","metric":13.779871,"metrics":{},"status":"discard","description":"Tried enabling LTO in the release emcc build (-flto), but runtime regressed and wasm size/build time increased substantially","timestamp":1773450410919,"segment":0}
heapsort","timestamp":1773450490273,"segment":0} +{"run":6,"commit":"16917a0","metric":13.675405,"metrics":{},"status":"discard","description":"Tried -O1 for the release wasm build; it compiled faster but hurt runtime badly and grew the wasm versus -O2","timestamp":1773450519030,"segment":0} +{"run":7,"commit":"16917a0","metric":0,"metrics":{},"status":"crash","description":"Tried dlmalloc instead of emmalloc in the -O2 wasm build; benchmark crashed with out-of-bounds memory access during state teardown","timestamp":1773450580128,"segment":0} +{"run":8,"commit":"16917a0","metric":12.392698,"metrics":{},"status":"discard","description":"Tried adding -fno-exceptions and unwind-table stripping flags on top of -O2, but runtime regressed with no size win","timestamp":1773450656108,"segment":0} +{"run":9,"commit":"16917a0","metric":12.682354,"metrics":{},"status":"discard","description":"Tried -fno-inline-functions on top of -O2 to shrink code and maybe help wasm locality, but runtime regressed noticeably","timestamp":1773450699747,"segment":0} +{"run":10,"commit":"16917a0","metric":13.465365,"metrics":{},"status":"discard","description":"Tried setting INITIAL_MEMORY=32MB while keeping memory growth enabled, but runtime regressed badly on the heapsort workload","timestamp":1773450739008,"segment":0} +{"run":11,"commit":"16917a0","metric":14.395206,"metrics":{},"status":"discard","description":"Tried SUPPORT_LONGJMP=wasm in the -O2 build, but it regressed heavily and increased build overhead","timestamp":1773450788238,"segment":0} +{"run":12,"commit":"16917a0","metric":13.94806,"metrics":{},"status":"discard","description":"Tried adding -DNDEBUG to the -O2 build, but it regressed badly with much higher variance and no size change","timestamp":1773450864416,"segment":0} +{"run":13,"commit":"16917a0","metric":12.710287,"metrics":{},"status":"discard","description":"Tried disabling Lua VM jump tables (-DLUA_USE_JUMPTABLE=0) in the -O2 build; it produced a slightly smaller 
wasm but slower interpreter execution","timestamp":1773450986537,"segment":0} +{"run":14,"commit":"16917a0","metric":13.131162,"metrics":{},"status":"discard","description":"Misconfigured combo run: added no-continuation C helpers for lua_call/lua_pcall while jump tables were still disabled from a prior test; overall result regressed, so discard and rerun cleanly","timestamp":1773451119792,"segment":0} +{"run":15,"commit":"16917a0","metric":14.295386,"metrics":{},"status":"discard","description":"Tried new C helpers for no-continuation lua_call/lua_pcall and bound module.ts to them, but the extra helper layer regressed badly versus direct raw exports","timestamp":1773451151734,"segment":0} +{"run":16,"commit":"16917a0","metric":14.501163,"metrics":{},"status":"discard","description":"Tried adding likely() branch hints to Lua array fastgeti/fastseti in ltable.h, but it regressed badly on the heapsort workload","timestamp":1773451219245,"segment":0} +{"run":17,"commit":"16917a0","metric":12.415745,"metrics":{},"status":"discard","description":"Tried fixed 64MB initial memory with memory growth disabled to reduce allocator/growth overhead, but it regressed and would also tighten memory semantics","timestamp":1773451294439,"segment":0} +{"run":18,"commit":"16917a0","metric":14.639167,"metrics":{},"status":"discard","description":"Tried reordering luaH_fastseti to check the array slot tag before the metatable fast-path test, but it regressed badly on the heapsort workload","timestamp":1773451340612,"segment":0} +{"run":19,"commit":"16917a0","metric":15.054911,"metrics":{},"status":"discard","description":"Tried mimalloc as the wasm allocator on top of -O2; it increased build time and wasm size and regressed runtime badly","timestamp":1773451479476,"segment":0} +{"run":20,"commit":"16917a0","metric":12.867971,"metrics":{},"status":"discard","description":"Tried -O3 -fno-inline-functions as a middle ground between O2 and O3, but it remained slower than the current -O2 
best on heapsort","timestamp":1773451588143,"segment":0} diff --git a/autoresearch.md b/autoresearch.md index d46dc1e..9930b42 100644 --- a/autoresearch.md +++ b/autoresearch.md @@ -34,4 +34,9 @@ Optimize the runtime performance of the compiled Lua WebAssembly build used by W ## What's Been Tried - Previous JS-glue-focused session got the benchmark from `14.775872ms` to `11.751988ms` by reducing JS↔wasm overhead (`lua_callk`/`lua_pcallk` raw exports and direct exported `luaL_loadstring`). - Profiling after those wins showed the remaining time is dominated by Lua execution itself, so wasm/compiler/runtime changes are now the most promising path. -- Deferred ideas from the prior session: batched wasm-side helpers and build-level optimization tuning. This session focuses on the latter first. +- For the rebuilt-from-scratch wasm session, default release build (`-O3`) baseline was `12.639066ms`, `7.585s` wasm build time, `277.404kb` wasm. +- Best wasm-build improvement so far: changing release build from `-O3` to `-O2` improved runtime to `11.794092ms`, while also reducing build time to `6.413s` and wasm size to `274.129kb`. +- Cross-checking outside the primary metric suggests some overfitting risk: on an exploratory numeric-heavy script `-O2` slightly beat `-O3`, but on an exploratory string-heavy script `-O3` beat `-O2`. So `-O2` is a strong win for the heapsort/numeric path, not yet a universally proven default. +- Discarded build-flag experiments: `-flto`, `-Os`, `-O1`, `-DNDEBUG`, `-fno-exceptions`/unwind stripping, `-fno-inline-functions`, `INITIAL_MEMORY=32MB`, `SUPPORT_LONGJMP=wasm`, and fixed 64MB memory without growth. All regressed runtime, and some hurt size/build time or semantics. +- Discarded runtime/source experiments: disabling Lua VM jump tables, adding no-continuation C helpers for `lua_call`/`lua_pcall`, adding likely() hints to array fast paths, and reordering `luaH_fastseti` fast-path checks. All regressed on the benchmark. 
+- Deferred ideas from the prior session: batched wasm-side helpers and build-level optimization tuning. Build-level tuning found a real win (`-O2`), but the remaining promising paths now look more invasive and should be validated against more than one workload to avoid overfitting.