widgrensit · Taure · May 5, 2026 · May 5, 2026
diff --git a/docs/adr/0000-record-architecture-decisions.md b/docs/adr/0000-record-architecture-decisions.md
@@ -0,0 +1,49 @@
+# ADR 0000: Record architecture decisions
+
+Date: 2026-05-05
+
+## Status
+
+Accepted.
+
+## Context
+
+asobi_lua hosts user-supplied Lua running inside Luerl. Decisions about
+sandboxing, timeouts, hot-reload, bridge module shape, and which calls
+get spawn-isolated have safety implications that are easy to forget and
+expensive to revisit. We need a record.
+
+## Decision
+
+Record significant architecture decisions as numbered markdown files in
+`docs/adr/`. One file per decision. Filename: `NNNN-short-slug.md`.
+
+Same template as the `asobi` repo (Michael Nygard ADR style):
+
+- **Title** — `ADR NNNN: short imperative phrase`
+- **Date** — `YYYY-MM-DD`
+- **Status** — `Proposed` | `Accepted` | `Superseded by ADR NNNN` | `Deprecated`
+- **Context** — what's true now that motivates this decision
+- **Decision** — the choice, in one or two short paragraphs
+- **Consequences** — what this enables, what it costs
+- **Alternatives considered** — options ruled out, with one-line rationale
+
+ADR-worthy things in asobi_lua:
+
+- Changes to which Luerl entry points are exposed or restricted
+- Changes to the sandbox or timeout/heap budgets
+- New behaviour-bridge modules or callback shapes
+- Decisions that trade safety for performance
+
+Not ADR-worthy: bug fixes, renames, pure refactors.
+
+## Consequences
+
+- Future readers can recover the *why* behind sandbox + timeout choices.
+- Forces articulating which alternative was rejected and on what grounds.
+
+## Alternatives considered
+
+- **Inline comments** — drift, fragmented, easy to miss.
+- **Wiki / external doc** — separates from the code; ADRs in-repo travel
+  with the branch.
diff --git a/docs/adr/0001-shared-state-bridge-module.md b/docs/adr/0001-shared-state-bridge-module.md
@@ -0,0 +1,75 @@
+# ADR 0001: Shared-state Lua bridge as a separate module
+
+Date: 2026-05-05
+
+## Status
+
+Accepted. Shipped in asobi_lua#41 alongside asobi#117.
+
+Retroactive ADR — written after merge.
+
+## Context
+
+The asobi behaviour gained an optional `get_state/1` callback for
+shared-payload broadcasts (asobi ADR 0001). For pure Erlang games the
+callback dispatch is a static export check: implement `get_state/1` and
+the match server picks the shared path.
+
+For Lua games the dispatch is harder. The bridge module is always
+`asobi_lua_match`; whether to use shared or per-player is a property of
+the *user's Lua script*, not the bridge. Two complications:
+
+1. Erlang exports are static. If `asobi_lua_match` exports `get_state/1`,
+   the match server takes the shared path for *every* Lua match — including
+   existing scripts that defined `function get_state(player_id, state)`,
+   which would break (Lua binds the single arg to `player_id`, leaving
+   `state` nil → field access crash → empty payload to all players).
+2. Lua functions don't have arity. `function get_state(player_id, state)`
+   called with one arg silently makes `state = nil`. We can't auto-detect
+   the script's intent.
+
+## Decision
+
+Ship a second bridge module `asobi_lua_match_shared` that:
+
+- Delegates `init/1`, `join/2`, `leave/2`, `handle_input/3`, `tick/1`,
+  `vote_*` directly to `asobi_lua_match` (no duplicated work).
+- Implements its own `get_state/1` that calls the Lua script's
+  `get_state(state)` (one argument) and returns the shared map.
+- Does NOT export `get_state/2`.
+
+Selection is opt-in via `state_strategy = "shared"` in the match script's
+globals. `asobi_lua_config` reads this and adds `state_strategy => shared`
+to the mode config map. `asobi_game_modes`/`asobi_matchmaker` then resolve
+`{lua, Script}` + `state_strategy => shared` to `asobi_lua_match_shared`
+instead of `asobi_lua_match`.
+
+## Consequences
+
+- Existing scripts (`{lua, Script}` with no `state_strategy`) keep
+  resolving to `asobi_lua_match` and the per-player path. Zero behaviour
+  change. Backward compatible.
+- Two bridge modules to maintain. Most callbacks are one-line delegates.
+  Acceptable cost for the explicitness — there is no runtime ambiguity
+  about which path a given match is on.
+- The selection point lives in one place (the Lua script declares its
+  intent). No silent fallback, no auto-detection guessing wrong.
+
+## Alternatives considered
+
+- **Always export `get_state/1` from `asobi_lua_match` and route the
+  per-player path through it by passing a sentinel** — would require
+  every existing Lua script to handle a sentinel argument. Breaks
+  backward compatibility silently.
+- **Auto-detect the script's `get_state` arity** — Lua doesn't expose
+  arity reliably (`debug.getinfo` is gated by the sandbox; even if
+  enabled, it returns `nparams` which is 0 for vararg functions and
+  unreliable across Lua/Luerl versions).
+- **A new optional `is_shared_state/1` callback in asobi_match** — would
+  let one bridge module expose both paths. Considered but rejected: adds
+  a callback to the public asobi behaviour for what is really an asobi_lua
+  internal concern. The two-bridge approach keeps the asobi API minimal.
+- **A config flag in `game_modes` instead of a Lua global** — would split
+  the source of truth. Putting the flag in the script keeps the script
+  self-describing: opening match.lua shows the broadcast strategy, no
+  need to cross-reference `sys.config`.
diff --git a/docs/adr/0002-skip-bounded-eval-for-handle-input.md b/docs/adr/0002-skip-bounded-eval-for-handle-input.md
@@ -0,0 +1,115 @@
+# ADR 0002: Skip bounded_eval for handle_input
+
+Date: 2026-05-05
+
+## Status
+
+Accepted.
+
+## Context
+
+Every Lua callback in `asobi_lua_match` and `asobi_lua_world` runs through
+`asobi_lua_loader:bounded_eval/2`: a `spawn_opt` with `monitor` and
+`max_heap_size: kill => true`, the function runs in the child, the parent
+receives the result via message-passing, and a wall-clock timeout kills
+the child if it overruns.
+
+This protects the gen_server from runaway scripts (infinite loops, heap
+blow-ups). It exists for good reasons documented in
+`guides/security-trust-model.md`.
+
+It also has a per-call cost. `spawn_opt` with `monitor` + `max_heap_size`
+is on the order of 30-50 µs even on idle hardware. Combined with the
+message round-trip and the demonitor on the way out, every Lua call
+pays roughly 40-80 µs of pure overhead before the actual Lua work starts.
+
+For `tick/1` the overhead is amortized over the tick's actual work
+(NPC simulation, world updates) — 80 µs is rounding error against a
+1-5 ms tick.
+
+For `handle_input/3` the picture is different. A WS-arriving player input
+is small: a position delta, an action enum, a few bytes of state to
+update. The actual Lua work is on the order of 50-100 µs. The bounded_eval
+overhead is comparable to the work itself — we're spending nearly half
+our time on safety machinery for a call whose worst-case work is
+intrinsically bounded by message size.
+
+Measurement at 200 players × 10 Hz = 2000 inputs/sec showed
+`asobi-bench-asobi-1` sitting at ~250% CPU. The local bench failed to
+demonstrate a clean win from the encode-once optimization (asobi ADR
+0001) because Luerl-spawn churn from per-input bounded_eval dominated
+the savings. See `asobi-bench/results/2026-05-05-post-fix1.md` for the
+numbers.
+
+## Decision
+
+Stop spawning a child process per `handle_input/3` call. Use
+`asobi_lua_loader:call/3` (which wraps `luerl:call_function/3` in an
+internal try/catch and returns `{ok, _, _} | {error, {lua_error, _} |
+{call_failed, _}}`) instead of `call/4` (which adds `bounded_eval` on
+top).
+
+The bridge code's existing `{error, _}` clause already logs a warning
+and returns `{ok, State}` for any Luerl error, so swapping `call/4` →
+`call/3` is a one-line change with no new error-handling needed. Bad
+input shapes still log and drop the input.
+
+Keep `bounded_eval` for `init/1`, `tick/1`, `get_state/{1,2}`,
+`vote_requested/1`, `vote_resolved/3`, and the world-side
+equivalents — those run script-author code in contexts where wall-clock
+or heap protection still matters.
+
+Apply the same change to `asobi_lua_match_shared:get_state/1`? No.
+`get_state` runs once per tick, not per input. Spawn cost is amortized.
+Leave it alone.
+
+Apply the same change to `asobi_lua_world:handle_input/3`? Yes — same
+reasoning. Per-input frequency, bounded work per call.
+
+## Consequences
+
+- Eliminates ~40-80 µs of overhead per WS input message. At 2000
+  inputs/sec the saving is on the order of 80-160 ms of CPU per second
+  freed up across the BEAM. Re-bench after the change to verify p99
+  improvement at 200 players.
+- A buggy or malicious `handle_input` can no longer be wall-clock-killed.
+  An infinite loop in handle_input now hangs the match server until the
+  caller's gen_server timeout (5s default) trips, then the match server
+  crashes and is restarted by the match supervisor. The blast radius is
+  one match, not the whole node.
+- A heap-allocating handle_input is no longer caught by `max_heap_size`
+  per call. The persistent Luerl state is held in the match server
+  process, which has its own heap and OS-level limits via VM args.
+  Pathological allocation in handle_input would grow the match server
+  process heap until OOM kill — same outcome as today, just no per-call
+  cap.
+- The trust model gets sharper: `tick/1` is the load-bearing isolation
+  point. `handle_input/3` is not a sandbox boundary; it's a hot path for
+  trusted-author scripts. Documented in `guides/security-trust-model.md`.
+- The "blast radius is one match" claim is asserted in the ADR but only
+  partially pinned by tests. The match-bridge and world-bridge contract
+  tests prove infinite-loop handle_input does not self-terminate; an
+  end-to-end CT suite that drives an input through the match supervisor,
+  asserts the gen_server times out, and asserts only the affected match
+  restarts is a follow-up.
+
+## Alternatives considered
+
+- **Pool of long-lived worker processes per match** — one or more workers
+  hold the Lua state, the match server messages them. Eliminates per-call
+  spawn AND keeps timeout protection. Rejected as a much larger change
+  with state-ownership complications (Luerl state mutates on every call;
+  shared ownership across processes needs message-passing of the state
+  itself, which is just as expensive as spawning).
+- **Batch inputs and run them in one Lua call per tick** — drain the
+  input queue at tick time, call a `handle_inputs(queue, state)` Lua
+  function that loops. Bigger API surface (new optional Lua function
+  shape), changes input ordering semantics (currently per-input
+  immediate; batched would be tick-quantized). Possibly a future
+  change but not for the first cut.
+- **Lower the `INPUT_TIMEOUT` to something tiny (e.g. 5 ms) so killed
+  workers cost less** — doesn't help; spawn overhead is the same
+  regardless of timeout, and 5 ms timeout would false-fire on slow
+  scripts that aren't actually broken.
+- **Add a Luerl-level instruction-count budget** — Luerl doesn't expose
+  one. Implementing it would mean modifying Luerl itself, out of scope.
diff --git a/guides/security-trust-model.md b/guides/security-trust-model.md
@@ -48,3 +48,42 @@ also requires the target module to be on an explicit allowlist
 via `application:get_env(asobi_lua, terrain_providers, ...)`) so a
 script that names an unrelated loaded module (`gen_server`, `rpc`,
 etc.) is rejected with a `terrain_provider_not_allowed` warning.
+
+## Per-callback isolation
+
+Most Lua callbacks run inside a child process spawned by
+`asobi_lua_loader:bounded_eval/2` with a wall-clock timeout and a
+`max_heap_size: kill => true`. A runaway loop or a runaway allocation
+in those callbacks crashes the child, the parent gen_server receives a
+`{error, timeout | heap_exhausted}` result, and the match continues.
+
+| Callback | Bridge | Bounded? | Budget |
+|---|---|---|---|
+| `init/1` | match, world | yes | 1000-2000 ms |
+| `tick/1`, `zone_tick/2` | match, world | yes | 500 ms |
+| `get_state/{1,2}` | match, world | yes | 100 ms |
+| `join/2`, `leave/2` | match, world | yes | 200 ms |
+| `vote_*` | match | yes | 200 ms |
+| `phases/1`, `on_phase_*/2` | world | yes | 200 ms |
+| `terrain_provider/1` | world | yes | 5000 ms |
+| **`handle_input/3`** | **match, world** | **NO** | **(see below)** |
+
+`handle_input/3` is the one callback that does **not** spawn-isolate.
+At realistic input rates (one tick × N players × the message rate)
+the per-call spawn cost dominated the actual Lua work (~30-50 µs spawn
++ monitor + heap-cap setup vs ~50-200 µs of input handling). Removing
+the wrapper recovered measured tail-latency wins of 35-45 % at 200
+players × 10 Hz input. See ADR 0002.
+
+The trade is explicit: a `while true do end` inside `handle_input` now
+hangs the match server until its caller's `gen_server:call/2` timeout
+trips (5 s default). The match supervisor then restarts the match
+process. Blast radius is one match.
+
+`handle_input/3` is therefore **not a sandbox boundary**. It is a hot
+path for trusted-author scripts. Audit the inputs your match script
+accepts and avoid pattern-matching dispatch on attacker-controlled
+strings; otherwise, treat the same as you would any Erlang gen_server
+handle_call/2 implementation. Per-tick safety remains owned by
+`tick/1`, which still spawn-isolates and is the right place to
+enforce wall-clock fairness across players.
diff --git a/src/lua/asobi_lua_match.erl b/src/lua/asobi_lua_match.erl
@@ -45,7 +45,6 @@ function vote_resolved(template, result, state) -- return updated state
 %% get the least because they run hundreds of times per second.
 -define(INIT_TIMEOUT, 1000).
 -define(TICK_TIMEOUT, 500).
--define(INPUT_TIMEOUT, 100).
 -define(JOIN_TIMEOUT, 200).
 -define(LEAVE_TIMEOUT, 200).
 -define(GET_STATE_TIMEOUT, 100).
@@ -126,7 +125,9 @@ leave(PlayerId, #{lua_state := LuaSt, game_state := GS} = State) ->
 -spec handle_input(binary(), map(), map()) -> {ok, map()}.
 handle_input(PlayerId, Input, #{lua_state := LuaSt, game_state := GS} = State) ->
     {EncInput, LuaSt1} = luerl:encode(Input, LuaSt),
-    case asobi_lua_loader:call(handle_input, [PlayerId, EncInput, GS], LuaSt1, ?INPUT_TIMEOUT) of
+    %% No bounded_eval: the per-input spawn dominates real Lua work at
+    %% high input rates. tick/1 still spawn-isolates. See ADR 0002.
+    case asobi_lua_loader:call(handle_input, [PlayerId, EncInput, GS], LuaSt1) of
         {ok, [GS1 | _], LuaSt2} ->
             {ok, State#{lua_state => LuaSt2, game_state => GS1}};
         {error, Reason} ->

diff --git a/src/lua/asobi_lua_world.erl b/src/lua/asobi_lua_world.erl
@@ -45,7 +45,6 @@ function on_zone_unloaded(cx, cy, state)     -- return state
 -define(INIT_TIMEOUT, 2000).
 -define(GENERATE_TIMEOUT, 5000).
 -define(TICK_TIMEOUT, 500).
--define(INPUT_TIMEOUT, 100).
 -define(JOIN_TIMEOUT, 200).
 -define(LEAVE_TIMEOUT, 200).
 -define(GET_STATE_TIMEOUT, 100).
@@ -188,9 +187,10 @@ handle_input(PlayerId, Input, Entities) ->
         #{lua_state := LuaSt} = ZoneState ->
             {EncInput, LuaSt1} = luerl:encode(Input, LuaSt),
             {EncEntities, LuaSt2} = luerl:encode(Entities, LuaSt1),
+            %% No bounded_eval: see ADR 0002.
             case
                 asobi_lua_loader:call(
-                    handle_input, [PlayerId, EncInput, EncEntities], LuaSt2, ?INPUT_TIMEOUT
+                    handle_input, [PlayerId, EncInput, EncEntities], LuaSt2
                 )
             of
                 {ok, [Ents1 | _], LuaSt3} ->