From 8fd6e2364fdc9f9b256a4c354dfd959fbd9b7621 Mon Sep 17 00:00:00 2001 From: Daniel Widgren Date: Tue, 5 May 2026 18:33:24 +0200 Subject: [PATCH] feat(lua): drop bounded_eval for handle_input + ADRs 0000/0001/0002 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit handle_input/3 in both asobi_lua_match and asobi_lua_world bridges no longer wraps the Luerl call in bounded_eval (spawn + monitor + heap_limit). At realistic input rates (200 players × 10 Hz = 2k inputs/sec) the per-call spawn overhead dominated actual Lua work and caused tail-latency stalls on the BEAM scheduler. Bench delta (asobi-bench, 200 bots, 30s, 10 Hz): - p99.9: ~2945ms -> ~1860ms (-37%) - max: ~3750ms -> ~2065ms (-45%) - inputs throughput: ~26k -> ~41k per 30s window (+56%) Trade-off documented in ADR 0002 and pinned by tests: - match_handle_input_no_wall_clock_timeout_test (match bridge) - world_handle_input_no_wall_clock_timeout_test (world bridge) - prop_lua_error_containment splits crash modes: tick still tests infinite_loop containment; input_crash_mode excludes it (would wedge the property runner — by design). Trust model updated in guides/security-trust-model.md with a new "Per-callback isolation" table and an explicit "handle_input is not a sandbox boundary" section. Also includes the project ADR convention (0000) and retroactive ADR 0001 documenting the asobi_lua_match_shared bridge that shipped in #41. --- .../adr/0000-record-architecture-decisions.md | 49 ++++++++ docs/adr/0001-shared-state-bridge-module.md | 75 ++++++++++++ ...0002-skip-bounded-eval-for-handle-input.md | 115 ++++++++++++++++++ guides/security-trust-model.md | 39 ++++++ src/lua/asobi_lua_match.erl | 5 +- src/lua/asobi_lua_world.erl | 4 +- test/asobi_lua_resource_limits_tests.erl | 80 +++++++++--- test/prop_lua_error_containment.erl | 20 +-- 8 files changed, 360 insertions(+), 27 deletions(-) create mode 100644 docs/adr/0000-record-architecture-decisions.md create mode 100644 docs/adr/0001-shared-state-bridge-module.md create mode 100644 docs/adr/0002-skip-bounded-eval-for-handle-input.md diff --git a/docs/adr/0000-record-architecture-decisions.md b/docs/adr/0000-record-architecture-decisions.md new file mode 100644 index 0000000..6ea7601 --- /dev/null +++ b/docs/adr/0000-record-architecture-decisions.md @@ -0,0 +1,49 @@ +# ADR 0000: Record architecture decisions + +Date: 2026-05-05 + +## Status + +Accepted. + +## Context + +asobi_lua hosts user-supplied Lua running inside Luerl. Decisions about +sandboxing, timeouts, hot-reload, bridge module shape, and which calls +get spawn-isolated have safety implications that are easy to forget and +expensive to revisit. We need a record. + +## Decision + +Record significant architecture decisions as numbered markdown files in +`docs/adr/`. One file per decision. Filename: `NNNN-short-slug.md`. + +Same template as the `asobi` repo (Michael Nygard ADR style): + +- **Title** — `ADR NNNN: short imperative phrase` +- **Date** — `YYYY-MM-DD` +- **Status** — `Proposed` | `Accepted` | `Superseded by ADR NNNN` | `Deprecated` +- **Context** — what's true now that motivates this decision +- **Decision** — the choice, in one or two short paragraphs +- **Consequences** — what this enables, what it costs +- **Alternatives considered** — options ruled out, with one-line rationale + +ADR-worthy things in asobi_lua: + +- Changes to which Luerl entry points are exposed or restricted +- Changes to the sandbox or timeout/heap budgets +- New behaviour-bridge modules or callback shapes +- Decisions that trade safety for performance + +Not ADR-worthy: bug fixes, renames, pure refactors. + +## Consequences + +- Future readers can recover the *why* behind sandbox + timeout choices. +- Forces articulating which alternative was rejected and on what grounds. + +## Alternatives considered + +- **Inline comments** — drift, fragmented, easy to miss. +- **Wiki / external doc** — separates from the code; ADRs in-repo travel + with the branch. diff --git a/docs/adr/0001-shared-state-bridge-module.md b/docs/adr/0001-shared-state-bridge-module.md new file mode 100644 index 0000000..f40a7c3 --- /dev/null +++ b/docs/adr/0001-shared-state-bridge-module.md @@ -0,0 +1,75 @@ +# ADR 0001: Shared-state Lua bridge as a separate module + +Date: 2026-05-05 + +## Status + +Accepted. Shipped in asobi_lua#41 alongside asobi#117. + +Retroactive ADR — written after merge. + +## Context + +The asobi behaviour gained an optional `get_state/1` callback for +shared-payload broadcasts (asobi ADR 0001). For pure Erlang games the +callback dispatch is a static export check: implement `get_state/1` and +the match server picks the shared path. + +For Lua games the dispatch is harder. The bridge module is always +`asobi_lua_match`; whether to use shared or per-player is a property of +the *user's Lua script*, not the bridge. Two complications: + +1. Erlang exports are static. If `asobi_lua_match` exports `get_state/1`, + the match server takes the shared path for *every* Lua match — including + existing scripts that defined `function get_state(player_id, state)`, + which would break (Lua binds the single arg to `player_id`, leaving + `state` nil → field access crash → empty payload to all players). +2. Lua functions don't have arity. `function get_state(player_id, state)` + called with one arg silently makes `state = nil`. We can't auto-detect + the script's intent. + +## Decision + +Ship a second bridge module `asobi_lua_match_shared` that: + +- Delegates `init/1`, `join/2`, `leave/2`, `handle_input/3`, `tick/1`, + `vote_*` directly to `asobi_lua_match` (no duplicated work). +- Implements its own `get_state/1` that calls the Lua script's + `get_state(state)` (one argument) and returns the shared map. +- Does NOT export `get_state/2`. + +Selection is opt-in via `state_strategy = "shared"` in the match script's +globals. `asobi_lua_config` reads this and adds `state_strategy => shared` +to the mode config map. `asobi_game_modes`/`asobi_matchmaker` then resolve +`{lua, Script}` + `state_strategy => shared` to `asobi_lua_match_shared` +instead of `asobi_lua_match`. + +## Consequences + +- Existing scripts (`{lua, Script}` with no `state_strategy`) keep + resolving to `asobi_lua_match` and the per-player path. Zero behaviour + change. Backward compatible. +- Two bridge modules to maintain. Most callbacks are one-line delegates. + Acceptable cost for the explicitness — there is no runtime ambiguity + about which path a given match is on. +- The selection point lives in one place (the Lua script declares its + intent). No silent fallback, no auto-detection guessing wrong. + +## Alternatives considered + +- **Always export `get_state/1` from `asobi_lua_match` and route the + per-player path through it by passing a sentinel** — would require + every existing Lua script to handle a sentinel argument. Breaks + backward compatibility silently. +- **Auto-detect the script's `get_state` arity** — Lua doesn't expose + arity reliably (`debug.getinfo` is gated by the sandbox; even if + enabled, it returns `nparams` which is 0 for vararg functions and + unreliable across Lua/Luerl versions). +- **A new optional `is_shared_state/1` callback in asobi_match** — would + let one bridge module expose both paths. Considered but rejected: adds + a callback to the public asobi behaviour for what is really an asobi_lua + internal concern. The two-bridge approach keeps the asobi API minimal. +- **A config flag in `game_modes` instead of a Lua global** — would split + the source of truth. Putting the flag in the script keeps the script + self-describing: opening match.lua shows the broadcast strategy, no + need to cross-reference `sys.config`. diff --git a/docs/adr/0002-skip-bounded-eval-for-handle-input.md b/docs/adr/0002-skip-bounded-eval-for-handle-input.md new file mode 100644 index 0000000..60d2d99 --- /dev/null +++ b/docs/adr/0002-skip-bounded-eval-for-handle-input.md @@ -0,0 +1,115 @@ +# ADR 0002: Skip bounded_eval for handle_input + +Date: 2026-05-05 + +## Status + +Accepted. + +## Context + +Every Lua callback in `asobi_lua_match` and `asobi_lua_world` runs through +`asobi_lua_loader:bounded_eval/2`: a `spawn_opt` with `monitor` and +`max_heap_size: kill => true`, the function runs in the child, the parent +receives the result via message-passing, and a wall-clock timeout kills +the child if it overruns. + +This protects the gen_server from runaway scripts (infinite loops, heap +blow-ups). It exists for good reasons documented in +`guides/security-trust-model.md`. + +It also has a per-call cost. `spawn_opt` with `monitor` + `max_heap_size` +is on the order of 30-50 µs even on idle hardware. Combined with the +message round-trip and the demonitor on the way out, every Lua call +pays roughly 40-80 µs of pure overhead before the actual Lua work starts. + +For `tick/1` the overhead is amortized over the tick's actual work +(NPC simulation, world updates) — 80 µs is rounding error against a +1-5 ms tick. + +For `handle_input/3` the picture is different. A WS-arriving player input +is small: a position delta, an action enum, a few bytes of state to +update. The actual Lua work is on the order of 50-100 µs. The bounded_eval +overhead is comparable to the work itself — we're spending nearly half +our time on safety machinery for a call whose worst-case work is +intrinsically bounded by message size. + +Measurement at 200 players × 10 Hz = 2000 inputs/sec showed +`asobi-bench-asobi-1` sitting at ~250% CPU. The local bench failed to +demonstrate a clean win from the encode-once optimization (asobi ADR +0001) because Luerl-spawn churn from per-input bounded_eval dominated +the savings. See `asobi-bench/results/2026-05-05-post-fix1.md` for the +numbers. + +## Decision + +Stop spawning a child process per `handle_input/3` call. Use +`asobi_lua_loader:call/3` (which wraps `luerl:call_function/3` in an +internal try/catch and returns `{ok, _, _} | {error, {lua_error, _} | +{call_failed, _}}`) instead of `call/4` (which adds `bounded_eval` on +top). + +The bridge code's existing `{error, _}` clause already logs a warning +and returns `{ok, State}` for any Luerl error, so swapping `call/4` → +`call/3` is a one-line change with no new error-handling needed. Bad +input shapes still log and drop the input. + +Keep `bounded_eval` for `init/1`, `tick/1`, `get_state/{1,2}`, +`vote_requested/1`, `vote_resolved/3`, and the world-side +equivalents — those run script-author code in contexts where wall-clock +or heap protection still matters. + +Apply the same change to `asobi_lua_match_shared:get_state/1`? No. +`get_state` runs once per tick, not per input. Spawn cost is amortized. +Leave it alone. + +Apply the same change to `asobi_lua_world:handle_input/3`? Yes — same +reasoning. Per-input frequency, bounded work per call. + +## Consequences + +- Eliminates ~40-80 µs of overhead per WS input message. At 2000 + inputs/sec the saving is on the order of 80-160 ms of CPU per second + freed up across the BEAM. Re-bench after the change to verify p99 + improvement at 200 players. +- A buggy or malicious `handle_input` can no longer be wall-clock-killed. + An infinite loop in handle_input now hangs the match server until the + caller's gen_server timeout (5s default) trips, then the match server + crashes and is restarted by the match supervisor. The blast radius is + one match, not the whole node. +- A heap-allocating handle_input is no longer caught by `max_heap_size` + per call. The persistent Luerl state is held in the match server + process, which has its own heap and OS-level limits via VM args. + Pathological allocation in handle_input would grow the match server + process heap until OOM kill — same outcome as today, just no per-call + cap. +- The trust model gets sharper: `tick/1` is the load-bearing isolation + point. `handle_input/3` is not a sandbox boundary; it's a hot path for + trusted-author scripts. Documented in `guides/security-trust-model.md`. +- The "blast radius is one match" claim is asserted in the ADR but only + partially pinned by tests. The match-bridge and world-bridge contract + tests prove infinite-loop handle_input does not self-terminate; an + end-to-end CT suite that drives an input through the match supervisor, + asserts the gen_server times out, and asserts only the affected match + restarts is a follow-up. + +## Alternatives considered + +- **Pool of long-lived worker processes per match** — one or more workers + hold the Lua state, the match server messages them. Eliminates per-call + spawn AND keeps timeout protection. Rejected as a much larger change + with state-ownership complications (Luerl state mutates on every call; + shared ownership across processes needs message-passing of the state + itself, which is just as expensive as spawning). +- **Batch inputs and run them in one Lua call per tick** — drain the + input queue at tick time, call a `handle_inputs(queue, state)` Lua + function that loops. Bigger API surface (new optional Lua function + shape), changes input ordering semantics (currently per-input + immediate; batched would be tick-quantized). Possibly a future + change but not for the first cut. +- **Lower the `INPUT_TIMEOUT` to something tiny (e.g. 5 ms) so killed + workers cost less** — doesn't help; spawn overhead is the same + regardless of timeout, and 5 ms timeout would false-fire on slow + scripts that aren't actually broken. +- **Add a Luerl-level instruction-count budget** — Luerl doesn't expose + one. Implementing it would mean modifying Luerl itself, out of scope. diff --git a/guides/security-trust-model.md b/guides/security-trust-model.md index 4b86cab..74e8cf7 100644 --- a/guides/security-trust-model.md +++ b/guides/security-trust-model.md @@ -48,3 +48,42 @@ also requires the target module to be on an explicit allowlist via `application:get_env(asobi_lua, terrain_providers, ...)`) so a script that names an unrelated loaded module (`gen_server`, `rpc`, etc.) is rejected with a `terrain_provider_not_allowed` warning. + +## Per-callback isolation + +Most Lua callbacks run inside a child process spawned by +`asobi_lua_loader:bounded_eval/2` with a wall-clock timeout and a +`max_heap_size: kill => true`. A runaway loop or a runaway allocation +in those callbacks crashes the child, the parent gen_server receives a +`{error, timeout | heap_exhausted}` result, and the match continues. + +| Callback | Bridge | Bounded? | Budget | +|---|---|---|---| +| `init/1` | match, world | yes | 1000-2000 ms | +| `tick/1`, `zone_tick/2` | match, world | yes | 500 ms | +| `get_state/{1,2}` | match, world | yes | 100 ms | +| `join/2`, `leave/2` | match, world | yes | 200 ms | +| `vote_*` | match | yes | 200 ms | +| `phases/1`, `on_phase_*/2` | world | yes | 200 ms | +| `terrain_provider/1` | world | yes | 5000 ms | +| **`handle_input/3`** | **match, world** | **NO** | **(see below)** | + +`handle_input/3` is the one callback that does **not** spawn-isolate. +At realistic input rates (one tick × N players × the message rate) +the per-call spawn cost dominated the actual Lua work (~30-50 µs spawn ++ monitor + heap-cap setup vs ~50-200 µs of input handling). Removing +the wrapper recovered measured tail-latency wins of 35-45 % at 200 +players × 10 Hz input. See ADR 0002. + +The trade is explicit: a `while true do end` inside `handle_input` now +hangs the match server until its caller's `gen_server:call/2` timeout +trips (5 s default). The match supervisor then restarts the match +process. Blast radius is one match. + +`handle_input/3` is therefore **not a sandbox boundary**. It is a hot +path for trusted-author scripts. Audit the inputs your match script +accepts and avoid pattern-matching dispatch on attacker-controlled +strings; otherwise, treat the same as you would any Erlang gen_server +handle_call/2 implementation. Per-tick safety remains owned by +`tick/1`, which still spawn-isolates and is the right place to +enforce wall-clock fairness across players. diff --git a/src/lua/asobi_lua_match.erl b/src/lua/asobi_lua_match.erl index 5dda43e..b0b025d 100644 --- a/src/lua/asobi_lua_match.erl +++ b/src/lua/asobi_lua_match.erl @@ -45,7 +45,6 @@ function vote_resolved(template, result, state) -- return updated state %% get the least because they run hundreds of times per second. -define(INIT_TIMEOUT, 1000). -define(TICK_TIMEOUT, 500). --define(INPUT_TIMEOUT, 100). -define(JOIN_TIMEOUT, 200). -define(LEAVE_TIMEOUT, 200). -define(GET_STATE_TIMEOUT, 100). @@ -126,7 +125,9 @@ leave(PlayerId, #{lua_state := LuaSt, game_state := GS} = State) -> -spec handle_input(binary(), map(), map()) -> {ok, map()}. handle_input(PlayerId, Input, #{lua_state := LuaSt, game_state := GS} = State) -> {EncInput, LuaSt1} = luerl:encode(Input, LuaSt), - case asobi_lua_loader:call(handle_input, [PlayerId, EncInput, GS], LuaSt1, ?INPUT_TIMEOUT) of + %% No bounded_eval: the per-input spawn dominates real Lua work at + %% high input rates. tick/1 still spawn-isolates. See ADR 0002. + case asobi_lua_loader:call(handle_input, [PlayerId, EncInput, GS], LuaSt1) of {ok, [GS1 | _], LuaSt2} -> {ok, State#{lua_state => LuaSt2, game_state => GS1}}; {error, Reason} -> diff --git a/src/lua/asobi_lua_world.erl b/src/lua/asobi_lua_world.erl index 5c790a5..b0c4b91 100644 --- a/src/lua/asobi_lua_world.erl +++ b/src/lua/asobi_lua_world.erl @@ -45,7 +45,6 @@ function on_zone_unloaded(cx, cy, state) -- return state -define(INIT_TIMEOUT, 2000). -define(GENERATE_TIMEOUT, 5000). -define(TICK_TIMEOUT, 500). --define(INPUT_TIMEOUT, 100). -define(JOIN_TIMEOUT, 200). -define(LEAVE_TIMEOUT, 200). -define(GET_STATE_TIMEOUT, 100). @@ -188,9 +187,10 @@ handle_input(PlayerId, Input, Entities) -> #{lua_state := LuaSt} = ZoneState -> {EncInput, LuaSt1} = luerl:encode(Input, LuaSt), {EncEntities, LuaSt2} = luerl:encode(Entities, LuaSt1), + %% No bounded_eval: see ADR 0002. case asobi_lua_loader:call( - handle_input, [PlayerId, EncInput, EncEntities], LuaSt2, ?INPUT_TIMEOUT + handle_input, [PlayerId, EncInput, EncEntities], LuaSt2 ) of {ok, [Ents1 | _], LuaSt3} -> diff --git a/test/asobi_lua_resource_limits_tests.erl b/test/asobi_lua_resource_limits_tests.erl index 65d987d..c044cfb 100644 --- a/test/asobi_lua_resource_limits_tests.erl +++ b/test/asobi_lua_resource_limits_tests.erl @@ -1,11 +1,12 @@ -module(asobi_lua_resource_limits_tests). -include_lib("eunit/include/eunit.hrl"). -%% Resource-limit tests: every Lua callback runs in a child process -%% with a wall-clock budget so a runaway script can't wedge the -%% calling gen_server. We assert that wedging callbacks (infinite -%% loops, deep recursion, huge allocations) terminate within a known -%% bound and the parent stays responsive. +%% Resource-limit tests: every Lua callback EXCEPT handle_input runs in +%% a child process with a wall-clock budget. handle_input is hot-path +%% (2k+ inputs/sec at scale) and the spawn cost dominated real work; per +%% ADR 0002 it now runs directly. tick/init/get_state/join/leave still +%% spawn-isolate. We assert that the wrapped callbacks terminate within +%% a known bound, AND that handle_input does NOT (intentionally). -spec fixture(string()) -> file:filename_all(). fixture(Name) -> @@ -85,7 +86,52 @@ match_init_timeout_test() -> end, ?assertNotEqual({timeout_blocked}, Result). -match_handle_input_timeout_test() -> +world_handle_input_no_wall_clock_timeout_test() -> + %% Same contract as the match bridge — see ADR 0002. World's + %% handle_input runs directly in the calling process; an + %% infinite-loop input does not self-terminate. + Path = temp_script( + ~""" + match_size = 1 + game_type = "world" + function init(_) return {} end + function generate_world(_, _) return { ["0,0"] = {} } end + function on_zone_loaded(_, _, s) return s, {} end + function zone_tick(ents, s) return ents, s end + function handle_input(_, _, _) while true do end end + """ + ), + Config = #{game_config => #{lua_script => Path}}, + {ok, ZoneStates} = asobi_lua_world:generate_world(0, Config), + Zone0 = maps:get({0, 0}, ZoneStates), + Self = self(), + %% Process dictionary doesn't inherit across spawns — set it inside + %% the child so handle_input enters the Lua call path. + {Pid, Ref} = spawn_monitor(fun() -> + erlang:put({asobi_lua_world, zone_state}, Zone0), + Result = asobi_lua_world:handle_input(~"p1", #{~"x" => 1}, #{}), + Self ! {result, Result} + end), + receive + {result, _} -> + ?assert(false, "world handle_input must not self-terminate; ADR 0002") + after 500 -> + ?assert(is_process_alive(Pid)), + exit(Pid, kill), + receive + {'DOWN', Ref, process, Pid, _} -> ok + after 1000 -> + ok + end + end. + +match_handle_input_no_wall_clock_timeout_test() -> + %% Per ADR 0002, handle_input intentionally has NO wall-clock budget. + %% spawn-overhead at 2k inputs/sec dominated real Lua work, so the + %% input path runs directly in the calling process. The trade is + %% explicit: a runaway handle_input now hangs the match server until + %% its gen_server timeout trips. This test pins the new contract: + %% an infinite-loop handle_input does NOT return on its own. Path = temp_script( ~""" match_size = 1 @@ -99,20 +145,22 @@ match_handle_input_timeout_test() -> ), {ok, S0} = asobi_lua_match:init(#{lua_script => Path}), Self = self(), - Pid = spawn(fun() -> + {Pid, Ref} = spawn_monitor(fun() -> Result = asobi_lua_match:handle_input(~"p1", #{~"x" => 1}, S0), Self ! {result, Result} end), - Result = + receive + {result, _} -> + ?assert(false, "handle_input must not self-terminate; ADR 0002 removed bounded_eval") + after 500 -> + ?assert(is_process_alive(Pid)), + exit(Pid, kill), receive - {result, R} -> R - after 3000 -> - exit(Pid, kill), - timeout_blocked - end, - ?assertNotEqual(timeout_blocked, Result), - %% bridge swallows the timeout error and returns {ok, State} - ?assertMatch({ok, _}, Result). + {'DOWN', Ref, process, Pid, _} -> ok + after 1000 -> + ok + end + end. match_get_state_timeout_test() -> Path = temp_script( diff --git a/test/prop_lua_error_containment.erl b/test/prop_lua_error_containment.erl index c1b9c7e..34606ae 100644 --- a/test/prop_lua_error_containment.erl +++ b/test/prop_lua_error_containment.erl @@ -41,19 +41,25 @@ plan() -> op() -> proper_types:oneof([ {tick}, - {tick_crash, crash_mode()}, - {input, player(), proper_types:oneof([clean, crash_mode()])} + {tick_crash, tick_crash_mode()}, + {input, player(), proper_types:oneof([clean, input_crash_mode()])} ]). -crash_mode() -> - %% `infinite_loop` exercises the bridge's per-callback timeout - %% wrapping (zone_tick/handle_input both run under TICK_TIMEOUT / - %% INPUT_TIMEOUT). Without this entry, the fixture's infinite-loop - %% branch was unreachable from the property — see error_world.lua:23. +%% zone_tick still runs under bounded_eval (TICK_TIMEOUT) so infinite_loop +%% is contained. +tick_crash_mode() -> proper_types:elements([ ~"error", ~"type_error", ~"arith_error", ~"stack_overflow", ~"infinite_loop" ]). +%% handle_input intentionally has no wall-clock budget per ADR 0002, so +%% `infinite_loop` is excluded from the input crash modes — feeding it +%% would actually wedge the property runner. +input_crash_mode() -> + proper_types:elements([ + ~"error", ~"type_error", ~"arith_error", ~"stack_overflow" + ]). + player() -> proper_types:elements([~"e1", ~"e2", ~"e3"]).