From 32bea37378f43875f93cd30c925222bdc15b6291 Mon Sep 17 00:00:00 2001 From: Jeremy lewi Date: Wed, 22 Apr 2026 07:05:11 -0700 Subject: [PATCH 1/6] docs: update minimal evals design guidance Signed-off-by: Jeremy lewi --- docs-dev/AGENTS.md | 8 + docs-dev/design/20260421_minimal_evals.md | 659 ++++++++++++++-------- 2 files changed, 432 insertions(+), 235 deletions(-) create mode 100644 docs-dev/AGENTS.md diff --git a/docs-dev/AGENTS.md b/docs-dev/AGENTS.md new file mode 100644 index 0000000..306c2ee --- /dev/null +++ b/docs-dev/AGENTS.md @@ -0,0 +1,8 @@ +# Agents.md + +## Design docs + +- Design docs in `docs-dev/` should read as standalone documents. Do not write them as if the reader saw the prompting conversation. +- Do not title sections around user-asked questions unless the document itself explicitly introduces those questions. Prefer standalone titles like `How To Invoke Prompts And Measure Responses` over titles like `Question 2: How To Invoke Prompts And Measure Responses`. +- Avoid references like "as discussed above", "as asked", or other wording that assumes the reader knows the chat history. State the problem, recommendation, and rationale directly in the doc. +- Write in present-tense design language unless the historical sequencing is itself relevant. diff --git a/docs-dev/design/20260421_minimal_evals.md b/docs-dev/design/20260421_minimal_evals.md index 90c1223..a61aab0 100644 --- a/docs-dev/design/20260421_minimal_evals.md +++ b/docs-dev/design/20260421_minimal_evals.md @@ -2,7 +2,12 @@ ## Status -Draft proposal. +Current proposal. + +The core runtime seam exists. The remaining work is: + +- add a small browser-side eval entrypoint +- add a Node/TS driver that launches a real browser and calls that entrypoint ## Summary @@ -15,17 +20,20 @@ We want a minimal eval harness for `codex-wasm` that can: The recommended design is: -1. run evals in a real headless browser runtime -2. avoid using the DOM as the test contract -3. 
reuse the same runtime libraries the UI already uses -4. drive those libraries from a TypeScript CLI using Playwright or CDP +1. run evals in headless Chromium +2. use the same browser runtime services the app already uses +3. drive the runtime through `HarnessRuntimeManager` and + `HarnessChatKitAdapter`, not through DOM automation +4. prefer `HarnessChatKitAdapter.streamUserMessage(...)` as the prompt seam 5. assert primarily against: - - app-server requests and notifications - notebook state snapshots - - assistant output + - assistant output / emitted ChatKit events + - app-server requests and notifications + - wasm event journal rows + - OPFS contents when agentic search materializes source trees locally -This keeps the eval environment faithful to the browser-only `codex-wasm` -design while avoiding brittle UI automation. +The DOM should be incidental. We need a real browser runtime, but not a +browser-clicking test contract. ## Context @@ -34,149 +42,217 @@ The newer `codex-wasm` direction is defined in the design docs: - `20260415_agentic_search.md` - `20260417_codex_wasm_appserver.md` -Agentic search and other capabilities depend on various parameters +Agentic search quality depends on: + - prompts - docs -- utility libraries +- search behavior +- local browser storage state +- helper/runtime libraries -We need evals to systematically measure and improve these parameters. +We need evals to measure those pieces systematically. 
## Background: How The UI Works Today -On the latest `main`, the UI now has two Codex-backed harnesses: +The app has three harnesses: + +- `codex`: remote app-server over websocket plus Codex tool bridge +- `codex-wasm`: browser-local wasm app-server in a dedicated Web Worker +- `responses-direct`: direct browser-side Responses API integration + +The UI now has a cleaner split: -- `codex`: remote app-server over websocket -- `codex-wasm`: browser-local wasm app-server running behind the same higher- - level client/controller flow +- `ChatKitPanel` owns page-state wiring and `useChatKit(...)` +- `HarnessRuntimeManager` owns runtime instance caching +- concrete `HarnessRuntime` classes own start/stop lifecycle +- `HarnessChatKitAdapter` is the harness-facing request surface +- `createChatKitFetchFromAdapter(...)` is only a compatibility shim for + ChatKit's `fetch` requirement -Both go through the same high-level path: +The current flow is: ```text -ChatKit React UI - -> authorized fetch wrapper - -> createCodexChatkitFetch() - -> CodexConversationController - -> CodexAppServerClient - -> transport = proxy or wasm - -> app-server notifications - -> CodexConversationController - -> ChatKit SSE events - -> ChatKit React UI +ChatKitPanel + -> build page-scoped runtime inputs + - codeModeExecutor + - codexBridgeHandler (proxy only) + - auth resolver (proxy only) + -> HarnessRuntimeManager.getOrCreate(...) + -> runtime.start() + -> runtime.createChatKitAdapter() + -> createChatKitFetchFromAdapter(...) + -> useChatKit(...) ``` -More concretely: - -1. `ChatKitPanel` creates an authorized fetch wrapper. -2. That wrapper uses `createCodexChatkitFetch()` as `baseFetch` for the - `codex` and `codex-wasm` adapters. -3. `createCodexChatkitFetch()` parses ChatKit-shaped request bodies such as - `threads.list`, `threads.get`, and message-send requests. -4. For message-send requests, it calls - `getCodexConversationController().streamUserMessage(...)`. -5. 
`ChatKitPanel` bootstraps the transport through `CodexAppServerClient`: - - for `codex`, it calls `connectProxy(...)` - - for `codex-wasm`, it calls `connectWasm(...)` - - for `codex-wasm`, it also installs the browser code executor bridge -6. `CodexConversationController`: - - ensures or creates the active thread - - subscribes to app-server notifications through - `CodexAppServerClient.subscribeNotifications(...)` - - sends `thread/start` and `turn/start` through the unified app-server - client - - converts app-server notifications into ChatKit stream events -7. `createCodexChatkitFetch()` returns those ChatKit stream events as an SSE - `Response`. - -So the real control plane is already below ChatKit. ChatKit mostly issues -`fetch` requests; the fetch shim, conversation controller, and app-server -client do the substantive work. +For eval design, the important point is that the runtime seam is now explicit +and reusable below the React/DOM layer. + +## Background: Per-Harness Initialization + +### `codex` via proxy + +`CodexProxyHarnessRuntime.start()` currently does: + +1. optionally select the current Codex project +2. `getCodexAppServerClient().useTransport("proxy")` +3. install the proxy authorization resolver +4. `connectProxy(...)` +5. configure the `CodexToolBridge` +6. `refreshHistory()` +7. `ensureActiveThread()` + +Prompt execution then flows through: + +- `HarnessChatKitAdapter.streamUserMessage(...)` +- `CodexConversationController.streamUserMessage(...)` +- `CodexAppServerClient.sendRequest(...)` +- proxy app-server websocket notifications + +### `codex-wasm` + +`CodexWasmHarnessRuntime.start()` currently does: + +1. optionally select the current Codex project +2. wrap the page's `codeModeExecutor` with `createCodexWasmCodeExecutor(...)` +3. `getCodexAppServerClient().useTransport("wasm")` +4. `connectWasm({ apiKey, sessionOptions })` +5. clear any proxy bridge state +6. `refreshHistory()` +7. 
`ensureActiveThread()` + +Prompt execution still flows through the same: + +- `HarnessChatKitAdapter` +- `CodexConversationController` +- `CodexAppServerClient` + +The only transport difference is that the selected app-server backend is now +the browser-local wasm worker. + +### `responses-direct` + +`responses-direct` is now thinner: + +- it creates a `ResponsesDirectChatKitAdapter` +- it does not require a heavy runtime `start()` +- tool execution is handled inside the harness, not by ChatKit + +This matters because the eval seam should follow the harness boundary, not old +ChatKit-specific tool callback flows. ## Goals -- Exercise the real `codex-wasm` runtime in a browser-faithful environment. -- Submit prompts and collect results from JS/TS, not from the UI. -- Support behavior assertions such as "did the agent use notebook mutation?" -- Support outcome assertions such as "did the notebook gain a cell containing +- exercise the real `codex-wasm` runtime in a browser-faithful environment +- submit prompts and collect results from JS/TS, not by clicking UI controls +- support behavior assertions such as "did the agent search/fetch source?" +- support outcome assertions such as "did the notebook gain a cell containing `hello world`?" -- Keep the first version small enough to implement quickly. +- keep the first version small enough to implement quickly ## Non-Goals -- Do not make DOM rendering the primary contract. -- Do not require pixel/UI automation for pass/fail. -- Do not build a full general-purpose benchmark runner up front. -- Do not block on a bespoke search SDK; evals should work with the low-level - agentic-search model described in the design docs. 
+- do not make DOM rendering the primary contract +- do not require pixel/UI automation for pass/fail +- do not build a full general-purpose benchmark runner up front +- do not build a second parallel prompt runtime -## Question 1: Runtime Environment +## Runtime Environment ### Recommendation -Use headless Chromium as the runtime environment, launched from a Node/TS -driver. +Use headless Chromium launched from a Node/TS driver. -Playwright is the simplest way to do this from TypeScript, though a raw CDP -client would also work. +Playwright is the simplest default. Raw CDP would also work. ### Why Chromium -`codex-wasm` and the proposed agentic-search flow rely on browser features that -do not exist or are not trustworthy in pure Node-based test environments: +`codex-wasm` depends on browser features that are not trustworthy in pure +Node-based test environments: - Web Workers - WebAssembly in a browser worker - IndexedDB -- OPFS / browser-persistent storage +- OPFS - browser `fetch` -- the existing Runme browser AppKernel integration +- the existing browser AppKernel integration -`jsdom` or a pure Vitest node environment will not be faithful enough. +`jsdom` or a pure Vitest node environment is not faithful enough. ### Why Headless Browser Instead Of DOM Automation -We need browser capabilities, not UI fidelity. +We need browser capabilities, not a UI-driven contract. -The right distinction is: +The distinction should be: - `yes` to a real browser runtime -- `no` to using visible DOM elements as the control plane +- `no` to clicking the ChatKit composer and scraping rendered bubbles -The eval driver should call existing runtime interfaces with -`page.evaluate(...)` or CDP runtime evaluation, not click the ChatKit composer -and read rendered bubbles. +The eval driver should call runtime APIs with `page.evaluate(...)`, not drive +React components. -### Chrome vs Something Else +## How To Invoke Prompts And Measure Responses -For v0, Chromium should be the default. 
+### Recommendation -Reasons: +Prefer the new harness/runtime seam: -- it is the best-supported headless browser for modern storage/runtime APIs -- it matches the environment we are already implicitly designing for -- Playwright support is straightforward -- CDP access is first-class if we need lower-level debugging later +- `HarnessRuntimeManager` +- concrete `HarnessRuntime` +- `HarnessChatKitAdapter` -Firefox or WebKit can be follow-up compatibility targets, not the initial eval -runtime. +For `codex-wasm`, the recommended flow is: -## Question 2: How To Invoke Prompts And Measure Responses +1. create/start the same runtime the UI uses +2. get the runtime's `HarnessChatKitAdapter` +3. call `adapter.streamUserMessage(...)` +4. collect emitted ChatKit events into an array +5. inspect notebook state, app-server traces, wasm journal, and OPFS -### Recommendation +This is better than driving the DOM, and it is more aligned with the current +code than calling legacy fetch shims directly. + +### Preferred Eval Seam + +The preferred browser-side eval seam is: + +```text +HarnessRuntimeManager + -> CodexWasmHarnessRuntime + -> HarnessChatKitAdapter.streamUserMessage(...) + -> CodexConversationController + -> CodexAppServerClient (transport = wasm) + -> CodexWasmAppServerClient + -> worker/app-server +``` + +### Optional Higher-Level Seam + +If we want exact ChatKit request parity, we can still use: -Use the same libraries the UI already uses and bypass ChatKit itself. +- `createChatKitFetchFromAdapter(...)` -The recommended eval seam is: +and send it the same request JSON that ChatKit sends. 
+ +That is useful for verifying: + +- ChatKit request parsing +- JSON/SSE response formatting +- abort behavior -- `CodexConversationController` for prompt execution and ChatKit-event emission -- `CodexAppServerClient` for raw app-server request/notification flow across - both `proxy` and `wasm` transport +But for most evals, `HarnessChatKitAdapter.streamUserMessage(...)` is the +better seam. -If we want exact ChatKit request parity, an alternative is to call -`createCodexChatkitFetch()` directly with the same JSON bodies ChatKit would -send. But for most evals, the better seam is one layer lower: -`streamUserMessage(...)`. +### Optional Lower-Level Codex-Specific Seams -Do not make the eval script talk to React components or DOM nodes directly. +For Codex-specific debugging, we can also call: + +- `CodexConversationController.streamUserMessage(...)` +- `CodexAppServerClient.sendRequest(...)` +- `CodexAppServerClient.subscribeNotifications(...)` +- `getCodexWasmAppServerClient().getEventJournal()` + +Those are useful when we want lower-level traces, but they are more +transport-specific than the harness adapter seam. ## Proposed Architecture @@ -184,94 +260,117 @@ Do not make the eval script talk to React components or DOM nodes directly. TS CLI -> Playwright Chromium -> page.evaluate(...) - -> CodexConversationController or createCodexChatkitFetch() - -> CodexAppServerClient - -> transport = wasm - -> AppKernel / notebook runtime / OPFS + -> tiny browser eval entrypoint + -> HarnessRuntimeManager.getOrCreate(...) + -> runtime.start() + -> runtime.createChatKitAdapter() + -> adapter.streamUserMessage(...) + -> notebook/AppKernel/OPFS + app-server traces + wasm journal ``` The browser page is the runtime host. -The controller/fetch shim is the contract. +The harness runtime + adapter is the main contract. The DOM is incidental. ## Recommended Eval Layers -### 1. Preferred: Controller-Level Evals +### 1. 
Preferred: Adapter-Level Evals -Call the same controller methods the fetch shim uses: +Call the same adapter methods the UI now routes into: -- `ensureActiveThread()` -- `streamUserMessage(prompt, chatkitState, sink)` -- `refreshHistory()` +- `listThreads()` - `getThread(threadId)` -- `handleListItems(threadId)` +- `listItems(threadId)` +- `streamUserMessage({ input, threadId?, model? }, sink)` This gives us: -- the same app-server methods as the UI -- the same app-server notification subscription path as the UI -- the same ChatKit stream event derivation as the UI -- no dependency on the ChatKit React widget or DOM +- the same runtime startup path as the UI +- the same harness-specific prompt path as the UI +- no dependency on React or DOM state as the control plane -This is the best seam for prompt evals because it reuses the real runtime logic -without forcing the test to construct full ChatKit HTTP payloads. +This should be the default minimal eval seam. ### 2. Optional: Fetch-Shim Evals -If we want exact parity with the ChatKit request contract, call -`createCodexChatkitFetch()` and send it the same request JSON that ChatKit -would send. +Call `createChatKitFetchFromAdapter(...)` with the adapter returned by the +runtime. -This is useful when we specifically want to validate: +This is only needed if we specifically want to validate: -- ChatKit request parsing -- ChatKit state injection +- ChatKit payload parsing - SSE response formatting +- fetch/abort behavior -But it is a slightly higher-level seam than most prompt evals need. +### 3. Low-Level Observation -### 3. 
Raw Proxy Observation +For assertions on behavior, also collect: -For assertions on lower-level behavior, also use: +- app-server requests sent through `CodexAppServerClient` +- app-server notifications +- wasm journal rows via `getCodexWasmAppServerClient().getEventJournal()` +- OPFS state, either directly or via the app's OPFS helpers -- `CodexAppServerClient.sendRequest(...)` -- `CodexAppServerClient.subscribeNotifications(...)` +## Assertion Model -This is the right place to observe: +Each eval should produce a structured result object. Assertions should run in +Node against that object. -- `thread/start` -- `turn/start` -- `turn/interrupt` -- streamed notifications +## Timing Metrics -For `codex-wasm`, the browser-local worker/journal path is already present on -`main`, so evals can also query the wasm event journal through -`CodexWasmAppServerClient.getEventJournal()` when transport-specific inspection -is useful. +Minimal evals should record at least these timing metrics for every turn. -## Assertion Model +### Time To First Message (TTFM) -Each eval should produce a structured result object. Assertions should run in -Node against that object. +`TTFM` is the elapsed time from prompt submission until the first assistant +message content is emitted. + +This is the first visible sign of progress to the human, so it is the main +"how long did the user wait before seeing a response?" metric. + +For implementation purposes, this should usually be measured from: + +- start: when the eval submits the prompt to the harness adapter +- end: the first assistant message delta or first assistant message item added + +### Turn Time + +`TurnTime` is the elapsed time from prompt submission until the turn fully +completes. + +This is the total end-to-end latency for the turn, including any search, +tooling, notebook mutation, and final assistant response. 
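Both metrics reduce to timestamp deltas over the captured event stream. A
minimal sketch, assuming simplified event names (`assistant.delta`,
`turn.completed`) rather than the real app-server notification types:

```typescript
// Sketch only: event names and shapes here are assumptions for
// illustration, not the real app-server notification types.
type CapturedEvent = { type: string; atMs: number };

type TurnTimings = { ttfmMs: number | null; turnTimeMs: number | null };

// Computes TTFM and TurnTime from the prompt-submission timestamp and the
// ordered events captured during the turn. Missing events yield null so
// incomplete turns are still reportable.
function computeTurnTimings(
  submittedAtMs: number,
  events: CapturedEvent[],
): TurnTimings {
  const firstAssistant = events.find((e) => e.type === "assistant.delta");
  const turnComplete = events.find((e) => e.type === "turn.completed");
  return {
    ttfmMs: firstAssistant ? firstAssistant.atMs - submittedAtMs : null,
    turnTimeMs: turnComplete ? turnComplete.atMs - submittedAtMs : null,
  };
}
```

The eval result object can then carry both numbers alongside the raw event
list, so assertions and reporting read from the same capture.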
-### Assertion Types +For implementation purposes, this should usually be measured from: -#### Tool / behavior assertions +- start: when the eval submits the prompt to the harness adapter +- end: the terminal turn-complete event + +### Why Both Metrics Matter + +- `TTFM` measures perceived responsiveness +- `TurnTime` measures total completion latency + +An eval can have a good `TTFM` but a poor `TurnTime` if it responds quickly +and then spends a long time finishing the turn. We want to track both. + +### Tool / behavior assertions Examples: -- observed a code-executor bridge request -- observed notebook mutation activity -- observed search-related `net.get(...)` access +- observed `turn/start` +- observed wasm journal rows for the turn +- observed code execution / notebook mutation behavior +- observed search-related network or file activity - observed OPFS writes under `/code/runmedev/web` - observed the Runme repo cache materialized in OPFS under `/code/runmedev/web` -These should come from journal entries, bridge payloads, and captured tool -outputs. +These should come from request logs, notifications, journal rows, and storage +inspection. -#### Result assertions +### Result assertions Examples: @@ -279,7 +378,8 @@ Examples: - notebook contains a new code cell - inserted cell source contains `hello world` -These should come from notebook snapshots and assistant output, not the DOM. +These should come from notebook snapshots and assistant output, not from the +rendered DOM. ### Example Eval Shape @@ -287,94 +387,195 @@ These should come from notebook snapshots and assistant output, not the DOM. 
await runEval({ name: "adds hello world cell", prompt: 'Add a cell to print "hello world".', - assert(result) { - expect(result.appServerRequests).toContainEntryMatching((entry) => - entry.method === "turn/start" + async assert(result) { + expect(result.appServerRequests).toContainEntryMatching( + (entry) => entry.method === "turn/start", ); - expect(result.notebook).toContainCellMatching((cell) => - typeof cell.value === "string" && cell.value.includes("hello world") + expect(result.notebook.cells).toContainEqual( + expect.objectContaining({ + value: expect.stringContaining("hello world"), + }), ); }, }); ``` -## Minimal Refactor Needed +## Current Runtime Boundary -### Recommendation +The current code already provides: + +- `HarnessRuntimeManager` +- `CodexProxyHarnessRuntime` +- `CodexWasmHarnessRuntime` +- `HarnessChatKitAdapter` +- `createChatKitFetchFromAdapter(...)` +- harness-owned tool handling + +For evals, this means: -Yes, we should do a small refactor. +- we do **not** need to invent a new bootstrap layer +- we do **not** need a new Codex runtime API +- we should build on the harness/runtime seam that now exists -The refactor is not about moving logic into the DOM. It is about making the -existing runtime-facing pieces callable without React. +## Remaining Additions For Minimal Evals -### What To Extract +### 1. Add a tiny browser-side eval entrypoint -Do not create a parallel prompt runtime. +We still need a small browser-side helper, likely test-only, that exposes the +runtime seam to Playwright/CDP. -Instead, extract only the minimum needed to let a browser-driven script call -the existing runtime services cleanly outside React. 
+That helper should do only this: -The required refactor is: +- construct the same runtime inputs `ChatKitPanel` constructs + - `codeModeExecutor` + - `codexBridgeHandler` for proxy mode + - auth resolver for proxy mode +- call `HarnessRuntimeManager.getOrCreate(...)` +- `start()` the runtime +- `streamUserMessage(...)` +- collect emitted events +- return notebook / trace / journal / OPFS snapshots -1. Extract the Codex harness bootstrap logic currently embedded in - `ChatKitPanel` into a reusable helper. - This helper should own: - - `controller.setSelectedProject(...)` - - `proxy.setCodeExecutor(...)` for `codex-wasm` - - `proxy.useTransport(...)` - - `proxy.connectProxy(...)` or `proxy.connectWasm(...)` - - `controller.refreshHistory()` - - `controller.ensureActiveThread()` - - cleanup via `proxy.disconnect()`, - `proxy.setAuthorizationResolver(null)`, and - `proxy.setCodeExecutor(null)` +This can be a small `window.__runmeEval` bridge or a test-only imported module. -2. Add a small helper around - `CodexConversationController.streamUserMessage(...)` that collects emitted - ChatKit events into an array and returns them with the next ChatKit state. - This is a convenience wrapper, not a new runtime. +### 2. Extract the page-scoped `codeModeExecutor` builder into a reusable helper -3. Add a thin browser-driver entrypoint so a Playwright/CDP script can call the - existing singletons without mounting or controlling the ChatKit UI. - That entrypoint should forward to the existing services and expose only: - - bootstrap/configure runtime - - run prompt - - inspect notebook state - - inspect wasm journal / app-server traces +This is the recommended approach. -4. Optionally extract a helper for building the wasm code executor from current - app state. - Today `ChatKitPanel` creates `codeModeExecutor` from notebook/UI state and - then wraps it with `createCodexWasmCodeExecutor(...)`. - If the eval driver reuses the same page/app state, this can stay mostly as - is. 
- If not, we should expose a helper that builds the same executor without - depending on the full ChatKit component tree. +The main logic that still lives in `ChatKitPanel` is the page-state wiring for: -If we need a browser-global helper at all, it should be a tiny adapter over the -existing controller/proxy methods, not a new runtime layer. +- `resolveCodeModeNotebook(...)` +- `listNotebooks(...)` +- renderer/notebook update hooks -### Why This Refactor Is Worth It +For notebook-mutation evals, we should extract that logic into a reusable +helper rather than reconstruct it separately inside an eval driver. -- cleaner separation between runtime and presentation -- stable test contract -- easier replay/debugging -- reusable for scripted demos and future benchmarking +#### Why extraction is the right approach -## Implementation Plan +The alternatives are: -### Phase 0: Build The Eval Contract +1. rebuild notebook resolution logic separately in the eval helper +2. keep the logic embedded in `ChatKitPanel` and somehow reach into component + state from tests +3. extract a reusable builder and have both the UI and eval path call it + +The third option is the cleanest because: + +- the UI and evals will use the same notebook resolution rules +- notebook mutations will go through the same renderer/model update path +- `codex-wasm` and `responses-direct` will see the same `ExecuteCode` + environment in normal UI use and in evals +- future notebook-related fixes only need to be made in one place + +This is not a new runtime layer. It is just moving page-scoped wiring into a +shared helper. 
+ +#### What should be extracted + +The helper should own: + +- resolving a notebook from: + - explicit URI + - explicit handle + - current visible notebook fallback +- enumerating open notebooks plus the current notebook +- applying notebook mutations through the same `NotebookData` model objects +- forwarding cell updates to the current renderer set + +That means the helper should encapsulate the current `ChatKitPanel` logic for: + +- `resolveCodeModeNotebook(...)` +- `listNotebooks(...)` +- `renderer.onCellUpdate(...)` fanout before `data.updateCell(...)` + +#### Recommended helper shape + +Something like: + +```ts +type BuildPageCodeModeExecutorOptions = { + getNotebookData: (uri: string) => NotebookDataLike | null; + getOpenNotebookUris: () => string[]; + getCurrentDocUri: () => string | null; + getRenderers: () => Iterable<{ onCellUpdate(cell: Cell): void }>; +}; + +function buildPageCodeModeExecutor( + options: BuildPageCodeModeExecutorOptions, +): CodeModeExecutor +``` -Extract or expose only enough bootstrap code to let a browser-driven script: +Internally, that helper can still create: -- connect the unified app-server client in `wasm` mode -- call `streamUserMessage(...)` or the thin wrapper around it -- collect emitted ChatKit events -- inspect notebook state -- inspect request/notification traces -- inspect the wasm event journal +- a notebook resolver +- a notebook lister +- the final `createCodeModeExecutor(...)` -### Phase 1: Add A Node/TS Driver +The important point is that callers should not need to rebuild that wiring +themselves. 
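The resolution fallback the helper encapsulates — explicit URI, then explicit
handle, then the current visible notebook — mirrors the logic currently
embedded in `ChatKitPanel`. A minimal sketch with simplified stand-in types
(not the real `NotebookData` model):

```typescript
// Simplified stand-ins for illustration; the real types live in the app.
type NotebookTarget =
  | string
  | { uri?: string }
  | { handle?: { uri?: string } }
  | undefined;

// Resolves a notebook URI from an explicit target, falling back to the
// currently visible notebook when no target is given.
function resolveNotebookUri(
  target: NotebookTarget,
  getCurrentDocUri: () => string | null,
): string | null {
  if (typeof target === "string") return target;
  if (target && typeof target === "object") {
    const t = target as { uri?: string; handle?: { uri?: string } };
    if (typeof t.uri === "string") return t.uri;
    if (typeof t.handle?.uri === "string") return t.handle.uri;
  }
  return getCurrentDocUri();
}
```

Because both the UI and the eval entrypoint would call the same helper, this
fallback order only has to be maintained in one place.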
+ +#### Why return the final executor instead of intermediate pieces + +Returning the final `CodeModeExecutor` is the better default because: + +- `ChatKitPanel` wants the executor +- `CodexWasmHarnessRuntime` wants the executor +- `ResponsesDirectChatKitAdapter` wants the executor +- the eval entrypoint also wants the executor + +If we instead expose only `resolveCodeModeNotebook(...)`, every caller would +still need to remember how to assemble: + +- `listNotebooks(...)` +- renderer update hooks +- `createCodeModeExecutor(...)` + +That would spread the same wiring back across multiple sites. + +#### What should remain outside the helper + +The helper should not own: + +- harness selection +- `HarnessRuntimeManager` +- Codex auth resolution +- bridge connection +- app-server transport selection + +Those belong to the harness runtime layer. + +The helper is only for building the notebook-aware code execution environment +used by: + +- `codex-wasm` +- Codex bridge tool handling +- `responses-direct` internal `ExecuteCode` + +#### How the eval path would use it + +With this extraction, the browser-side eval entrypoint can: + +1. build the page-scoped `CodeModeExecutor` using the same helper as the UI +2. pass that executor into `HarnessRuntimeManager.getOrCreate(...)` +3. start the selected runtime +4. send prompts through `HarnessChatKitAdapter.streamUserMessage(...)` + +That keeps the eval path aligned with the real notebook/runtime wiring instead +of approximating it. + +### 3. Add request/journal capture helpers + +Minimal evals should capture: + +- outbound app-server requests +- inbound notifications +- wasm event journal rows + +This can be done in the browser helper rather than by changing production +runtime APIs much further. + +### 4. 
Add a Node/TS driver Add a script, for example: @@ -384,14 +585,13 @@ Responsibilities: - launch headless Chromium - open the app -- connect the same runtime libraries the UI uses +- call the browser eval helper - seed notebook state if needed - submit prompt -- fetch result object - run assertions - print structured pass/fail output -### Phase 2: Add A Tiny Initial Suite +## Initial Eval Suite Start with these evals: @@ -415,28 +615,17 @@ Start with these evals: - notebook has a new cell - cell source contains `hello world` -## Current Dependency Boundary - -The wasm runtime pieces are now present on `main`, including: - -- `CodexAppServerClient` -- `CodexWasmAppServerClient` -- `CodexWasmWorkerClient` -- the wasm event journal - -So the remaining dependency is not missing runtime code. The remaining work is -extracting the bootstrap and driver-facing entrypoint out of `ChatKitPanel`. - ## Decision For minimal evals, we should: - use headless Chromium -- call the same controller/proxy methods the UI already uses -- bypass ChatKit React, but optionally reuse `createCodexChatkitFetch()` when - exact ChatKit parity matters -- assert on app-server traces, notebook state, and assistant output -- do only a small extraction for bootstrap/test access, not a new runtime API - -That gives us a faithful environment for agentic search and notebook mutation -tests without building a brittle browser UI test suite. +- reuse `HarnessRuntimeManager` and `HarnessChatKitAdapter` +- bypass ChatKit React and the DOM +- optionally reuse `createChatKitFetchFromAdapter(...)` when exact ChatKit + parity matters +- assert on notebook state, assistant output, app-server traces, wasm journal, + and OPFS state + +The right seam is already present in the runtime. The remaining work is to add +a thin browser-side test entrypoint and a TS driver on top of it. 
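The driver's "run assertions and print structured pass/fail output"
responsibility can be sketched as follows. `EvalCase` and `EvalOutcome` are
hypothetical names for this sketch, not existing APIs in the repo:

```typescript
// Illustrative only: EvalCase/EvalOutcome are assumed names, not existing
// APIs. The real driver would also launch Chromium and call the browser
// eval helper before running each case's assertions.
type EvalCase = { name: string; run: () => Promise<void> | void };
type EvalOutcome = { name: string; passed: boolean; error?: string };

// Runs each eval's assertions, converting thrown assertion errors into a
// structured summary the driver can print as JSON for CI.
async function runSuite(cases: EvalCase[]): Promise<EvalOutcome[]> {
  const outcomes: EvalOutcome[] = [];
  for (const c of cases) {
    try {
      await c.run();
      outcomes.push({ name: c.name, passed: true });
    } catch (err) {
      outcomes.push({ name: c.name, passed: false, error: String(err) });
    }
  }
  return outcomes;
}
```

Keeping the summary machine-readable means the same driver output can feed
both local debugging and any later benchmarking layer.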
From dcabc7fe0b449146a7f93cd30c925222bdc15b6291 Mon Sep 17 00:00:00 2001
From: Jeremy lewi
Date: Wed, 22 Apr 2026 07:26:32 -0700
Subject: [PATCH 2/6] feat: add minimal browser eval harness

Signed-off-by: Jeremy lewi
---
 app/package.json                             |   2 +
 app/src/App.tsx                              |   2 +
 app/src/components/ChatKit/ChatKitPanel.tsx  |  78 +--
 app/src/components/Evals/RunmeEvalHost.tsx   | 515 +++++++++++++++++++
 app/src/lib/runtime/pageCodeModeExecutor.ts  | 121 +++++
 app/test/evals/.generated/runMinimalEvals.js | 254 +++++++++
 app/test/evals/runMinimalEvals.ts            | 356 +++++++++++++
 docs-dev/design/20260421_minimal_evals.md    |  79 +--
 testing/aiservice/main.go                    |  47 ++
 9 files changed, 1349 insertions(+), 105 deletions(-)
 create mode 100644 app/src/components/Evals/RunmeEvalHost.tsx
 create mode 100644 app/src/lib/runtime/pageCodeModeExecutor.ts
 create mode 100644 app/test/evals/.generated/runMinimalEvals.js
 create mode 100644 app/test/evals/runMinimalEvals.ts

diff --git a/app/package.json b/app/package.json
index 3d74a48..5f37879 100644
--- a/app/package.json
+++ b/app/package.json
@@ -8,6 +8,8 @@
     "sync:codex-wasm": "node tools/sync_codex_wasm_assets.mjs",
     "typecheck": "tsc -p tsconfig.app.json --noEmit",
     "build": "vite build",
+    "evals:build": "tsc --target es2020 --module nodenext --moduleResolution nodenext --esModuleInterop --skipLibCheck --outDir test/evals/.generated test/evals/runMinimalEvals.ts",
+    "evals:run": "pnpm run evals:build && node test/evals/.generated/runMinimalEvals.js",
     "cuj:build": "tsc --target es2020 --module nodenext --moduleResolution nodenext --esModuleInterop --skipLibCheck --outDir test/browser/.generated test/browser/run-cuj-scenarios.ts",
     "cuj:run": "pnpm run cuj:build && node test/browser/.generated/run-cuj-scenarios.js",
     "cuj:upload": "tsc --target es2020 --module nodenext --moduleResolution nodenext --esModuleInterop --skipLibCheck --outDir test/browser/.generated test/browser/upload-cuj-artifacts.ts && node test/browser/.generated/upload-cuj-artifacts.js",
diff --git a/app/src/App.tsx b/app/src/App.tsx
index c3d4116..e62dc93 100644
--- a/app/src/App.tsx
+++ b/app/src/App.tsx
@@ -48,6 +48,7 @@ import { SidePanelProvider } from "./contexts/SidePanelContext";
 import { appState } from "./lib/runtime/AppState";
 import GlobalToast from "./components/Toast";
 import DriveLinkCoordinatorHost from "./components/DriveLinkCoordinatorHost";
+import { RunmeEvalHost } from "./components/Evals/RunmeEvalHost";
 import { appLogger } from "./lib/logging/runtime";
 import {
   getConfiguredAgentEndpoint,
@@ -167,6 +168,7 @@ function App({ branding }: AppProps) {
+      <RunmeEvalHost />
diff --git a/app/src/components/ChatKit/ChatKitPanel.tsx b/app/src/components/ChatKit/ChatKitPanel.tsx
index 264e185..ad1026f 100644
--- a/app/src/components/ChatKit/ChatKitPanel.tsx
+++ b/app/src/components/ChatKit/ChatKitPanel.tsx
@@ -11,6 +11,7 @@ import {
   parser_pb,
 } from '../../contexts/CellContext'
 import { useNotebookContext } from '../../contexts/NotebookContext'
+import { useOutput } from '../../contexts/OutputContext'
 import { useCurrentDoc } from '../../contexts/CurrentDocContext'
 import {
   useHarness,
@@ -21,8 +22,8 @@ import {
   buildCodexChatKitFetchOptions,
 } from '../../lib/runtime/codexChatKitAdapter'
-import { createCodeModeExecutor } from '../../lib/runtime/codeModeExecutor'
 import { createChatKitFetchFromAdapter } from '../../lib/runtime/createChatKitFetchFromAdapter'
+import { buildPageCodeModeExecutor } from '../../lib/runtime/pageCodeModeExecutor'
 import { useCodexConversationSnapshot } from '../../lib/runtime/codexConversationController'
 import type {
   HarnessChatKitAdapter,
@@ -204,6 +205,7 @@ function ChatKitPanelInner({ defaultHarness }: ChatKitPanelInnerProps) {
   const harnessRuntimeManager = useMemo(() => getHarnessRuntimeManager(), [])

   const { getNotebookData, useNotebookList } = useNotebookContext()
+  const { getAllRenderers } = useOutput()
   const { getCurrentDoc } = useCurrentDoc()
   const responsesDirectConfig = useResponsesDirectConfigSnapshot()
   const codexProjects = useCodexProjects()
@@ -217,75 +219,19 @@
   openNotebookListRef.current = openNotebookList
   const currentDocUriRef = useRef(currentDocUri)
   currentDocUriRef.current = currentDocUri
-
-  const resolveCodeModeNotebook = useCallback(
-    (target?: unknown) => {
-      const targetUri =
-        typeof target === 'string'
-          ? target
-          : typeof target === 'object' && target && 'uri' in target
-            ? (target as { uri?: string }).uri
-            : typeof target === 'object' &&
-                target &&
-                'handle' in target &&
-                (target as { handle?: { uri?: string } }).handle?.uri
-              ? (target as { handle?: { uri?: string } }).handle?.uri
-              : currentDocUriRef.current
-      if (!targetUri) {
-        return null
-      }
-      const data = getNotebookDataRef.current(targetUri)
-      if (!data) {
-        return null
-      }
-
-      return {
-        getUri: () => data.getUri(),
-        getName: () => data.getName(),
-        getNotebook: () => data.getNotebook(),
-        updateCell: (cell: parser_pb.Cell) => {
-          for (const renderer of getAllRenderersRef.current().values()) {
-            renderer.onCellUpdate(cell)
-          }
-          data.updateCell(cell)
-        },
-        getCell: (refId: string) => data.getCell(refId),
-        appendCodeCell: data.appendCodeCell?.bind(data),
-        addCodeCellAfter: data.addCodeCellAfter?.bind(data),
-        addCodeCellBefore: data.addCodeCellBefore?.bind(data),
-        removeCell: data.removeCell?.bind(data),
-      }
-    },
-    []
-  )
+  const getAllRenderersRef = useRef(getAllRenderers)
+  getAllRenderersRef.current = getAllRenderers

   const codeModeExecutor = useMemo(
     () =>
-      createCodeModeExecutor({
-        mode: 'sandbox',
-        resolveNotebook: resolveCodeModeNotebook,
-        listNotebooks: () => {
-          const uris = new Set()
-          for (const notebook of openNotebookListRef.current) {
-            if (typeof notebook?.uri === 'string' && notebook.uri.trim()) {
-              uris.add(notebook.uri)
-            }
-          }
-          if (currentDocUriRef.current) {
-            uris.add(currentDocUriRef.current)
-          }
-          return Array.from(uris)
-            .map((uri) => resolveCodeModeNotebook(uri))
-            .filter(
-              (
-                notebook
-              ): notebook is NonNullable<
-                ReturnType<typeof resolveCodeModeNotebook>
-              > => Boolean(notebook)
-            )
-        },
+      buildPageCodeModeExecutor({
+        getNotebookData: (uri) => getNotebookDataRef.current(uri),
+        getOpenNotebookUris: () =>
+          openNotebookListRef.current.map((notebook) => notebook.uri),
+        getCurrentDocUri: () => currentDocUriRef.current,
+        getRenderers: () => getAllRenderersRef.current().values(),
       }),
-    [resolveCodeModeNotebook]
+    []
   )

   const handleCodexBridgeToolCall = useMemo(
diff --git a/app/src/components/Evals/RunmeEvalHost.tsx b/app/src/components/Evals/RunmeEvalHost.tsx
new file mode 100644
index 0000000..a330472
--- /dev/null
+++ b/app/src/components/Evals/RunmeEvalHost.tsx
@@ -0,0 +1,515 @@
+import { create } from "@bufbuild/protobuf";
+import { useEffect, useMemo, useRef } from "react";
+
+import { useCurrentDoc } from "../../contexts/CurrentDocContext";
+import { useNotebookContext } from "../../contexts/NotebookContext";
+import { useOutput } from "../../contexts/OutputContext";
+import { parser_pb } from "../../runme/client";
+import { LOCAL_FOLDER_URI } from "../../storage/local";
+import { getAuthData } from "../../token";
+import { appState } from "../../lib/runtime/AppState";
+import {
+  getCodexAppServerClient,
+  type CodexProxyJsonRpcNotification,
+} from "../../lib/runtime/codexAppServerClient";
+import { useCodexProjects } from "../../lib/runtime/codexProjectManager";
+import { getCodexWasmAppServerClient } from "../../lib/runtime/codexWasmAppServerClient";
+import type { HarnessChatKitAdapter } from "../../lib/runtime/harnessChatKitAdapter";
+import { getHarnessRuntimeManager } from "../../lib/runtime/harnessRuntimeManager";
+import type { HarnessAdapter, HarnessProfile } from "../../lib/runtime/harnessManager";
+import { createCodexBridgeToolHandler } from "../../lib/runtime/notebookToolHandlers";
+import { createAppKernelOpfsApi } from "../../lib/runtime/appKernelLowLevelApis";
+import { buildPageCodeModeExecutor } from "../../lib/runtime/pageCodeModeExecutor";
+import {
+  responsesDirectConfigManager,
+  useResponsesDirectConfigSnapshot,
+} from "../../lib/runtime/responsesDirectConfigManager";
+import type { ChatKitStreamEvent } from "../../lib/runtime/chatkitProtocol";
+
+type EvalNotebookCellInput = {
+  refId?: string;
+  languageId?: string;
+  value?: string;
+  metadata?: Record<string, string>;
+};
+
+type CreateEvalNotebookOptions = {
+  name: string;
+  cells?: EvalNotebookCellInput[];
+  open?: boolean;
+};
+
+type RunEvalOptions = {
+  harness: {
+    adapter: HarnessAdapter;
+    name?: string;
+    baseUrl?: string;
+  };
+  prompt: string;
+  notebookUri?: string | null;
+  projectId?: string;
+  model?: string;
+  timeoutMs?: number;
+  wasmApiKey?: string;
+  responsesApiBaseUrl?: string;
+  inspectOpfsPath?: string | null;
+};
+
+type EvalRequestRecord = {
+  timestamp: string;
+  method: string;
+  params?: unknown;
+};
+
+type EvalOpfsEntry = {
+  path: string;
+  kind: "file" | "directory";
+  size?: number;
+};
+
+type RunEvalResult = {
+  harness: HarnessProfile;
+  prompt: string;
+  threadId: string | null;
+  events: ChatKitStreamEvent[];
+  assistantText: string;
+  requestLog: EvalRequestRecord[];
+  notifications: CodexProxyJsonRpcNotification[];
+  wasmJournal: unknown[];
+  notebook:
+    | {
+        uri: string;
+        name: string;
+        cells: Array<{
+          refId: string;
+          languageId: string;
+          value: string;
+        }>;
+      }
+    | null;
+  opfs: EvalOpfsEntry[];
+  metrics: {
+    ttfmMs: number | null;
+    turnTimeMs: number;
+  };
+};
+
+type RunmeEvalApi = {
+  waitUntilReady(timeoutMs?: number): Promise<boolean>;
+  configureResponsesDirect(options: {
+    authMethod?: "oauth" | "api_key";
+    apiKey?: string;
+    openaiOrganization?: string;
+    openaiProject?: string;
+  }): Promise<void>;
+  createLocalNotebook(options: CreateEvalNotebookOptions): Promise<{ uri: string }>;
+  openNotebook(uri: string): Promise<void>;
+  run(options: RunEvalOptions): Promise<RunEvalResult>;
+};
+
+declare global {
+  interface Window {
+    __runmeEval?: RunmeEvalApi;
+  }
+}
+
+function buildHarnessProfile(input: RunEvalOptions["harness"]): HarnessProfile {
+  const baseUrl =
+    input.baseUrl ??
+    (input.adapter === "responses-direct"
+      ? "https://api.openai.com"
+      : "http://127.0.0.1:19989");
+  return {
+    name:
+      input.name ??
+      `${input.adapter}-eval`,
+    adapter: input.adapter,
+    baseUrl,
+  };
+}
+
+function buildNotebook(
+  cells: EvalNotebookCellInput[] | undefined,
+): parser_pb.Notebook {
+  return create(parser_pb.NotebookSchema, {
+    metadata: {},
+    cells: (cells ?? []).map((cell, index) =>
+      create(parser_pb.CellSchema, {
+        refId:
+          cell.refId ??
+          `eval_cell_${index + 1}_${Math.random().toString(36).slice(2, 8)}`,
+        kind: parser_pb.CellKind.CODE,
+        role: parser_pb.CellRole.USER,
+        languageId: cell.languageId ?? "python",
+        value: cell.value ?? "",
+        metadata: cell.metadata ?? {},
+        outputs: [],
+      }),
+    ),
+  });
+}
+
+function extractAssistantText(events: ChatKitStreamEvent[]): string {
+  const chunks: string[] = [];
+  const textByItem = new Map<string, string>();
+  for (const event of events) {
+    if (event.type === "response.output_text.delta") {
+      chunks.push(event.delta);
+      continue;
+    }
+    if (
+      event.type === "thread.item.updated" &&
+      event.update.type === "assistant_message.content_part.text_delta"
+    ) {
+      const next = `${textByItem.get(event.item_id) ?? ""}${event.update.delta}`;
+      textByItem.set(event.item_id, next);
+      continue;
+    }
+    if (
+      event.type === "thread.item.updated" &&
+      event.update.type === "assistant_message.content_part.done"
+    ) {
+      textByItem.set(event.item_id, event.update.content.text);
+      continue;
+    }
+    if (
+      event.type === "thread.item.done" &&
+      event.item.type === "assistant_message"
+    ) {
+      const text = event.item.content
+        .map((part) => ("text" in part ? part.text : ""))
+        .join("");
+      if (text) {
+        chunks.length = 0;
+        chunks.push(text);
+      }
+    }
+  }
+  if (chunks.length === 0 && textByItem.size > 0) {
+    return Array.from(textByItem.values()).join("\n");
+  }
+  return chunks.join("");
+}
+
+function eventStartsAssistantOutput(event: ChatKitStreamEvent): boolean {
+  if (event.type === "response.output_text.delta" && event.delta.trim()) {
+    return true;
+  }
+  if (
+    event.type === "thread.item.updated" &&
+    event.update.type === "assistant_message.content_part.text_delta" &&
+    event.update.delta.trim()
+  ) {
+    return true;
+  }
+  if (
+    event.type === "thread.item.added" &&
+    event.item.type === "assistant_message"
+  ) {
+    return event.item.content.some(
+      (part) => "text" in part && part.text.trim().length > 0,
+    );
+  }
+  return false;
+}
+
+async function listOpfsTree(path: string | null | undefined): Promise<EvalOpfsEntry[]> {
+  if (!path) {
+    return [];
+  }
+  const opfs = createAppKernelOpfsApi();
+  if (!(await opfs.exists(path))) {
+    return [];
+  }
+  const results: EvalOpfsEntry[] = [];
+  const visit = async (currentPath: string) => {
+    const stat = await opfs.stat(currentPath);
+    results.push({
+      path: currentPath,
+      kind: stat.kind,
+      size: stat.size,
+    });
+    if (stat.kind !== "directory") {
+      return;
+    }
+    const entries = await opfs.list(currentPath);
+    for (const entry of entries) {
+      const childPath =
+        currentPath === "/"
+          ? `/${entry.name}`
+          : `${currentPath.replace(/\/+$/, "")}/${entry.name}`;
+      await visit(childPath);
+    }
+  };
+  await visit(path);
+  return results.sort((left, right) => left.path.localeCompare(right.path));
+}
+
+async function waitForCondition(
+  predicate: () => boolean,
+  timeoutMs: number,
+  intervalMs = 25,
+): Promise<void> {
+  const deadline = Date.now() + timeoutMs;
+  while (Date.now() < deadline) {
+    if (predicate()) {
+      return;
+    }
+    await new Promise((resolve) => setTimeout(resolve, intervalMs));
+  }
+  throw new Error(`Timed out after ${timeoutMs}ms`);
+}
+
+export function RunmeEvalHost() {
+  const { getNotebookData, useNotebookList } = useNotebookContext();
+  const { getCurrentDoc, setCurrentDoc } = useCurrentDoc();
+  const { getAllRenderers } = useOutput();
+  const openNotebookList = useNotebookList();
+  const responsesDirectConfig = useResponsesDirectConfigSnapshot();
+  const codexProjects = useCodexProjects();
+
+  const getNotebookDataRef = useRef(getNotebookData);
+  getNotebookDataRef.current = getNotebookData;
+  const getCurrentDocRef = useRef(getCurrentDoc);
+  getCurrentDocRef.current = getCurrentDoc;
+  const setCurrentDocRef = useRef(setCurrentDoc);
+  setCurrentDocRef.current = setCurrentDoc;
+  const openNotebookListRef = useRef(openNotebookList);
+  openNotebookListRef.current = openNotebookList;
+  const getAllRenderersRef = useRef(getAllRenderers);
+  getAllRenderersRef.current = getAllRenderers;
+  const responsesDirectConfigRef = useRef(responsesDirectConfig);
+  responsesDirectConfigRef.current = responsesDirectConfig;
+  const defaultProjectIdRef = useRef(codexProjects.defaultProject.id);
+  defaultProjectIdRef.current = codexProjects.defaultProject.id;
+
+  const codeModeExecutor = useMemo(
+    () =>
+      buildPageCodeModeExecutor({
+        getNotebookData: (uri) => getNotebookDataRef.current(uri),
+        getOpenNotebookUris: () =>
+          openNotebookListRef.current.map((item) => item.uri),
+        getCurrentDocUri: () => getCurrentDocRef.current(),
+        getRenderers: () => getAllRenderersRef.current().values(),
+      }),
+    [],
+  );
+
+  const codexBridgeHandler = useMemo(
+    () =>
+      createCodexBridgeToolHandler({
+        codeModeExecutor,
+      }),
+    [codeModeExecutor],
+  );
+
+  useEffect(() => {
+    const api: RunmeEvalApi = {
+      async waitUntilReady(timeoutMs = 15000) {
+        await waitForCondition(
+          () => Boolean(appState.localNotebooks),
+          timeoutMs,
+        );
+        return true;
+      },
+      async configureResponsesDirect(options) {
+        if (options.authMethod) {
+          responsesDirectConfigManager.setAuthMethod(options.authMethod);
+        }
+        if (typeof options.apiKey === "string") {
+          responsesDirectConfigManager.setAPIKey(options.apiKey);
+        }
+        if (typeof options.openaiOrganization === "string") {
+          responsesDirectConfigManager.setOpenAIOrganization(
+            options.openaiOrganization,
+          );
+        }
+        if (typeof options.openaiProject === "string") {
+          responsesDirectConfigManager.setOpenAIProject(options.openaiProject);
+        }
+      },
+      async createLocalNotebook(options) {
+        if (!appState.localNotebooks) {
+          throw new Error("Local notebook store is not initialized yet.");
+        }
+        const created = await appState.localNotebooks.create(
+          LOCAL_FOLDER_URI,
+          options.name,
+        );
+        await appState.localNotebooks.save(
+          created.uri,
+          buildNotebook(options.cells),
+        );
+        if (options.open !== false) {
+          setCurrentDocRef.current(created.uri);
+          await waitForCondition(() => {
+            const data = getNotebookDataRef.current(created.uri);
+            return Boolean(data?.getSnapshot().loaded);
+          }, 15000);
+        }
+        return { uri: created.uri };
+      },
+      async openNotebook(uri) {
+        setCurrentDocRef.current(uri);
+        await waitForCondition(() => {
+          const data = getNotebookDataRef.current(uri);
+          return Boolean(data?.getSnapshot().loaded);
+        }, 15000);
+      },
+      async run(options) {
+        const harnessRuntimeManager = getHarnessRuntimeManager();
+        const profile = buildHarnessProfile(options.harness);
+        const runtime = harnessRuntimeManager.getOrCreate({
+          profile,
+          projectId: options.projectId ?? defaultProjectIdRef.current,
+          resolveAuthorization:
+            profile.adapter === "codex"
+              ? async () => {
+                  const authData = await getAuthData();
+                  const idToken = authData?.idToken?.trim();
+                  if (idToken) {
+                    return `Bearer ${idToken}`;
+                  }
+                  const isLocalFakeHarness =
+                    profile.baseUrl.includes("127.0.0.1") ||
+                    profile.baseUrl.includes("localhost");
+                  return isLocalFakeHarness ? "Bearer eval-test-token" : "";
+                }
+              : undefined,
+          codeModeExecutor,
+          codexBridgeHandler:
+            profile.adapter === "codex" ? codexBridgeHandler : undefined,
+          wasmApiKey:
+            profile.adapter === "codex-wasm"
+              ? options.wasmApiKey ??
+                responsesDirectConfigRef.current.apiKey ??
+                ""
+              : undefined,
+          responsesApiBaseUrl:
+            profile.adapter === "responses-direct"
+              ? options.responsesApiBaseUrl ?? profile.baseUrl
+              : undefined,
+        });
+
+        const events: ChatKitStreamEvent[] = [];
+        const requestLog: EvalRequestRecord[] = [];
+        const notifications: CodexProxyJsonRpcNotification[] = [];
+        const client = getCodexAppServerClient();
+        const originalSendRequest = client.sendRequest.bind(client);
+        const instrumentedSendRequest = async (
+          method: string,
+          params?: unknown,
+        ): Promise<unknown> => {
+          requestLog.push({
+            timestamp: new Date().toISOString(),
+            method,
+            params,
+          });
+          return await originalSendRequest(method, params);
+        };
+        client.sendRequest = instrumentedSendRequest as typeof client.sendRequest;
+        const unsubscribeNotifications = client.subscribeNotifications(
+          (notification) => {
+            notifications.push(notification);
+          },
+        );
+
+        const adapter = runtime.createChatKitAdapter();
+        const startTime = performance.now();
+        let ttfmMs: number | null = null;
+        let timeoutId: ReturnType<typeof setTimeout> | null = null;
+        const abortController = new AbortController();
+
+        try {
+          if (options.notebookUri) {
+            await api.openNotebook(options.notebookUri);
+          }
+          await runtime.start();
+          await Promise.race([
+            adapter.streamUserMessage(
+              {
+                input: options.prompt,
+                model: options.model,
+                signal: abortController.signal,
+              },
+              {
+                emit(event) {
+                  events.push(event);
+                  if (ttfmMs === null && eventStartsAssistantOutput(event)) {
+                    ttfmMs = performance.now() - startTime;
+                  }
+                },
+              },
+            ),
+            new Promise((_, reject) => {
+              timeoutId = setTimeout(() => {
+                abortController.abort();
+                reject(
+                  new Error(
+                    `Eval turn timed out after ${options.timeoutMs ?? 45000}ms`,
+                  ),
+                );
+              }, options.timeoutMs ?? 45000);
+            }),
+          ]);
+
+          const turnTimeMs = performance.now() - startTime;
+          const threadId = adapter.initialThreadId ?? null;
+          const notebookUri =
+            options.notebookUri ?? getCurrentDocRef.current() ?? null;
+          const notebookData = notebookUri
+            ? getNotebookDataRef.current(notebookUri)
+            : undefined;
+          const notebookSnapshot = notebookData?.getSnapshot() ?? null;
+
+          return {
+            harness: profile,
+            prompt: options.prompt,
+            threadId,
+            events,
+            assistantText: extractAssistantText(events),
+            requestLog,
+            notifications,
+            wasmJournal:
+              profile.adapter === "codex-wasm"
+                ? await getCodexWasmAppServerClient().getEventJournal()
+                : [],
+            notebook: notebookSnapshot
+              ? {
+                  uri: notebookSnapshot.uri,
+                  name: notebookSnapshot.name,
+                  cells: notebookSnapshot.notebook.cells.map((cell) => ({
+                    refId: cell.refId,
+                    languageId: cell.languageId,
+                    value: cell.value ?? "",
+                  })),
+                }
+              : null,
+            opfs: await listOpfsTree(options.inspectOpfsPath),
+            metrics: {
+              ttfmMs,
+              turnTimeMs,
+            },
+          };
+        } finally {
+          if (timeoutId) {
+            clearTimeout(timeoutId);
+          }
+          unsubscribeNotifications();
+          client.sendRequest = originalSendRequest;
+          runtime.stop();
+          harnessRuntimeManager.remove(profile.name);
+        }
+      },
+    };
+
+    window.__runmeEval = api;
+    return () => {
+      if (window.__runmeEval === api) {
+        delete window.__runmeEval;
+      }
+    };
+  }, [codeModeExecutor, codexBridgeHandler]);
+
+  return null;
+}
diff --git a/app/src/lib/runtime/pageCodeModeExecutor.ts b/app/src/lib/runtime/pageCodeModeExecutor.ts
new file mode 100644
index 0000000..43cafd1
--- /dev/null
+++ b/app/src/lib/runtime/pageCodeModeExecutor.ts
@@ -0,0 +1,121 @@
+import { parser_pb } from "../../contexts/CellContext";
+import type { OutputRenderer } from "../../contexts/OutputContext";
+import type { NotebookData } from "../notebookData";
+import {
+  createCodeModeExecutor,
+  type CodeModeExecutor,
+} from "./codeModeExecutor";
+import type { NotebookDataLike } from "./runmeConsole";
+
+type NotebookTargetLike =
+  | string
+  | { uri?: string }
+  | { handle?: { uri?: string } }
+  | undefined;
+
+export type BuildPageCodeModeExecutorOptions = {
+  getNotebookData: (uri: string) => NotebookData | undefined;
+  getOpenNotebookUris: () => string[];
+  getCurrentDocUri: () => string | null;
+  getRenderers: () => Iterable<OutputRenderer>;
+};
+
+function resolveTargetUri(
+  target: NotebookTargetLike,
+  getCurrentDocUri: () => string | null,
+): string | null {
+  if (typeof target === "string" && target.trim()) {
+    return target;
+  }
+  if (
+    target &&
+    typeof target === "object" &&
+    "uri" in target &&
+    typeof target.uri === "string" &&
+    target.uri.trim()
+  ) {
+    return target.uri;
+  }
+  if (
+    target &&
+    typeof target === "object" &&
+    "handle" in target &&
+    target.handle &&
+    typeof target.handle.uri === "string" &&
+    target.handle.uri.trim()
+  ) {
+    return target.handle.uri;
+  }
+  return getCurrentDocUri();
+}
+
+function toNotebookDataLike(
+  data: NotebookData,
+  getRenderers: () => Iterable<OutputRenderer>,
+): NotebookDataLike {
+  return {
+    getUri: () => data.getUri(),
+    getName: () => data.getName(),
+    getNotebook: () => data.getNotebook(),
+    updateCell: (cell: parser_pb.Cell) => {
+      for (const renderer of getRenderers()) {
+        renderer.onCellUpdate(cell);
+      }
+      data.updateCell(cell);
+    },
+    getCell: (refId: string) => data.getCell(refId),
+    appendCodeCell: data.appendCodeCell?.bind(data),
+    addCodeCellAfter: data.addCodeCellAfter?.bind(data),
+    addCodeCellBefore: data.addCodeCellBefore?.bind(data),
+    removeCell: data.removeCell?.bind(data),
+  };
+}
+
+export function createPageNotebookResolver(
+  options: BuildPageCodeModeExecutorOptions,
+): (target?: unknown) => NotebookDataLike | null {
+  return (target?: unknown) => {
+    const targetUri = resolveTargetUri(
+      target as NotebookTargetLike,
+      options.getCurrentDocUri,
+    );
+    if (!targetUri) {
+      return null;
+    }
+    const data = options.getNotebookData(targetUri);
+    if (!data) {
+      return null;
+    }
+    return toNotebookDataLike(data, options.getRenderers);
+  };
+}
+
+export function listPageNotebooks(
+  options: BuildPageCodeModeExecutorOptions,
+  resolveNotebook: (target?: unknown) => NotebookDataLike | null,
+): NotebookDataLike[] {
+  const uris = new Set<string>();
+  for (const uri of options.getOpenNotebookUris()) {
+    if (typeof uri === "string" && uri.trim()) {
+      uris.add(uri);
+    }
+  }
+  const currentDocUri = options.getCurrentDocUri();
+  if (currentDocUri) {
+    uris.add(currentDocUri);
+  }
+  return Array.from(uris)
+    .map((uri) => resolveNotebook(uri))
+    .filter((notebook): notebook is NotebookDataLike => Boolean(notebook));
+}
+
+export function buildPageCodeModeExecutor(
+  options: BuildPageCodeModeExecutorOptions,
+): CodeModeExecutor {
+  const resolveNotebook = createPageNotebookResolver(options);
+  return createCodeModeExecutor({
+    mode: "sandbox",
+    resolveNotebook,
+    listNotebooks: () => listPageNotebooks(options, resolveNotebook),
+  });
+}
diff --git a/app/test/evals/.generated/runMinimalEvals.js b/app/test/evals/.generated/runMinimalEvals.js
new file mode 100644
index 0000000..fadd4b0
--- /dev/null
+++ b/app/test/evals/.generated/runMinimalEvals.js
@@ -0,0 +1,254 @@
+import { spawn, spawnSync } from "node:child_process";
+import { createWriteStream, mkdirSync } from "node:fs";
+import { join, resolve } from "node:path";
+import { fileURLToPath } from "node:url";
+const CURRENT_FILE = fileURLToPath(import.meta.url);
+const GENERATED_DIR = resolve(CURRENT_FILE, "..");
+const TEST_DIR = GENERATED_DIR.endsWith("/.generated") || GENERATED_DIR.endsWith("\\.generated")
+    ? resolve(GENERATED_DIR, "..")
+    : GENERATED_DIR;
+const APP_ROOT = resolve(TEST_DIR, "..", "..");
+const REPO_ROOT = resolve(APP_ROOT, "..");
+const OUTPUT_DIR = join(TEST_DIR, "eval-output");
+const FRONTEND_URL = process.env.RUNME_EVAL_FRONTEND_URL ?? "http://localhost:5173";
+const FAKE_AI_BASE_URL = process.env.RUNME_EVAL_FAKE_AI_BASE_URL ?? "http://127.0.0.1:19989";
+const FAKE_AI_HEALTH_URL = `${FAKE_AI_BASE_URL}/healthz`;
+const FAKE_AI_RESET_URL = `${FAKE_AI_BASE_URL}/reset`;
+const AGENT_BROWSER_SESSION = process.env.AGENT_BROWSER_SESSION?.trim() ?? "";
+const AGENT_BROWSER_PROFILE = process.env.AGENT_BROWSER_PROFILE?.trim() ?? "";
+const AGENT_BROWSER_HEADED = (process.env.AGENT_BROWSER_HEADED ?? "false")
+    .trim()
+    .toLowerCase() === "true";
+mkdirSync(OUTPUT_DIR, { recursive: true });
+function shellQuote(value) {
+    return `'${value.replace(/'/g, `'\"'\"'`)}'`;
+}
+function withAgentBrowserOptions(command) {
+    const trimmed = command.trimStart();
+    if (!trimmed.startsWith("agent-browser ")) {
+        return command;
+    }
+    const leadingWhitespace = command.slice(0, command.length - trimmed.length);
+    const subcommand = trimmed.slice("agent-browser ".length);
+    const args = [];
+    if (AGENT_BROWSER_SESSION) {
+        args.push("--session", shellQuote(AGENT_BROWSER_SESSION));
+    }
+    if (AGENT_BROWSER_PROFILE) {
+        args.push("--profile", shellQuote(AGENT_BROWSER_PROFILE));
+    }
+    if (AGENT_BROWSER_HEADED) {
+        args.push("--headed");
+    }
+    return `${leadingWhitespace}${["agent-browser", ...args].join(" ")} ${subcommand}`;
+}
+function run(command, timeoutMs = 30000) {
+    const effectiveCommand = withAgentBrowserOptions(command);
+    const result = spawnSync(effectiveCommand, {
+        shell: true,
+        encoding: "utf-8",
+        timeout: timeoutMs,
+        killSignal: "SIGKILL",
+        cwd: REPO_ROOT,
+    });
+    return {
+        status: result.status ?? 1,
+        stdout: result.stdout ?? "",
+        stderr: result.stderr ?? "",
+    };
+}
+function runOrThrow(command, timeoutMs = 30000) {
+    const result = run(command, timeoutMs);
+    if (result.status !== 0) {
+        throw new Error(`Command failed: ${command}\n${result.stderr}`);
+    }
+    return result.stdout;
+}
+async function fetchWithTimeout(input, timeoutMs = 15000, init) {
+    return await fetch(input, {
+        ...init,
+        signal: init?.signal ?? AbortSignal.timeout(timeoutMs),
+    });
+}
+function isHttpReady(url) {
+    return run(`curl -sf ${shellQuote(url)}`, 5000).status === 0;
+}
+async function waitForHttpReady(url, timeoutMs = 45000) {
+    const deadline = Date.now() + timeoutMs;
+    while (Date.now() < deadline) {
+        if (isHttpReady(url)) {
+            return;
+        }
+        await new Promise((resolve) => setTimeout(resolve, 500));
+    }
+    throw new Error(`Timed out waiting for ${url}`);
+}
+function startService(name, command, cwd) {
+    const logPath = join(OUTPUT_DIR, `${name}.log`);
+    const logStream = createWriteStream(logPath, { flags: "a" });
+    const child = spawn(command, {
+        cwd,
+        shell: true,
+        stdio: ["ignore", "pipe", "pipe"],
+        env: process.env,
+    });
+    child.stdout?.pipe(logStream);
+    child.stderr?.pipe(logStream);
+    return { name, process: child, logStream };
+}
+function stopService(handle) {
+    if (!handle) {
+        return;
+    }
+    handle.process.kill("SIGTERM");
+    handle.logStream.end();
+}
+async function ensureFrontend() {
+    try {
+        await waitForHttpReady(FRONTEND_URL, 3000);
+        return null;
+    }
+    catch {
+        const handle = startService("frontend", "pnpm -C app run dev -- --host 127.0.0.1 --port 5173", REPO_ROOT);
+        await waitForHttpReady(FRONTEND_URL);
+        return handle;
+    }
+}
+async function ensureFakeAi() {
+    try {
+        await waitForHttpReady(FAKE_AI_HEALTH_URL, 3000);
+        return null;
+    }
+    catch {
+        const handle = startService("fake-ai", `go run ${shellQuote(join(REPO_ROOT, "testing", "aiservice", "main.go"))}`, REPO_ROOT);
+        await waitForHttpReady(FAKE_AI_HEALTH_URL);
+        return handle;
+    }
+}
+async function resetFakeAi() {
+    await fetchWithTimeout(FAKE_AI_RESET_URL, 5000, { method: "POST" });
+}
+function escapeForDoubleQuotes(value) {
+    return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
+}
+function evaluateJson(script, timeoutMs = 30000) {
+    const executable = script.trim().startsWith("(async")
+        ? script
+        : `(async () => { return ${script}; })()`;
+    const wrapped = `agent-browser eval "${escapeForDoubleQuotes(executable)}"`;
+    const stdout = runOrThrow(wrapped, timeoutMs).trim();
+    const normalized = stdout.startsWith('"') ? JSON.parse(stdout) : stdout;
+    if (typeof normalized !== "string") {
+        return normalized;
+    }
+    return JSON.parse(normalized);
+}
+async function bootstrapEvalBridge() {
+    run("agent-browser close", 5000);
+    runOrThrow(`agent-browser open ${FRONTEND_URL}`, 30000);
+    run("agent-browser wait 2500", 5000);
+    evaluateJson(`JSON.stringify(await window.__runmeEval.waitUntilReady())`, 30000);
+}
+function createNotebook(name) {
+    return evaluateJson(`JSON.stringify(await window.__runmeEval.createLocalNotebook({
+    name: ${JSON.stringify(name)},
+    cells: [],
+    open: true
+  }))`, 30000);
+}
+function configureResponsesDirect() {
+    evaluateJson(`JSON.stringify((await window.__runmeEval.configureResponsesDirect({
+    authMethod: 'api_key',
+    apiKey: 'test-key'
+  }), true))`, 15000);
+}
+function runBrowserEval(options) {
+    return evaluateJson(`JSON.stringify(await window.__runmeEval.run(${JSON.stringify(options)}))`, 90000);
+}
+function assert(ok, message) {
+    return { ok, message };
+}
+async function runCodexProxyHelloWorldEval() {
+    await resetFakeAi();
+    const notebook = createNotebook("eval-codex-proxy.runme.md");
+    const result = runBrowserEval({
+        harness: {
+            adapter: "codex",
+            name: "codex-proxy-eval",
+            baseUrl: FAKE_AI_BASE_URL,
+        },
+        notebookUri: notebook.uri,
+        prompt: `Add a cell to print("hello world")`,
+        timeoutMs: 45000,
+    });
+    const assertions = [
+        assert(result.assistantText.includes("Cell has been added."), "assistant text confirms the cell was added"),
+        assert(result.requestLog.some((entry) => entry.method === "turn/start"), "request log includes turn/start"),
+        assert(Boolean(result.notebook?.cells.some((cell) => cell.value.includes(`print("hello world")`))), "notebook snapshot contains the hello world cell"),
+        assert(result.metrics.ttfmMs !== null, "TTFM was recorded"),
+        assert(result.metrics.turnTimeMs > 0, "turn time was recorded"),
+    ];
+    return {
+        name: "codex proxy adds hello world cell",
+        status: assertions.every((item) => item.ok) ? "PASS" : "FAIL",
+        assertions,
+    };
+}
+async function runResponsesDirectHelloWorldEval() {
+    await resetFakeAi();
+    configureResponsesDirect();
+    const notebook = createNotebook("eval-responses-direct.runme.md");
+    const result = runBrowserEval({
+        harness: {
+            adapter: "responses-direct",
+            name: "responses-direct-eval",
+            baseUrl: FAKE_AI_BASE_URL,
+        },
+        notebookUri: notebook.uri,
+        prompt: `Add a cell to print("hello world")`,
+        timeoutMs: 45000,
+    });
+    const assertions = [
+        assert(result.assistantText.includes("Cell has been added."), "assistant text confirms the cell was added"),
+        assert(Boolean(result.notebook?.cells.some((cell) => cell.value.includes(`print("hello world")`))), "notebook snapshot contains the hello world cell"),
+        assert(result.metrics.ttfmMs !== null, "TTFM was recorded"),
+        assert(result.metrics.turnTimeMs > 0, "turn time was recorded"),
+    ];
+    return {
+        name: "responses-direct adds hello world cell",
+        status: assertions.every((item) => item.ok) ? "PASS" : "FAIL",
+        assertions,
+    };
+}
+async function main() {
+    let frontendHandle = null;
+    let fakeAiHandle = null;
+    try {
+        frontendHandle = await ensureFrontend();
+        fakeAiHandle = await ensureFakeAi();
+        await bootstrapEvalBridge();
+        const results = [
+            await runCodexProxyHelloWorldEval(),
+            await runResponsesDirectHelloWorldEval(),
+        ];
+        let failed = 0;
+        for (const result of results) {
+            console.log(`${result.status} ${result.name}`);
+            for (const assertion of result.assertions) {
+                console.log(`  ${assertion.ok ? "PASS" : "FAIL"} ${assertion.message}`);
+            }
+            if (result.status === "FAIL") {
+                failed += 1;
+            }
+        }
+        if (failed > 0) {
+            process.exitCode = 1;
+        }
+    }
+    finally {
+        run("agent-browser close", 5000);
+        stopService(fakeAiHandle);
+        stopService(frontendHandle);
+    }
+}
+await main();
diff --git a/app/test/evals/runMinimalEvals.ts b/app/test/evals/runMinimalEvals.ts
new file mode 100644
index 0000000..9b062fa
--- /dev/null
+++ b/app/test/evals/runMinimalEvals.ts
@@ -0,0 +1,356 @@
+import { spawn, spawnSync, type ChildProcess } from "node:child_process";
+import { createWriteStream, mkdirSync } from "node:fs";
+import { join, resolve } from "node:path";
+import { fileURLToPath } from "node:url";
+
+type EvalAssertion = {
+  ok: boolean;
+  message: string;
+};
+
+type EvalResultSummary = {
+  name: string;
+  status: "PASS" | "FAIL";
+  assertions: EvalAssertion[];
+};
+
+type RunmeEvalResult = {
+  assistantText: string;
+  requestLog: Array<{ method: string }>;
+  notebook: {
+    uri: string;
+    cells: Array<{ value: string }>;
+  } | null;
+  metrics: {
+    ttfmMs: number | null;
+    turnTimeMs: number;
+  };
+};
+
+type ServiceHandle = {
+  name: string;
+  process: ChildProcess;
+  logStream: ReturnType<typeof createWriteStream>;
+};
+
+const CURRENT_FILE = fileURLToPath(import.meta.url);
+const GENERATED_DIR = resolve(CURRENT_FILE, "..");
+const TEST_DIR = GENERATED_DIR.endsWith("/.generated") || GENERATED_DIR.endsWith("\\.generated")
+  ? resolve(GENERATED_DIR, "..")
+  : GENERATED_DIR;
+const APP_ROOT = resolve(TEST_DIR, "..", "..");
+const REPO_ROOT = resolve(APP_ROOT, "..");
+const OUTPUT_DIR = join(TEST_DIR, "eval-output");
+const FRONTEND_URL = process.env.RUNME_EVAL_FRONTEND_URL ?? "http://localhost:5173";
+const FAKE_AI_BASE_URL = process.env.RUNME_EVAL_FAKE_AI_BASE_URL ?? "http://127.0.0.1:19989";
+const FAKE_AI_HEALTH_URL = `${FAKE_AI_BASE_URL}/healthz`;
+const FAKE_AI_RESET_URL = `${FAKE_AI_BASE_URL}/reset`;
+const AGENT_BROWSER_SESSION = process.env.AGENT_BROWSER_SESSION?.trim() ?? "";
+const AGENT_BROWSER_PROFILE = process.env.AGENT_BROWSER_PROFILE?.trim() ?? "";
+const AGENT_BROWSER_HEADED = (process.env.AGENT_BROWSER_HEADED ?? "false")
+  .trim()
+  .toLowerCase() === "true";
+
+mkdirSync(OUTPUT_DIR, { recursive: true });
+
+function shellQuote(value: string): string {
+  return `'${value.replace(/'/g, `'\"'\"'`)}'`;
+}
+
+function withAgentBrowserOptions(command: string): string {
+  const trimmed = command.trimStart();
+  if (!trimmed.startsWith("agent-browser ")) {
+    return command;
+  }
+  const leadingWhitespace = command.slice(0, command.length - trimmed.length);
+  const subcommand = trimmed.slice("agent-browser ".length);
+  const args: string[] = [];
+  if (AGENT_BROWSER_SESSION) {
+    args.push("--session", shellQuote(AGENT_BROWSER_SESSION));
+  }
+  if (AGENT_BROWSER_PROFILE) {
+    args.push("--profile", shellQuote(AGENT_BROWSER_PROFILE));
+  }
+  if (AGENT_BROWSER_HEADED) {
+    args.push("--headed");
+  }
+  return `${leadingWhitespace}${["agent-browser", ...args].join(" ")} ${subcommand}`;
+}
+
+function run(command: string, timeoutMs = 30000): {
+  status: number;
+  stdout: string;
+  stderr: string;
+} {
+  const effectiveCommand = withAgentBrowserOptions(command);
+  const result = spawnSync(effectiveCommand, {
+    shell: true,
+    encoding: "utf-8",
+    timeout: timeoutMs,
+    killSignal: "SIGKILL",
+    cwd: REPO_ROOT,
+  });
+  return {
+    status: result.status ?? 1,
+    stdout: result.stdout ?? "",
+    stderr: result.stderr ?? "",
+  };
+}
+
+function runOrThrow(command: string, timeoutMs = 30000): string {
+  const result = run(command, timeoutMs);
+  if (result.status !== 0) {
+    throw new Error(`Command failed: ${command}\n${result.stderr}`);
+  }
+  return result.stdout;
+}
+
+async function fetchWithTimeout(
+  input: string,
+  timeoutMs = 15000,
+  init?: RequestInit,
+): Promise<Response> {
+  return await fetch(input, {
+    ...init,
+    signal: init?.signal ?? AbortSignal.timeout(timeoutMs),
+  });
+}
+
+function isHttpReady(url: string): boolean {
+  return run(`curl -sf ${shellQuote(url)}`, 5000).status === 0;
+}
+
+async function waitForHttpReady(url: string, timeoutMs = 45000): Promise<void> {
+  const deadline = Date.now() + timeoutMs;
+  while (Date.now() < deadline) {
+    if (isHttpReady(url)) {
+      return;
+    }
+    await new Promise((resolve) => setTimeout(resolve, 500));
+  }
+  throw new Error(`Timed out waiting for ${url}`);
+}
+
+function startService(name: string, command: string, cwd: string): ServiceHandle {
+  const logPath = join(OUTPUT_DIR, `${name}.log`);
+  const logStream = createWriteStream(logPath, { flags: "a" });
+  const child = spawn(command, {
+    cwd,
+    shell: true,
+    stdio: ["ignore", "pipe", "pipe"],
+    env: process.env,
+  });
+  child.stdout?.pipe(logStream);
+  child.stderr?.pipe(logStream);
+  return { name, process: child, logStream };
+}
+
+function stopService(handle: ServiceHandle | null): void {
+  if (!handle) {
+    return;
+  }
+  handle.process.kill("SIGTERM");
+  handle.logStream.end();
+}
+
+async function ensureFrontend(): Promise<ServiceHandle | null> {
+  try {
+    await waitForHttpReady(FRONTEND_URL, 3000);
+    return null;
+  } catch {
+    const handle = startService(
+      "frontend",
+      "pnpm -C app run dev -- --host 127.0.0.1 --port 5173",
+      REPO_ROOT,
+    );
+    await waitForHttpReady(FRONTEND_URL);
+    return handle;
+  }
+}
+
+async function ensureFakeAi(): Promise<ServiceHandle | null> {
+  try {
+    await waitForHttpReady(FAKE_AI_HEALTH_URL, 3000);
+    return null;
+  } catch {
+    const handle = startService(
+      "fake-ai",
+      `go run ${shellQuote(join(REPO_ROOT, "testing", "aiservice", "main.go"))}`,
+      REPO_ROOT,
+    );
+    await waitForHttpReady(FAKE_AI_HEALTH_URL);
+    return handle;
+  }
+}
+
+async function resetFakeAi(): Promise<void> {
+  await fetchWithTimeout(FAKE_AI_RESET_URL, 5000, { method: "POST" });
+}
+
+function escapeForDoubleQuotes(value: string): string {
+  return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
+}
+
+function evaluateJson(script: string,
timeoutMs = 30000): T { + const executable = script.trim().startsWith("(async") + ? script + : `(async () => { return ${script}; })()`; + const wrapped = `agent-browser eval "${escapeForDoubleQuotes(executable)}"`; + const stdout = runOrThrow(wrapped, timeoutMs).trim(); + const normalized = stdout.startsWith('"') ? JSON.parse(stdout) : stdout; + if (typeof normalized !== "string") { + return normalized as T; + } + return JSON.parse(normalized) as T; +} + +async function bootstrapEvalBridge(): Promise { + run("agent-browser close", 5000); + runOrThrow(`agent-browser open ${FRONTEND_URL}`, 30000); + run("agent-browser wait 2500", 5000); + evaluateJson( + `JSON.stringify(await window.__runmeEval.waitUntilReady())`, + 30000, + ); +} + +function createNotebook(name: string): { uri: string } { + return evaluateJson<{ uri: string }>( + `JSON.stringify(await window.__runmeEval.createLocalNotebook({ + name: ${JSON.stringify(name)}, + cells: [], + open: true + }))`, + 30000, + ); +} + +function configureResponsesDirect(): void { + evaluateJson( + `JSON.stringify((await window.__runmeEval.configureResponsesDirect({ + authMethod: 'api_key', + apiKey: 'test-key' + }), true))`, + 15000, + ); +} + +function runBrowserEval(options: object): RunmeEvalResult { + return evaluateJson( + `JSON.stringify(await window.__runmeEval.run(${JSON.stringify(options)}))`, + 90000, + ); +} + +function assert(ok: boolean, message: string): EvalAssertion { + return { ok, message }; +} + +async function runCodexProxyHelloWorldEval(): Promise { + await resetFakeAi(); + const notebook = createNotebook("eval-codex-proxy.runme.md"); + const result = runBrowserEval({ + harness: { + adapter: "codex", + name: "codex-proxy-eval", + baseUrl: FAKE_AI_BASE_URL, + }, + notebookUri: notebook.uri, + prompt: `Add a cell to print("hello world")`, + timeoutMs: 45000, + }); + const assertions = [ + assert( + result.assistantText.includes("Cell has been added."), + "assistant text confirms the cell was added", + ), + 
assert( + result.requestLog.some((entry) => entry.method === "turn/start"), + "request log includes turn/start", + ), + assert( + Boolean( + result.notebook?.cells.some((cell) => cell.value.includes(`print("hello world")`)), + ), + "notebook snapshot contains the hello world cell", + ), + assert(result.metrics.ttfmMs !== null, "TTFM was recorded"), + assert(result.metrics.turnTimeMs > 0, "turn time was recorded"), + ]; + return { + name: "codex proxy adds hello world cell", + status: assertions.every((item) => item.ok) ? "PASS" : "FAIL", + assertions, + }; +} + +async function runResponsesDirectHelloWorldEval(): Promise { + await resetFakeAi(); + configureResponsesDirect(); + const notebook = createNotebook("eval-responses-direct.runme.md"); + const result = runBrowserEval({ + harness: { + adapter: "responses-direct", + name: "responses-direct-eval", + baseUrl: FAKE_AI_BASE_URL, + }, + notebookUri: notebook.uri, + prompt: `Add a cell to print("hello world")`, + timeoutMs: 45000, + }); + const assertions = [ + assert( + result.assistantText.includes("Cell has been added."), + "assistant text confirms the cell was added", + ), + assert( + Boolean( + result.notebook?.cells.some((cell) => cell.value.includes(`print("hello world")`)), + ), + "notebook snapshot contains the hello world cell", + ), + assert(result.metrics.ttfmMs !== null, "TTFM was recorded"), + assert(result.metrics.turnTimeMs > 0, "turn time was recorded"), + ]; + return { + name: "responses-direct adds hello world cell", + status: assertions.every((item) => item.ok) ? 
"PASS" : "FAIL", + assertions, + }; +} + +async function main(): Promise { + let frontendHandle: ServiceHandle | null = null; + let fakeAiHandle: ServiceHandle | null = null; + try { + frontendHandle = await ensureFrontend(); + fakeAiHandle = await ensureFakeAi(); + await bootstrapEvalBridge(); + + const results = [ + await runCodexProxyHelloWorldEval(), + await runResponsesDirectHelloWorldEval(), + ]; + + let failed = 0; + for (const result of results) { + console.log(`${result.status} ${result.name}`); + for (const assertion of result.assertions) { + console.log(` ${assertion.ok ? "PASS" : "FAIL"} ${assertion.message}`); + } + if (result.status === "FAIL") { + failed += 1; + } + } + + if (failed > 0) { + process.exitCode = 1; + } + } finally { + run("agent-browser close", 5000); + stopService(fakeAiHandle); + stopService(frontendHandle); + } +} + +await main(); diff --git a/docs-dev/design/20260421_minimal_evals.md b/docs-dev/design/20260421_minimal_evals.md index a61aab0..1f035c3 100644 --- a/docs-dev/design/20260421_minimal_evals.md +++ b/docs-dev/design/20260421_minimal_evals.md @@ -2,12 +2,19 @@ ## Status -Current proposal. +Initial implementation landed. -The core runtime seam exists. The remaining work is: +The core runtime seam now has: -- add a small browser-side eval entrypoint -- add a Node/TS driver that launches a real browser and calls that entrypoint +- a reusable page-scoped `CodeModeExecutor` builder +- a browser-side eval entrypoint exposed by `RunmeEvalHost` +- a Node/TS eval driver at `app/test/evals/runMinimalEvals.ts` +- first passing local evals for: + - `codex` via fake proxy/app-server + - `responses-direct` via fake `/v1/responses` + +The main remaining work is the real `codex-wasm` eval path against a live API +key/runtime configuration. ## Summary @@ -419,37 +426,36 @@ For evals, this means: ## Remaining Additions For Minimal Evals -### 1. 
Add a tiny browser-side eval entrypoint +The minimal eval foundation now exists in code: -We still need a small browser-side helper, likely test-only, that exposes the -runtime seam to Playwright/CDP. +- `app/src/lib/runtime/pageCodeModeExecutor.ts` +- `app/src/components/Evals/RunmeEvalHost.tsx` +- `app/test/evals/runMinimalEvals.ts` -That helper should do only this: +The remaining additions below are the still-open pieces beyond that first +working slice. -- construct the same runtime inputs `ChatKitPanel` constructs - - `codeModeExecutor` - - `codexBridgeHandler` for proxy mode - - auth resolver for proxy mode -- call `HarnessRuntimeManager.getOrCreate(...)` -- `start()` the runtime -- `streamUserMessage(...)` -- collect emitted events -- return notebook / trace / journal / OPFS snapshots +### 1. Extend the passing suite from local fake harnesses to real `codex-wasm` -This can be a small `window.__runmeEval` bridge or a test-only imported module. +The current executable slice proves the harness/runtime seam and the browser +entrypoint, but the passing local suite is still built around deterministic +fake backends. -### 2. Extract the page-scoped `codeModeExecutor` builder into a reusable helper +The next step is to add a real `codex-wasm` run mode that: -This is the recommended approach. +- supplies a real OpenAI API key +- points assertions at wasm journal rows and OPFS state +- verifies the worker-backed app-server path rather than the fake proxy path -The main logic that still lives in `ChatKitPanel` is the page-state wiring for: +### 2. 
Keep the shared page-scoped `codeModeExecutor` builder as the only notebook wiring seam -- `resolveCodeModeNotebook(...)` -- `listNotebooks(...)` -- renderer/notebook update hooks +This extraction has been implemented and should remain the single place that +builds the notebook-aware code execution environment used by: -For notebook-mutation evals, we should extract that logic into a reusable -helper rather than reconstruct it separately inside an eval driver. +- `ChatKitPanel` +- the eval bridge +- `codex-wasm` +- `responses-direct` #### Why extraction is the right approach @@ -564,7 +570,7 @@ With this extraction, the browser-side eval entrypoint can: That keeps the eval path aligned with the real notebook/runtime wiring instead of approximating it. -### 3. Add request/journal capture helpers +### 3. Add richer request/journal capture helpers Minimal evals should capture: @@ -575,21 +581,16 @@ Minimal evals should capture: This can be done in the browser helper rather than by changing production runtime APIs much further. -### 4. Add a Node/TS driver - -Add a script, for example: +### 4. Grow the Node/TS driver beyond the first local suite -- `app/test/evals/runCodexEval.ts` +The current driver is `app/test/evals/runMinimalEvals.ts`. 
-Responsibilities: +The next additions should be: -- launch headless Chromium -- open the app -- call the browser eval helper -- seed notebook state if needed -- submit prompt -- run assertions -- print structured pass/fail output +- explicit suite selection / filtering +- structured JSON result output for CI +- `codex-wasm`-specific configuration inputs +- agentic-search assertions over OPFS repo cache contents ## Initial Eval Suite diff --git a/testing/aiservice/main.go b/testing/aiservice/main.go index ff1d4b2..d9d3abf 100644 --- a/testing/aiservice/main.go +++ b/testing/aiservice/main.go @@ -250,6 +250,10 @@ func handleChatkit(w http.ResponseWriter, r *http.Request) { writeCodexChatkitSSE(w) return } + if r.URL.Path == "/v1/responses" { + writeResponsesDirectSSE(w, string(body)) + return + } events := []string{ `{"type":"response.created","response":{"id":"resp_cuj"}}`, `{"type":"response.output_text.delta","delta":"Fake assistant response from CUJ server."}`, @@ -262,6 +266,49 @@ func handleChatkit(w http.ResponseWriter, r *http.Request) { } } +func writeResponsesDirectSSE(w http.ResponseWriter, requestBody string) { + if strings.Contains(requestBody, `"type":"function_call_output"`) { + finalRespID := "resp_cuj_responses_2" + finalItemID := "msg_cuj_responses_2" + finalText := `Cell has been added.` + events := []string{ + fmt.Sprintf(`{"type":"response.created","response":{"id":"%s"}}`, finalRespID), + fmt.Sprintf(`{"type":"response.output_item.added","response_id":"%s","output_index":0,"item":{"id":"%s","type":"message","status":"in_progress","role":"assistant","content":[]}}`, finalRespID, finalItemID), + fmt.Sprintf(`{"type":"response.output_text.delta","response_id":"%s","output_index":0,"item_id":"%s","content_index":0,"delta":%q}`, finalRespID, finalItemID, finalText), + 
fmt.Sprintf(`{"type":"response.output_item.done","response_id":"%s","output_index":0,"item":{"id":"%s","type":"message","status":"completed","role":"assistant","content":[{"type":"output_text","text":%q}]}}`, finalRespID, finalItemID, finalText), + fmt.Sprintf(`{"type":"response.completed","response":{"id":"%s"}}`, finalRespID), + } + for _, event := range events { + writeSSE(w, event) + time.Sleep(8 * time.Millisecond) + } + return + } + + toolRespID := "resp_cuj_responses_1" + toolItemID := "tool_cuj_responses_1" + toolCallID := "call_cuj_responses_1" + code := strings.Join([]string{ + "const doc = await notebooks.get();", + "await notebooks.update({", + " target: { handle: doc.handle },", + ` operations: [{ op: "insert", at: { index: -1 }, cells: [{ kind: "code", languageId: "python", value: 'print("hello world")' }] }],`, + "});", + `console.log("Cell added.");`, + }, "\n") + toolArguments := fmt.Sprintf(`{"code":%q}`, code) + events := []string{ + fmt.Sprintf(`{"type":"response.created","response":{"id":"%s"}}`, toolRespID), + fmt.Sprintf(`{"type":"response.output_item.added","response_id":"%s","output_index":0,"item":{"id":"%s","type":"function_call","call_id":"%s","name":"ExecuteCode"}}`, toolRespID, toolItemID, toolCallID), + fmt.Sprintf(`{"type":"response.function_call_arguments.done","response_id":"%s","item_id":"%s","call_id":"%s","name":"ExecuteCode","arguments":%q}`, toolRespID, toolItemID, toolCallID, toolArguments), + fmt.Sprintf(`{"type":"response.completed","response":{"id":"%s"}}`, toolRespID), + } + for _, event := range events { + writeSSE(w, event) + time.Sleep(8 * time.Millisecond) + } +} + func writeCodexChatkitSSE(w http.ResponseWriter) { firstRespID := "resp_cuj_codex_1" secondRespID := "resp_cuj_codex_2" From 02603f1aae5e20fcc78f4ae1b0414d1cc414f0e5 Mon Sep 17 00:00:00 2001 From: Jeremy lewi Date: Wed, 22 Apr 2026 07:27:28 -0700 Subject: [PATCH 3/6] chore: ignore generated eval artifacts Signed-off-by: Jeremy lewi --- .gitignore | 1 + 
app/test/evals/.generated/runMinimalEvals.js | 254 ------------------- 2 files changed, 1 insertion(+), 254 deletions(-) delete mode 100644 app/test/evals/.generated/runMinimalEvals.js diff --git a/.gitignore b/.gitignore index d951afa..715057d 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,7 @@ packages/react-console/gen /app/.generated/ /app/assets/generated /app/test/browser/.generated/ +/app/test/evals/.generated/ /app/test/browser/test-output/* # Local checkout symlink diff --git a/app/test/evals/.generated/runMinimalEvals.js b/app/test/evals/.generated/runMinimalEvals.js deleted file mode 100644 index fadd4b0..0000000 --- a/app/test/evals/.generated/runMinimalEvals.js +++ /dev/null @@ -1,254 +0,0 @@ -import { spawn, spawnSync } from "node:child_process"; -import { createWriteStream, mkdirSync } from "node:fs"; -import { join, resolve } from "node:path"; -import { fileURLToPath } from "node:url"; -const CURRENT_FILE = fileURLToPath(import.meta.url); -const GENERATED_DIR = resolve(CURRENT_FILE, ".."); -const TEST_DIR = GENERATED_DIR.endsWith("/.generated") || GENERATED_DIR.endsWith("\\.generated") - ? resolve(GENERATED_DIR, "..") - : GENERATED_DIR; -const APP_ROOT = resolve(TEST_DIR, "..", ".."); -const REPO_ROOT = resolve(APP_ROOT, ".."); -const OUTPUT_DIR = join(TEST_DIR, "eval-output"); -const FRONTEND_URL = process.env.RUNME_EVAL_FRONTEND_URL ?? "http://localhost:5173"; -const FAKE_AI_BASE_URL = process.env.RUNME_EVAL_FAKE_AI_BASE_URL ?? "http://127.0.0.1:19989"; -const FAKE_AI_HEALTH_URL = `${FAKE_AI_BASE_URL}/healthz`; -const FAKE_AI_RESET_URL = `${FAKE_AI_BASE_URL}/reset`; -const AGENT_BROWSER_SESSION = process.env.AGENT_BROWSER_SESSION?.trim() ?? ""; -const AGENT_BROWSER_PROFILE = process.env.AGENT_BROWSER_PROFILE?.trim() ?? ""; -const AGENT_BROWSER_HEADED = (process.env.AGENT_BROWSER_HEADED ?? 
"false") - .trim() - .toLowerCase() === "true"; -mkdirSync(OUTPUT_DIR, { recursive: true }); -function shellQuote(value) { - return `'${value.replace(/'/g, `'\"'\"'`)}'`; -} -function withAgentBrowserOptions(command) { - const trimmed = command.trimStart(); - if (!trimmed.startsWith("agent-browser ")) { - return command; - } - const leadingWhitespace = command.slice(0, command.length - trimmed.length); - const subcommand = trimmed.slice("agent-browser ".length); - const args = []; - if (AGENT_BROWSER_SESSION) { - args.push("--session", shellQuote(AGENT_BROWSER_SESSION)); - } - if (AGENT_BROWSER_PROFILE) { - args.push("--profile", shellQuote(AGENT_BROWSER_PROFILE)); - } - if (AGENT_BROWSER_HEADED) { - args.push("--headed"); - } - return `${leadingWhitespace}${["agent-browser", ...args].join(" ")} ${subcommand}`; -} -function run(command, timeoutMs = 30000) { - const effectiveCommand = withAgentBrowserOptions(command); - const result = spawnSync(effectiveCommand, { - shell: true, - encoding: "utf-8", - timeout: timeoutMs, - killSignal: "SIGKILL", - cwd: REPO_ROOT, - }); - return { - status: result.status ?? 1, - stdout: result.stdout ?? "", - stderr: result.stderr ?? "", - }; -} -function runOrThrow(command, timeoutMs = 30000) { - const result = run(command, timeoutMs); - if (result.status !== 0) { - throw new Error(`Command failed: ${command}\n${result.stderr}`); - } - return result.stdout; -} -async function fetchWithTimeout(input, timeoutMs = 15000, init) { - return await fetch(input, { - ...init, - signal: init?.signal ?? 
AbortSignal.timeout(timeoutMs), - }); -} -function isHttpReady(url) { - return run(`curl -sf ${shellQuote(url)}`, 5000).status === 0; -} -async function waitForHttpReady(url, timeoutMs = 45000) { - const deadline = Date.now() + timeoutMs; - while (Date.now() < deadline) { - if (isHttpReady(url)) { - return; - } - await new Promise((resolve) => setTimeout(resolve, 500)); - } - throw new Error(`Timed out waiting for ${url}`); -} -function startService(name, command, cwd) { - const logPath = join(OUTPUT_DIR, `${name}.log`); - const logStream = createWriteStream(logPath, { flags: "a" }); - const child = spawn(command, { - cwd, - shell: true, - stdio: ["ignore", "pipe", "pipe"], - env: process.env, - }); - child.stdout?.pipe(logStream); - child.stderr?.pipe(logStream); - return { name, process: child, logStream }; -} -function stopService(handle) { - if (!handle) { - return; - } - handle.process.kill("SIGTERM"); - handle.logStream.end(); -} -async function ensureFrontend() { - try { - await waitForHttpReady(FRONTEND_URL, 3000); - return null; - } - catch { - const handle = startService("frontend", "pnpm -C app run dev -- --host 127.0.0.1 --port 5173", REPO_ROOT); - await waitForHttpReady(FRONTEND_URL); - return handle; - } -} -async function ensureFakeAi() { - try { - await waitForHttpReady(FAKE_AI_HEALTH_URL, 3000); - return null; - } - catch { - const handle = startService("fake-ai", `go run ${shellQuote(join(REPO_ROOT, "testing", "aiservice", "main.go"))}`, REPO_ROOT); - await waitForHttpReady(FAKE_AI_HEALTH_URL); - return handle; - } -} -async function resetFakeAi() { - await fetchWithTimeout(FAKE_AI_RESET_URL, 5000, { method: "POST" }); -} -function escapeForDoubleQuotes(value) { - return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"'); -} -function evaluateJson(script, timeoutMs = 30000) { - const executable = script.trim().startsWith("(async") - ? 
script - : `(async () => { return ${script}; })()`; - const wrapped = `agent-browser eval "${escapeForDoubleQuotes(executable)}"`; - const stdout = runOrThrow(wrapped, timeoutMs).trim(); - const normalized = stdout.startsWith('"') ? JSON.parse(stdout) : stdout; - if (typeof normalized !== "string") { - return normalized; - } - return JSON.parse(normalized); -} -async function bootstrapEvalBridge() { - run("agent-browser close", 5000); - runOrThrow(`agent-browser open ${FRONTEND_URL}`, 30000); - run("agent-browser wait 2500", 5000); - evaluateJson(`JSON.stringify(await window.__runmeEval.waitUntilReady())`, 30000); -} -function createNotebook(name) { - return evaluateJson(`JSON.stringify(await window.__runmeEval.createLocalNotebook({ - name: ${JSON.stringify(name)}, - cells: [], - open: true - }))`, 30000); -} -function configureResponsesDirect() { - evaluateJson(`JSON.stringify((await window.__runmeEval.configureResponsesDirect({ - authMethod: 'api_key', - apiKey: 'test-key' - }), true))`, 15000); -} -function runBrowserEval(options) { - return evaluateJson(`JSON.stringify(await window.__runmeEval.run(${JSON.stringify(options)}))`, 90000); -} -function assert(ok, message) { - return { ok, message }; -} -async function runCodexProxyHelloWorldEval() { - await resetFakeAi(); - const notebook = createNotebook("eval-codex-proxy.runme.md"); - const result = runBrowserEval({ - harness: { - adapter: "codex", - name: "codex-proxy-eval", - baseUrl: FAKE_AI_BASE_URL, - }, - notebookUri: notebook.uri, - prompt: `Add a cell to print("hello world")`, - timeoutMs: 45000, - }); - const assertions = [ - assert(result.assistantText.includes("Cell has been added."), "assistant text confirms the cell was added"), - assert(result.requestLog.some((entry) => entry.method === "turn/start"), "request log includes turn/start"), - assert(Boolean(result.notebook?.cells.some((cell) => cell.value.includes(`print("hello world")`))), "notebook snapshot contains the hello world cell"), - 
assert(result.metrics.ttfmMs !== null, "TTFM was recorded"), - assert(result.metrics.turnTimeMs > 0, "turn time was recorded"), - ]; - return { - name: "codex proxy adds hello world cell", - status: assertions.every((item) => item.ok) ? "PASS" : "FAIL", - assertions, - }; -} -async function runResponsesDirectHelloWorldEval() { - await resetFakeAi(); - configureResponsesDirect(); - const notebook = createNotebook("eval-responses-direct.runme.md"); - const result = runBrowserEval({ - harness: { - adapter: "responses-direct", - name: "responses-direct-eval", - baseUrl: FAKE_AI_BASE_URL, - }, - notebookUri: notebook.uri, - prompt: `Add a cell to print("hello world")`, - timeoutMs: 45000, - }); - const assertions = [ - assert(result.assistantText.includes("Cell has been added."), "assistant text confirms the cell was added"), - assert(Boolean(result.notebook?.cells.some((cell) => cell.value.includes(`print("hello world")`))), "notebook snapshot contains the hello world cell"), - assert(result.metrics.ttfmMs !== null, "TTFM was recorded"), - assert(result.metrics.turnTimeMs > 0, "turn time was recorded"), - ]; - return { - name: "responses-direct adds hello world cell", - status: assertions.every((item) => item.ok) ? "PASS" : "FAIL", - assertions, - }; -} -async function main() { - let frontendHandle = null; - let fakeAiHandle = null; - try { - frontendHandle = await ensureFrontend(); - fakeAiHandle = await ensureFakeAi(); - await bootstrapEvalBridge(); - const results = [ - await runCodexProxyHelloWorldEval(), - await runResponsesDirectHelloWorldEval(), - ]; - let failed = 0; - for (const result of results) { - console.log(`${result.status} ${result.name}`); - for (const assertion of result.assertions) { - console.log(` ${assertion.ok ? 
"PASS" : "FAIL"} ${assertion.message}`); - } - if (result.status === "FAIL") { - failed += 1; - } - } - if (failed > 0) { - process.exitCode = 1; - } - } - finally { - run("agent-browser close", 5000); - stopService(fakeAiHandle); - stopService(frontendHandle); - } -} -await main(); From 0f0aea7f12d5beefa7f637bb41361575a97297fc Mon Sep 17 00:00:00 2001 From: Jeremy lewi Date: Wed, 22 Apr 2026 07:28:56 -0700 Subject: [PATCH 4/6] chore: default eval driver to dedicated frontend port Signed-off-by: Jeremy lewi --- app/test/evals/runMinimalEvals.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/app/test/evals/runMinimalEvals.ts b/app/test/evals/runMinimalEvals.ts index 9b062fa..19d77c9 100644 --- a/app/test/evals/runMinimalEvals.ts +++ b/app/test/evals/runMinimalEvals.ts @@ -41,10 +41,11 @@ const TEST_DIR = GENERATED_DIR.endsWith("/.generated") || GENERATED_DIR.endsWith const APP_ROOT = resolve(TEST_DIR, "..", ".."); const REPO_ROOT = resolve(APP_ROOT, ".."); const OUTPUT_DIR = join(TEST_DIR, "eval-output"); -const FRONTEND_URL = process.env.RUNME_EVAL_FRONTEND_URL ?? "http://localhost:5173"; +const FRONTEND_URL = process.env.RUNME_EVAL_FRONTEND_URL ?? "http://localhost:5174"; const FAKE_AI_BASE_URL = process.env.RUNME_EVAL_FAKE_AI_BASE_URL ?? "http://127.0.0.1:19989"; const FAKE_AI_HEALTH_URL = `${FAKE_AI_BASE_URL}/healthz`; const FAKE_AI_RESET_URL = `${FAKE_AI_BASE_URL}/reset`; +const FRONTEND_PORT = new URL(FRONTEND_URL).port || "5174"; const AGENT_BROWSER_SESSION = process.env.AGENT_BROWSER_SESSION?.trim() ?? ""; const AGENT_BROWSER_PROFILE = process.env.AGENT_BROWSER_PROFILE?.trim() ?? ""; const AGENT_BROWSER_HEADED = (process.env.AGENT_BROWSER_HEADED ?? 
"false") @@ -160,7 +161,7 @@ async function ensureFrontend(): Promise { } catch { const handle = startService( "frontend", - "pnpm -C app run dev -- --host 127.0.0.1 --port 5173", + `pnpm -C app run dev -- --host localhost --port ${FRONTEND_PORT}`, REPO_ROOT, ); await waitForHttpReady(FRONTEND_URL); From 69150141d1aaeedc8714cab2cfb23bc6700cbcad Mon Sep 17 00:00:00 2001 From: Jeremy lewi Date: Wed, 22 Apr 2026 19:56:56 -0700 Subject: [PATCH 5/6] fix: support live codex wasm evals Signed-off-by: Jeremy lewi --- .../components/ChatKit/ChatKitPanel.test.tsx | 1 - .../codexConversationController.test.ts | 48 ++++++ .../runtime/codexConversationController.ts | 21 ++- app/src/lib/runtime/runmeChatkitPrompts.ts | 3 - app/test/evals/runMinimalEvals.ts | 140 ++++++++++++++++-- 5 files changed, 192 insertions(+), 21 deletions(-) diff --git a/app/src/components/ChatKit/ChatKitPanel.test.tsx b/app/src/components/ChatKit/ChatKitPanel.test.tsx index e77c72c..df073a1 100644 --- a/app/src/components/ChatKit/ChatKitPanel.test.tsx +++ b/app/src/components/ChatKit/ChatKitPanel.test.tsx @@ -375,7 +375,6 @@ describe("ChatKitPanel codex harness routing", () => { expect(proxyMock.connectWasm).toHaveBeenCalledWith({ apiKey: "sk-test", sessionOptions: expect.objectContaining({ - cwd: "/workspace", instructions: expect.objectContaining({ developer: expect.stringContaining( "Executed JavaScript runs inside the Runme AppKernel runtime.", diff --git a/app/src/lib/runtime/codexConversationController.test.ts b/app/src/lib/runtime/codexConversationController.test.ts index dc69168..910e770 100644 --- a/app/src/lib/runtime/codexConversationController.test.ts +++ b/app/src/lib/runtime/codexConversationController.test.ts @@ -16,6 +16,7 @@ const project = { const notificationHandlers = new Set<(notification: any) => void>(); const proxyClient = { sendRequest: vi.fn(), + getTransport: vi.fn(() => "proxy"), subscribeNotifications: vi.fn((handler: (notification: any) => void) => { 
notificationHandlers.add(handler); return () => { @@ -285,6 +286,8 @@ describe("CodexConversationController", () => { beforeEach(() => { vi.useRealTimers(); proxyClient.sendRequest.mockReset(); + proxyClient.getTransport.mockReset(); + proxyClient.getTransport.mockReturnValue("proxy"); proxyClient.subscribeNotifications.mockClear(); notificationHandlers.clear(); projectManager.setDefault.mockClear(); @@ -313,6 +316,20 @@ describe("CodexConversationController", () => { ]); }); + it("omits cwd when refreshing history over the wasm transport", async () => { + proxyClient.getTransport.mockReturnValue("wasm"); + proxyClient.sendRequest.mockResolvedValueOnce({ + threads: [ + { id: "thread-1", title: "One", last_turn_id: "turn-1" }, + ], + }); + + const controller = createCodexConversationControllerForTests(); + await controller.refreshHistory(); + + expect(proxyClient.sendRequest).toHaveBeenCalledWith("thread/list", {}); + }); + it("ensures an active thread by creating one when no current thread exists", async () => { proxyClient.sendRequest.mockImplementation(async (method: string) => { if (method === "thread/start") { @@ -352,6 +369,37 @@ describe("CodexConversationController", () => { expect(controller.getSnapshot().currentThreadId).toBe("thread-bootstrap"); }); + it("omits cwd when creating a thread over the wasm transport", async () => { + proxyClient.getTransport.mockReturnValue("wasm"); + proxyClient.sendRequest.mockImplementation(async (method: string) => { + if (method === "thread/start") { + return { + thread: { + id: "thread-bootstrap", + title: "Bootstrap Thread", + }, + }; + } + throw new Error(`unexpected method ${method}`); + }); + + const controller = createCodexConversationControllerForTests(); + await controller.ensureActiveThread(); + + expect(proxyClient.sendRequest).toHaveBeenCalledWith( + "thread/start", + expect.objectContaining({ + projectId: "project-1", + model: "gpt-5.4", + approvalPolicy: "never", + sandboxPolicy: "workspace-write", + }), + ); 
+    expect(
+      (proxyClient.sendRequest.mock.calls.at(-1)?.[1] as Record<string, unknown>)?.cwd,
+    ).toBeUndefined();
+  });
+
   it("uses a per-request model override for thread start and turn start", async () => {
     proxyClient.sendRequest.mockImplementation(async (method: string, params?: unknown) => {
       if (method === "thread/start") {
diff --git a/app/src/lib/runtime/codexConversationController.ts b/app/src/lib/runtime/codexConversationController.ts
index f52c121..7ace3ef 100644
--- a/app/src/lib/runtime/codexConversationController.ts
+++ b/app/src/lib/runtime/codexConversationController.ts
@@ -476,9 +476,10 @@ class CodexConversationController {
     this.historyError = null;
     this.notify();
     try {
-      const result = await proxy.sendRequest("thread/list", {
-        cwd: project.cwd,
-      });
+      const result = await proxy.sendRequest(
+        "thread/list",
+        this.buildProjectScope(project),
+      );
       const record = asRecord(result);
       const entries = Array.isArray(record.threads)
         ? (record.threads as unknown[])
         :
@@ -1126,7 +1127,7 @@ private buildProjectDefaults(project: CodexProject, modelOverride?: string): JsonRecord {
     return {
       projectId: project.id,
-      cwd: project.cwd,
+      ...this.buildProjectScope(project),
       model: asString(modelOverride) ??
project.model, approvalPolicy: project.approvalPolicy, sandboxPolicy: project.sandboxPolicy, @@ -1136,6 +1137,18 @@ class CodexConversationController { }; } + private buildProjectScope(project: CodexProject): JsonRecord { + const client = getCodexAppServerClient() as { + getTransport?: () => string; + }; + if (client.getTransport?.() === "wasm") { + return {}; + } + return { + cwd: project.cwd, + }; + } + private listThreadsForProject(projectId: string): CodexConversationThread[] { const project = getCodexProjectManager().get(projectId); if (!project) { diff --git a/app/src/lib/runtime/runmeChatkitPrompts.ts b/app/src/lib/runtime/runmeChatkitPrompts.ts index b023503..f5019ad 100644 --- a/app/src/lib/runtime/runmeChatkitPrompts.ts +++ b/app/src/lib/runtime/runmeChatkitPrompts.ts @@ -57,8 +57,6 @@ export const RUNME_RESPONSES_DIRECT_INSTRUCTIONS = [ RUNME_SHARED_RUNTIME_INSTRUCTIONS, ].join('\n') -export const RUNME_CODEX_WASM_CWD = '/workspace' - const RUNME_CODEX_WASM_OVERLAY = [ 'When you need to inspect or modify notebooks, use Codex code mode.', 'Codex code mode executes JavaScript in the same Runme AppKernel runtime described above.', @@ -71,7 +69,6 @@ export const RUNME_CODEX_WASM_DEVELOPER_INSTRUCTIONS = [ export function buildRunmeCodexWasmSessionOptions(): BrowserSessionOptions { return { - cwd: RUNME_CODEX_WASM_CWD, instructions: { developer: RUNME_CODEX_WASM_DEVELOPER_INSTRUCTIONS, }, diff --git a/app/test/evals/runMinimalEvals.ts b/app/test/evals/runMinimalEvals.ts index 19d77c9..97a7e7a 100644 --- a/app/test/evals/runMinimalEvals.ts +++ b/app/test/evals/runMinimalEvals.ts @@ -1,5 +1,5 @@ import { spawn, spawnSync, type ChildProcess } from "node:child_process"; -import { createWriteStream, mkdirSync } from "node:fs"; +import { createWriteStream, mkdirSync, readFileSync } from "node:fs"; import { join, resolve } from "node:path"; import { fileURLToPath } from "node:url"; @@ -17,6 +17,7 @@ type EvalResultSummary = { type RunmeEvalResult = { assistantText: 
string; requestLog: Array<{ method: string }>; + wasmJournal: unknown[]; notebook: { uri: string; cells: Array<{ value: string }>; @@ -42,10 +43,28 @@ const APP_ROOT = resolve(TEST_DIR, "..", ".."); const REPO_ROOT = resolve(APP_ROOT, ".."); const OUTPUT_DIR = join(TEST_DIR, "eval-output"); const FRONTEND_URL = process.env.RUNME_EVAL_FRONTEND_URL ?? "http://localhost:5174"; +const EVAL_BACKEND = (process.env.RUNME_EVAL_BACKEND ?? "fake").trim().toLowerCase(); const FAKE_AI_BASE_URL = process.env.RUNME_EVAL_FAKE_AI_BASE_URL ?? "http://127.0.0.1:19989"; const FAKE_AI_HEALTH_URL = `${FAKE_AI_BASE_URL}/healthz`; const FAKE_AI_RESET_URL = `${FAKE_AI_BASE_URL}/reset`; const FRONTEND_PORT = new URL(FRONTEND_URL).port || "5174"; +const DEFAULT_OPENAI_API_KEY_FILE = + "/Users/jlewi/secrets/openai.org-openai-internal-project-aisre-name-oaictl-jlewi.key"; +const OPENAI_API_KEY_FILE = + process.env.RUNME_EVAL_OPENAI_API_KEY_FILE ?? + process.env.OPENAI_API_KEY_FILE ?? + DEFAULT_OPENAI_API_KEY_FILE; +const OPENAI_RESPONSES_BASE_URL = + process.env.RUNME_EVAL_OPENAI_BASE_URL ?? "https://api.openai.com"; +const OPENAI_ORGANIZATION = + process.env.RUNME_EVAL_OPENAI_ORGANIZATION ?? + process.env.OPENAI_ORGANIZATION ?? + ""; +const OPENAI_PROJECT = + process.env.RUNME_EVAL_OPENAI_PROJECT ?? + process.env.OPENAI_PROJECT_ID ?? + process.env.OPENAI_PROJECT ?? + ""; const AGENT_BROWSER_SESSION = process.env.AGENT_BROWSER_SESSION?.trim() ?? ""; const AGENT_BROWSER_PROFILE = process.env.AGENT_BROWSER_PROFILE?.trim() ?? ""; const AGENT_BROWSER_HEADED = (process.env.AGENT_BROWSER_HEADED ?? "false") @@ -58,6 +77,22 @@ function shellQuote(value: string): string { return `'${value.replace(/'/g, `'\"'\"'`)}'`; } +function isLiveBackend(): boolean { + return EVAL_BACKEND === "live" || EVAL_BACKEND === "openai" || EVAL_BACKEND === "real"; +} + +function resolveOpenAIApiKey(): string { + const direct = process.env.RUNME_EVAL_OPENAI_API_KEY?.trim() ?? process.env.OPENAI_API_KEY?.trim() ?? 
""; + if (direct) { + return direct; + } + try { + return readFileSync(OPENAI_API_KEY_FILE, "utf-8").trim(); + } catch { + return ""; + } +} + function withAgentBrowserOptions(command: string): string { const trimmed = command.trimStart(); if (!trimmed.startsWith("agent-browser ")) { @@ -170,6 +205,9 @@ async function ensureFrontend(): Promise { } async function ensureFakeAi(): Promise { + if (isLiveBackend()) { + return null; + } try { await waitForHttpReady(FAKE_AI_HEALTH_URL, 3000); return null; @@ -185,6 +223,9 @@ async function ensureFakeAi(): Promise { } async function resetFakeAi(): Promise { + if (isLiveBackend()) { + return; + } await fetchWithTimeout(FAKE_AI_RESET_URL, 5000, { method: "POST" }); } @@ -226,11 +267,18 @@ function createNotebook(name: string): { uri: string } { ); } -function configureResponsesDirect(): void { +function configureResponsesDirect(options?: { + authMethod?: "oauth" | "api_key"; + apiKey?: string; + openaiOrganization?: string; + openaiProject?: string; +}): void { evaluateJson( `JSON.stringify((await window.__runmeEval.configureResponsesDirect({ - authMethod: 'api_key', - apiKey: 'test-key' + authMethod: ${JSON.stringify(options?.authMethod ?? "api_key")}, + apiKey: ${JSON.stringify(options?.apiKey ?? "test-key")}, + openaiOrganization: ${JSON.stringify(options?.openaiOrganization ?? "")}, + openaiProject: ${JSON.stringify(options?.openaiProject ?? "")} }), true))`, 15000, ); @@ -287,13 +335,24 @@ async function runCodexProxyHelloWorldEval(): Promise { async function runResponsesDirectHelloWorldEval(): Promise { await resetFakeAi(); - configureResponsesDirect(); + const apiKey = isLiveBackend() ? resolveOpenAIApiKey() : "test-key"; + if (isLiveBackend() && !apiKey) { + throw new Error( + `RUNME_EVAL_BACKEND=live requires an API key. 
Set OPENAI_API_KEY or place it in ${OPENAI_API_KEY_FILE}.`, + ); + } + configureResponsesDirect({ + authMethod: "api_key", + apiKey, + openaiOrganization: OPENAI_ORGANIZATION, + openaiProject: OPENAI_PROJECT, + }); const notebook = createNotebook("eval-responses-direct.runme.md"); const result = runBrowserEval({ harness: { adapter: "responses-direct", name: "responses-direct-eval", - baseUrl: FAKE_AI_BASE_URL, + baseUrl: isLiveBackend() ? OPENAI_RESPONSES_BASE_URL : FAKE_AI_BASE_URL, }, notebookUri: notebook.uri, prompt: `Add a cell to print("hello world")`, @@ -301,8 +360,12 @@ async function runResponsesDirectHelloWorldEval(): Promise<EvalResultSummary> { }); const assertions = [ assert( - result.assistantText.includes("Cell has been added."), - "assistant text confirms the cell was added", + isLiveBackend() + ? result.assistantText.trim().length > 0 + : result.assistantText.includes("Cell has been added."), + isLiveBackend() + ? "assistant text was emitted" + : "assistant text confirms the cell was added", ), assert( Boolean( @@ -314,7 +377,53 @@ async function runResponsesDirectHelloWorldEval(): Promise<EvalResultSummary> { assert(result.metrics.turnTimeMs > 0, "turn time was recorded"), ]; return { - name: "responses-direct adds hello world cell", + name: `${isLiveBackend() ? "responses-direct live" : "responses-direct"} adds hello world cell`, status: assertions.every((item) => item.ok) ? "PASS" : "FAIL", assertions, }; } + +async function runCodexWasmHelloWorldEval(): Promise<EvalResultSummary> { + const apiKey = resolveOpenAIApiKey(); + if (!apiKey) { + throw new Error( + `RUNME_EVAL_BACKEND=live requires an API key. 
Set OPENAI_API_KEY or place it in ${OPENAI_API_KEY_FILE}.`, + ); + } + configureResponsesDirect({ + authMethod: "api_key", + apiKey, + openaiOrganization: OPENAI_ORGANIZATION, + openaiProject: OPENAI_PROJECT, + }); + const notebook = createNotebook("eval-codex-wasm.runme.md"); + const result = runBrowserEval({ + harness: { + adapter: "codex-wasm", + name: "codex-wasm-eval", + }, + notebookUri: notebook.uri, + prompt: `Add a cell to print("hello world")`, + timeoutMs: 90000, + wasmApiKey: apiKey, + }); + const assertions = [ + assert( + result.assistantText.length > 0, + "assistant text was emitted", + ), + assert( + Boolean( + result.notebook?.cells.some((cell) => cell.value.includes(`hello world`)), + ), + "notebook snapshot contains the hello world cell", + ), + assert(result.metrics.ttfmMs !== null, "TTFM was recorded"), + assert(result.metrics.turnTimeMs > 0, "turn time was recorded"), + assert(result.wasmJournal.length > 0, "wasm journal captured activity"), + ]; + return { + name: "codex-wasm live adds hello world cell", status: assertions.every((item) => item.ok) ? "PASS" : "FAIL", assertions, }; } @@ -328,10 +437,15 @@ async function main(): Promise<void> { fakeAiHandle = await ensureFakeAi(); await bootstrapEvalBridge(); - const results = [ - await runCodexProxyHelloWorldEval(), - await runResponsesDirectHelloWorldEval(), - ]; + const results = isLiveBackend() ? 
[ + await runResponsesDirectHelloWorldEval(), + await runCodexWasmHelloWorldEval(), + ] + : [ + await runCodexProxyHelloWorldEval(), + await runResponsesDirectHelloWorldEval(), + ]; let failed = 0; for (const result of results) { From 82479c38d8ef83232cf82d4f240a2c1a8b526fee Mon Sep 17 00:00:00 2001 From: Jeremy lewi Date: Sat, 25 Apr 2026 12:17:43 -0700 Subject: [PATCH 6/6] feat: export eval artifacts as json Signed-off-by: Jeremy lewi --- .gitignore | 1 + app/test/evals/runMinimalEvals.ts | 203 ++++++++++++++++++++++++------ 2 files changed, 168 insertions(+), 36 deletions(-) diff --git a/.gitignore b/.gitignore index 715057d..1097683 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,7 @@ packages/react-console/gen /app/assets/generated /app/test/browser/.generated/ /app/test/evals/.generated/ +/app/test/evals/eval-output/* /app/test/browser/test-output/* # Local checkout symlink diff --git a/app/test/evals/runMinimalEvals.ts b/app/test/evals/runMinimalEvals.ts index 97a7e7a..ea4cea5 100644 --- a/app/test/evals/runMinimalEvals.ts +++ b/app/test/evals/runMinimalEvals.ts @@ -1,5 +1,10 @@ import { spawn, spawnSync, type ChildProcess } from "node:child_process"; -import { createWriteStream, mkdirSync, readFileSync } from "node:fs"; +import { + createWriteStream, + mkdirSync, + readFileSync, + writeFileSync, +} from "node:fs"; import { join, resolve } from "node:path"; import { fileURLToPath } from "node:url"; @@ -8,43 +13,87 @@ type EvalAssertion = { message: string; }; -type EvalResultSummary = { - name: string; - status: "PASS" | "FAIL"; - assertions: EvalAssertion[]; +type EvalRequestLogEntry = { + method: string; + timestamp?: string; + params?: unknown; }; type RunmeEvalResult = { + harness: { + name: string; + adapter: string; + baseUrl: string; + }; + prompt: string; + threadId: string | null; assistantText: string; - requestLog: Array<{ method: string }>; + requestLog: EvalRequestLogEntry[]; + notifications: unknown[]; wasmJournal: unknown[]; - notebook: 
{ - uri: string; - cells: Array<{ value: string }>; - } | null; + notebook: + | { + uri: string; + name: string; + cells: Array<{ + refId: string; + languageId: string; + value: string; + }>; + } + | null; + opfs: Array<{ + path: string; + kind: "file" | "directory"; + size?: number; + }>; metrics: { ttfmMs: number | null; turnTimeMs: number; }; }; +type EvalResultSummary = { + name: string; + status: "PASS" | "FAIL"; + assertions: EvalAssertion[]; + result: RunmeEvalResult; +}; + type ServiceHandle = { name: string; process: ChildProcess; logStream: ReturnType<typeof createWriteStream>; }; +type EvalArtifactManifestEntry = { + index: number; + name: string; + status: "PASS" | "FAIL"; + file: string; + harness: string; + ttfmMs: number | null; + turnTimeMs: number; +}; + const CURRENT_FILE = fileURLToPath(import.meta.url); const GENERATED_DIR = resolve(CURRENT_FILE, ".."); -const TEST_DIR = GENERATED_DIR.endsWith("/.generated") || GENERATED_DIR.endsWith("\\.generated") - ? resolve(GENERATED_DIR, "..") - : GENERATED_DIR; +const TEST_DIR = + GENERATED_DIR.endsWith("/.generated") || + GENERATED_DIR.endsWith("\\.generated") + ? resolve(GENERATED_DIR, "..") + : GENERATED_DIR; const APP_ROOT = resolve(TEST_DIR, "..", ".."); const REPO_ROOT = resolve(APP_ROOT, ".."); -const OUTPUT_DIR = join(TEST_DIR, "eval-output"); -const FRONTEND_URL = process.env.RUNME_EVAL_FRONTEND_URL ?? "http://localhost:5174"; -const EVAL_BACKEND = (process.env.RUNME_EVAL_BACKEND ?? "fake").trim().toLowerCase(); -const FAKE_AI_BASE_URL = process.env.RUNME_EVAL_FAKE_AI_BASE_URL ?? "http://127.0.0.1:19989"; +const OUTPUT_ROOT = + process.env.RUNME_EVAL_OUTPUT_DIR ?? join(TEST_DIR, "eval-output"); +const FRONTEND_URL = + process.env.RUNME_EVAL_FRONTEND_URL ?? "http://localhost:5174"; +const EVAL_BACKEND = (process.env.RUNME_EVAL_BACKEND ?? "fake") + .trim() + .toLowerCase(); +const FAKE_AI_BASE_URL = + process.env.RUNME_EVAL_FAKE_AI_BASE_URL ?? 
"http://127.0.0.1:19989"; const FAKE_AI_HEALTH_URL = `${FAKE_AI_BASE_URL}/healthz`; const FAKE_AI_RESET_URL = `${FAKE_AI_BASE_URL}/reset`; const FRONTEND_PORT = new URL(FRONTEND_URL).port || "5174"; @@ -70,19 +119,44 @@ const AGENT_BROWSER_PROFILE = process.env.AGENT_BROWSER_PROFILE?.trim() ?? ""; const AGENT_BROWSER_HEADED = (process.env.AGENT_BROWSER_HEADED ?? "false") .trim() .toLowerCase() === "true"; +const RUN_ID = + process.env.RUNME_EVAL_RUN_ID?.trim() || + new Date().toISOString().replace(/[:.]/g, "-"); +const RUN_OUTPUT_DIR = join(OUTPUT_ROOT, RUN_ID); -mkdirSync(OUTPUT_DIR, { recursive: true }); +mkdirSync(RUN_OUTPUT_DIR, { recursive: true }); function shellQuote(value: string): string { return `'${value.replace(/'/g, `'\"'\"'`)}'`; } +function slugify(value: string): string { + return ( + value + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, "") + .slice(0, 80) || "eval" + ); +} + +function writeJsonFile(path: string, data: unknown): void { + writeFileSync(path, `${JSON.stringify(data, null, 2)}\n`, "utf-8"); +} + function isLiveBackend(): boolean { - return EVAL_BACKEND === "live" || EVAL_BACKEND === "openai" || EVAL_BACKEND === "real"; + return ( + EVAL_BACKEND === "live" || + EVAL_BACKEND === "openai" || + EVAL_BACKEND === "real" + ); } function resolveOpenAIApiKey(): string { - const direct = process.env.RUNME_EVAL_OPENAI_API_KEY?.trim() ?? process.env.OPENAI_API_KEY?.trim() ?? ""; + const direct = + process.env.RUNME_EVAL_OPENAI_API_KEY?.trim() ?? + process.env.OPENAI_API_KEY?.trim() ?? 
+ ""; if (direct) { return direct; } @@ -113,11 +187,10 @@ function withAgentBrowserOptions(command: string): string { return `${leadingWhitespace}${["agent-browser", ...args].join(" ")} ${subcommand}`; } -function run(command: string, timeoutMs = 30000): { - status: number; - stdout: string; - stderr: string; -} { +function run( + command: string, + timeoutMs = 30000, +): { status: number; stdout: string; stderr: string } { const effectiveCommand = withAgentBrowserOptions(command); const result = spawnSync(effectiveCommand, { shell: true, @@ -156,19 +229,26 @@ function isHttpReady(url: string): boolean { return run(`curl -sf ${shellQuote(url)}`, 5000).status === 0; } -async function waitForHttpReady(url: string, timeoutMs = 45000): Promise { +async function waitForHttpReady( + url: string, + timeoutMs = 45000, +): Promise { const deadline = Date.now() + timeoutMs; while (Date.now() < deadline) { if (isHttpReady(url)) { return; } - await new Promise((resolve) => setTimeout(resolve, 500)); + await new Promise((resolveTimer) => setTimeout(resolveTimer, 500)); } throw new Error(`Timed out waiting for ${url}`); } -function startService(name: string, command: string, cwd: string): ServiceHandle { - const logPath = join(OUTPUT_DIR, `${name}.log`); +function startService( + name: string, + command: string, + cwd: string, +): ServiceHandle { + const logPath = join(RUN_OUTPUT_DIR, `${name}.log`); const logStream = createWriteStream(logPath, { flags: "a" }); const child = spawn(command, { cwd, @@ -284,7 +364,7 @@ function configureResponsesDirect(options?: { ); } -function runBrowserEval(options: object): RunmeEvalResult { +function runBrowserEval(options: Record): RunmeEvalResult { return evaluateJson( `JSON.stringify(await window.__runmeEval.run(${JSON.stringify(options)}))`, 90000, @@ -295,6 +375,36 @@ function assert(ok: boolean, message: string): EvalAssertion { return { ok, message }; } +function writeEvalArtifact( + index: number, + summary: EvalResultSummary, +): 
EvalArtifactManifestEntry { + const filename = `${String(index + 1).padStart(2, "0")}-${slugify(summary.name)}.json`; + writeJsonFile(join(RUN_OUTPUT_DIR, filename), { + schemaVersion: 1, + runId: RUN_ID, + createdAt: new Date().toISOString(), + backend: EVAL_BACKEND, + frontendUrl: FRONTEND_URL, + eval: { + index, + name: summary.name, + status: summary.status, + assertions: summary.assertions, + }, + result: summary.result, + }); + return { + index, + name: summary.name, + status: summary.status, + file: filename, + harness: summary.result.harness.adapter, + ttfmMs: summary.result.metrics.ttfmMs, + turnTimeMs: summary.result.metrics.turnTimeMs, + }; +} + async function runCodexProxyHelloWorldEval(): Promise<EvalResultSummary> { await resetFakeAi(); const notebook = createNotebook("eval-codex-proxy.runme.md"); @@ -319,7 +429,9 @@ async function runCodexProxyHelloWorldEval(): Promise<EvalResultSummary> { ), assert( Boolean( - result.notebook?.cells.some((cell) => cell.value.includes(`print("hello world")`)), + result.notebook?.cells.some((cell) => + cell.value.includes(`print("hello world")`), + ), ), "notebook snapshot contains the hello world cell", ), @@ -330,6 +442,7 @@ async function runCodexProxyHelloWorldEval(): Promise<EvalResultSummary> { name: "codex proxy adds hello world cell", status: assertions.every((item) => item.ok) ? "PASS" : "FAIL", assertions, + result, }; } @@ -369,7 +482,9 @@ async function runResponsesDirectHelloWorldEval(): Promise<EvalResultSummary> { ), assert( Boolean( - result.notebook?.cells.some((cell) => cell.value.includes(`print("hello world")`)), + result.notebook?.cells.some((cell) => + cell.value.includes(`print("hello world")`), + ), ), "notebook snapshot contains the hello world cell", ), @@ -380,6 +495,7 @@ async function runResponsesDirectHelloWorldEval(): Promise<EvalResultSummary> { name: `${isLiveBackend() ? "responses-direct live" : "responses-direct"} adds hello world cell`, status: assertions.every((item) => item.ok) ? 
"PASS" : "FAIL", assertions, + result, }; } @@ -408,10 +524,7 @@ async function runCodexWasmHelloWorldEval(): Promise { wasmApiKey: apiKey, }); const assertions = [ - assert( - result.assistantText.length > 0, - "assistant text was emitted", - ), + assert(result.assistantText.length > 0, "assistant text was emitted"), assert( Boolean( result.notebook?.cells.some((cell) => cell.value.includes(`hello world`)), @@ -426,6 +539,7 @@ async function runCodexWasmHelloWorldEval(): Promise { name: "codex-wasm live adds hello world cell", status: assertions.every((item) => item.ok) ? "PASS" : "FAIL", assertions, + result, }; } @@ -448,15 +562,32 @@ async function main(): Promise { ]; let failed = 0; + const manifestEntries = results.map((result, index) => + writeEvalArtifact(index, result), + ); + writeJsonFile(join(RUN_OUTPUT_DIR, "summary.json"), { + schemaVersion: 1, + runId: RUN_ID, + createdAt: new Date().toISOString(), + backend: EVAL_BACKEND, + frontendUrl: FRONTEND_URL, + outputDir: RUN_OUTPUT_DIR, + evals: manifestEntries, + }); + for (const result of results) { console.log(`${result.status} ${result.name}`); for (const assertion of result.assertions) { console.log(` ${assertion.ok ? "PASS" : "FAIL"} ${assertion.message}`); } + console.log( + ` metrics ttfmMs=${result.result.metrics.ttfmMs ?? "null"} turnTimeMs=${result.result.metrics.turnTimeMs}`, + ); if (result.status === "FAIL") { failed += 1; } } + console.log(`Artifacts written to ${RUN_OUTPUT_DIR}`); if (failed > 0) { process.exitCode = 1;