BRIKEV · kevinccbsg · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/.serena/.gitignore b/.serena/.gitignore
@@ -0,0 +1,2 @@
+/cache
+/project.local.yml
diff --git a/.serena/project.yml b/.serena/project.yml
@@ -0,0 +1,154 @@
+# the name by which the project can be referenced within Serena
+project_name: "twd-cli"
+
+
+# list of languages for which language servers are started; choose from:
+#   al                  bash                clojure             cpp                 csharp
+#   csharp_omnisharp    dart                elixir              elm                 erlang
+#   fortran             fsharp              go                  groovy              haskell
+#   haxe                java                julia               kotlin              lua
+#   markdown
+#   matlab              nix                 pascal              perl                php
+#   php_phpactor        powershell          python              python_jedi         r
+#   rego                ruby                ruby_solargraph     rust                scala
+#   swift               terraform           toml                typescript          typescript_vts
+#   vue                 yaml                zig
+#   (This list may be outdated. For the current list, see values of Language enum here:
+#   https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
+#   For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
+# Note:
+#   - For C, use cpp
+#   - For JavaScript, use typescript
+#   - For Free Pascal/Lazarus, use pascal
+# Special requirements:
+#   Some languages require additional setup/installations.
+#   See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
+# When using multiple languages, the first language server that supports a given file will be used for that file.
+# The first language is the default language and the respective language server will be used as a fallback.
+# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
+languages:
+- typescript
+
+# the encoding used by text files in the project
+# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
+encoding: "utf-8"
+
+# line ending convention to use when writing source files.
+# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
+# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
+line_ending:
+
+# The language backend to use for this project.
+# If not set, the global setting from serena_config.yml is used.
+# Valid values: LSP, JetBrains
+# Note: the backend is fixed at startup. If a project with a different backend
+# is activated post-init, an error will be returned.
+language_backend:
+
+# whether to use project's .gitignore files to ignore files
+ignore_all_files_in_gitignore: true
+
+# advanced configuration option allowing to configure language server-specific options.
+# Maps the language key to the options.
+# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
+# No documentation on options means no options are available.
+ls_specific_settings: {}
+
+# list of additional paths to ignore in this project.
+# Same syntax as gitignore, so you can use * and **.
+# Note: global ignored_paths from serena_config.yml are also applied additively.
+ignored_paths: []
+
+# whether the project is in read-only mode
+# If set to true, all editing tools will be disabled and attempts to use them will result in an error
+# Added on 2025-04-18
+read_only: false
+
+# list of tool names to exclude.
+# This extends the existing exclusions (e.g. from the global configuration)
+#
+# Below is the complete list of tools for convenience.
+# To make sure you have the latest list of tools, and to view their descriptions, 
+# execute `uv run scripts/print_tool_overview.py`.
+#
+#  * `activate_project`: Activates a project based on the project name or path.
+#  * `check_onboarding_performed`: Checks whether project onboarding was already performed.
+#  * `create_text_file`: Creates/overwrites a file in the project directory.
+#  * `delete_memory`: Delete a memory file. Should only happen if a user asks for it explicitly,
+#       for example by saying that the information retrieved from a memory file is no longer correct
+#       or no longer relevant for the project.
+#  * `edit_memory`: Replaces content matching a regular expression in a memory.
+#  * `execute_shell_command`: Executes a shell command.
+#  * `find_file`: Finds files in the given relative paths
+#  * `find_referencing_symbols`: Finds symbols that reference the given symbol using the language server backend
+#  * `find_symbol`: Performs a global (or local) search using the language server backend.
+#  * `get_current_config`: Prints the current configuration of the agent, including the active and available projects, tools, contexts, and modes.
+#  * `get_symbols_overview`: Gets an overview of the top-level symbols defined in a given file.
+#  * `initial_instructions`: Provides instructions Serena usage (i.e. the 'Serena Instructions Manual')
+#       for clients that do not read the initial instructions when the MCP server is connected.
+#  * `insert_after_symbol`: Inserts content after the end of the definition of a given symbol.
+#  * `insert_before_symbol`: Inserts content before the beginning of the definition of a given symbol.
+#  * `list_dir`: Lists files and directories in the given directory (optionally with recursion).
+#  * `list_memories`: List available memories. Any memory can be read using the `read_memory` tool.
+#  * `onboarding`: Performs onboarding (identifying the project structure and essential tasks, e.g. for testing or building).
+#  * `read_file`: Reads a file within the project directory.
+#  * `read_memory`: Read the content of a memory file. This tool should only be used if the information
+#       is relevant to the current task. You can infer whether the information
+#       is relevant from the memory file name.
+#       You should not read the same memory file multiple times in the same conversation.
+#  * `rename_memory`: Renames or moves a memory. Moving between project and global scope is supported
+#       (e.g., renaming "global/foo" to "bar" moves it from global to project scope).
+#  * `rename_symbol`: Renames a symbol throughout the codebase using language server refactoring capabilities.
+#       For JB, we use a separate tool.
+#  * `replace_content`: Replaces content in a file (optionally using regular expressions).
+#  * `replace_symbol_body`: Replaces the full definition of a symbol using the language server backend.
+#  * `safe_delete_symbol`:
+#  * `search_for_pattern`: Performs a search for a pattern in the project.
+#  * `write_memory`: Write some information (utf-8-encoded) about this project that can be useful for future tasks to a memory in md format.
+#       The memory name should be meaningful.
+excluded_tools: []
+
+# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
+# This extends the existing inclusions (e.g. from the global configuration).
+included_optional_tools: []
+
+# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
+# This cannot be combined with non-empty excluded_tools or included_optional_tools.
+fixed_tools: []
+
+# list of mode names to that are always to be included in the set of active modes
+# The full set of modes to be activated is base_modes + default_modes.
+# If the setting is undefined, the base_modes from the global configuration (serena_config.yml) apply.
+# Otherwise, this setting overrides the global configuration.
+# Set this to [] to disable base modes for this project.
+# Set this to a list of mode names to always include the respective modes for this project.
+base_modes:
+
+# list of mode names that are to be activated by default.
+# The full set of modes to be activated is base_modes + default_modes.
+# If the setting is undefined, the default_modes from the global configuration (serena_config.yml) apply.
+# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
+# This setting can, in turn, be overridden by CLI parameters (--mode).
+default_modes:
+
+# initial prompt for the project. It will always be given to the LLM upon activating the project
+# (contrary to the memories, which are loaded on demand).
+initial_prompt: ""
+
+# time budget (seconds) per tool call for the retrieval of additional symbol information
+# such as docstrings or parameter information.
+# This overrides the corresponding setting in the global configuration; see the documentation there.
+# If null or missing, use the setting from the global configuration.
+symbol_info_budget:
+
+# list of regex patterns which, when matched, mark a memory entry as read‑only.
+# Extends the list from the global configuration, merging the two lists.
+read_only_memory_patterns: []
+
+# list of regex patterns for memories to completely ignore.
+# Matching memories will not appear in list_memories or activate_project output
+# and cannot be accessed via read_memory or write_memory.
+# To access ignored memory files, use the read_file tool on the raw file path.
+# Extends the list from the global configuration, merging the two lists.
+# Example: ["_archive/.*", "_episodes/.*"]
+ignored_memory_patterns: []
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -19,7 +19,9 @@ The codebase is a small ESM-only Node.js CLI with two core source files:
 
 **`bin/twd-cli.js`** — CLI entry point. Parses `process.argv` for the `run` command, calls `runTests()`, and exits with code 0 (pass) or 1 (failure).
 
-**`src/config.js`** — `loadConfig()` reads `twd.config.json` from `process.cwd()`, merges it with defaults (url, timeout, coverage, headless, puppeteerArgs), and returns the merged config. Falls back to defaults if the file is missing or unparseable.
+**`src/config.js`** — `loadConfig()` reads `twd.config.json` from `process.cwd()`, merges it with defaults (url, timeout, coverage, headless, puppeteerArgs, retryCount, protocolTimeout), and returns the merged config. Falls back to defaults if the file is missing or unparseable.
+
+`protocolTimeout` (default `300000`, 5 min) is passed to `puppeteer.launch` and bounds Puppeteer's CDP commands. It matters because the entire suite runs inside a single `page.evaluate` (`Runtime.callFunctionOn`), so Puppeteer's implicit 180000ms ceiling would abort long-but-passing suites with no per-test output. Raise it for slow CI; `0` means no timeout.
 
 **`src/index.js`** — `runTests()` is the main orchestrator:
 1. Loads config via `loadConfig()`

diff --git a/README.md b/README.md
@@ -37,7 +37,8 @@ Create a `twd.config.json` file in your project root:
   "nycOutputDir": "./.nyc_output",
   "headless": true,
   "puppeteerArgs": ["--no-sandbox", "--disable-setuid-sandbox"],
-  "retryCount": 2
+  "retryCount": 2,
+  "protocolTimeout": 300000
 }
 ```
 
@@ -53,6 +54,7 @@ Create a `twd.config.json` file in your project root:
 | `headless` | boolean | `true` | Run browser in headless mode |
 | `puppeteerArgs` | string[] | `["--no-sandbox", "--disable-setuid-sandbox"]` | Additional Puppeteer launch arguments |
 | `retryCount` | number | `2` | Number of attempts per test before reporting failure. Set to `1` to disable retries |
+| `protocolTimeout` | number | `300000` | Puppeteer CDP `protocolTimeout` in ms (5 min). The whole suite runs inside one `page.evaluate`, so this bounds the **entire run** — raise it (e.g. `600000`) for slow CI; `0` means no timeout. Defaults above Puppeteer's implicit 180000ms ceiling |
 | `contracts` | array | — | OpenAPI contract validation specs (see [Contract Validation](#contract-validation)) |
 | `contractReportPath` | string | — | Path to write a markdown report for CI/PR integration |
 

diff --git a/docs/spec-protocol-timeout.md b/docs/spec-protocol-timeout.md
@@ -0,0 +1,94 @@
+# Spec: Stop the full suite from dying at puppeteer's 180s `protocolTimeout`
+
+## Problem
+
+`twd-cli` runs the **entire** test suite inside a single `page.evaluate(...)` call
+(`src/index.js:58`). Puppeteer maps every `page.evaluate` to one
+`Runtime.callFunctionOn` CDP command, and that command is bound by
+`protocolTimeout`, which defaults to **180000 ms (180s)**.
+
+When the whole suite takes longer than 180s — common on slow CI runners even
+when it passes locally — puppeteer aborts with:
+
+```
+Error running tests: ProtocolError: Runtime.callFunctionOn timed out.
+Increase the 'protocolTimeout' setting in launch/connect calls for a higher timeout if needed.
+```
+
+This is a hard failure with **no per-test output** (the whole batch is lost), so
+it looks like a crash rather than a slow suite. Observed in
+holafly/web-checkout CI run 26503182725: page loaded at 09:40:46, timed out at
+09:43:46 — exactly 180s — while the same suite finishes in ~130s locally.
+
+The current `timeout` config field (default 10000) is **only** passed to
+`page.waitForSelector` (`src/index.js:54`), not to puppeteer's protocol layer,
+so there is no way to raise the ceiling today.
+
+## Goal
+
+Make the suite robust against the 180s protocol ceiling, and give a clearer
+failure mode when a run is genuinely too slow.
+
+## Scope
+
+Two changes, independently shippable. Phase 1 unblocks immediately; Phase 2 is
+the durable fix.
+
+### Phase 1 — Configurable `protocolTimeout` (quick, low-risk)
+
+1. Add `protocolTimeout` to `DEFAULT_CONFIG` in `src/config.js`. Default to a
+   safe value above the current implicit one, e.g. `300000` (5 min). Document
+   that `0` means "no timeout" (puppeteer treats `0` as unlimited).
+2. Pass it through to `puppeteer.launch(...)` in `src/index.js:27`:
+   ```js
+   browser = await puppeteer.launch({
+     headless: config.headless,
+     args: config.puppeteerArgs,
+     protocolTimeout: config.protocolTimeout,
+   });
+   ```
+3. Update `twd.config.example.json`, `README.md`, and `CLAUDE.md` to document
+   the new field and explain *why* it exists (whole-suite-in-one-evaluate).
+
+**Acceptance:** a consumer can set `"protocolTimeout": 600000` in
+`twd.config.json` and a 200s suite passes in CI.
+
+### Phase 2 — Run tests incrementally, not in one `page.evaluate` (durable)
+
+The deeper issue is that one slow run blows the whole batch and emits no
+streamed output. Drive the in-browser `TestRunner` test-by-test (or in bounded
+batches) from Node, so each `page.evaluate` is short and bounded by the
+per-test `timeout`, not the suite total.
+
+Sketch:
+- Expose the test list from the browser first:
+  `const tests = await page.evaluate(() => window.__testRunner.list())`
+  (add a `list()`/`getTests()` accessor to the runner in `twd-js` if absent).
+- Loop in Node, evaluating one test (or a batch of N) per call, collecting
+  `testStatus` incrementally and printing results as they complete (matches the
+  streamed output `twd-relay` already gives).
+- Each call is now bounded by per-test time, so `protocolTimeout` only has to
+  cover the slowest single test/batch, not the whole suite.
+
+Benefits: no whole-suite loss on timeout, live progress in CI logs, natural
+place to enforce a real per-test timeout and to short-circuit on first failure
+if desired.
+
+**Acceptance:** CI logs show per-test pass/fail as the run progresses; a single
+hanging test fails only that test (or that batch) instead of the entire run.
+
+## Out of scope
+
+- Parallelizing across multiple pages/browsers.
+- Changing `retryCount` semantics.
+
+## Notes for the implementer
+
+- `src/index.js:58-82` is the single `page.evaluate` to refactor for Phase 2.
+- `src/config.js` `DEFAULT_CONFIG` (lines 4-13) is where the new field lands.
+- Keep Phase 1 even after Phase 2 ships — a generous `protocolTimeout` is still
+  a sane safety net for the per-batch calls.
+- Tests for this repo live under `tests/`; add config-merge coverage for the new
+  field and, for Phase 2, a fixture app with an artificially slow test.
+</content>
+</invoke>
diff --git a/src/config.js b/src/config.js
@@ -10,6 +10,7 @@ const DEFAULT_CONFIG = {
   headless: true,
   puppeteerArgs: ['--no-sandbox', '--disable-setuid-sandbox'],
   retryCount: 2,
+  protocolTimeout: 300000,
 };
 
 export function loadConfig() {

diff --git a/src/index.js b/src/index.js
@@ -9,6 +9,14 @@ import { generateContractMarkdown } from './contractMarkdown.js';
 import { buildTestPath } from './buildTestPath.js';
 import { formatTestSummary, formatFailedTestsBlock } from './testSummary.js';
 
+function isProtocolTimeout(error) {
+  const message = error && error.message ? error.message : '';
+  return (
+    (error && error.name === 'ProtocolError' && /timed out/i.test(message)) ||
+    /protocolTimeout/i.test(message)
+  );
+}
+
 export async function runTests() {
   let browser;
   try {
@@ -27,6 +35,7 @@ export async function runTests() {
     browser = await puppeteer.launch({
       headless: config.headless,
       args: config.puppeteerArgs,
+      protocolTimeout: config.protocolTimeout,
     });
 
     const page = await browser.newPage();
@@ -173,6 +182,13 @@ export async function runTests() {
 
   } catch (error) {
     console.error('Error running tests:', error);
+    if (isProtocolTimeout(error)) {
+      console.error(
+        '\nThis looks like a Puppeteer protocolTimeout. The whole test suite runs in a single\n' +
+        'page.evaluate call, so the run aborts if it takes longer than "protocolTimeout" (ms).\n' +
+        'Raise it in twd.config.json, e.g. { "protocolTimeout": 600000 } (0 = no timeout).'
+      );
+    }
     if (browser) await browser.close();
     throw error;
   }

diff --git a/tests/config.test.js b/tests/config.test.js
@@ -32,6 +32,7 @@ describe('loadConfig', () => {
       headless: true,
       puppeteerArgs: ['--no-sandbox', '--disable-setuid-sandbox'],
       retryCount: 2,
+      protocolTimeout: 300000,
     });
     expect(fs.existsSync).toHaveBeenCalledWith(path.resolve(mockCwd, 'twd.config.json'));
   });
@@ -56,6 +57,7 @@ describe('loadConfig', () => {
       headless: true,
       puppeteerArgs: ['--no-sandbox', '--disable-setuid-sandbox'],
       retryCount: 2,
+      protocolTimeout: 300000,
     });
     expect(fs.readFileSync).toHaveBeenCalledWith(
       path.resolve(mockCwd, 'twd.config.json'),
@@ -73,6 +75,7 @@ describe('loadConfig', () => {
       headless: false,
       puppeteerArgs: ['--disable-dev-shm-usage'],
       retryCount: 3,
+      protocolTimeout: 600000,
     };
 
     vi.mocked(fs.existsSync).mockReturnValue(true);
@@ -100,6 +103,7 @@ describe('loadConfig', () => {
       headless: true,
       puppeteerArgs: ['--no-sandbox', '--disable-setuid-sandbox'],
       retryCount: 2,
+      protocolTimeout: 300000,
     });
     expect(consoleWarnSpy).toHaveBeenCalledWith(
       expect.stringContaining('Warning: Could not parse twd.config.json'),
@@ -123,6 +127,15 @@ describe('loadConfig', () => {
     expect(config.url).toBe('http://localhost:5173');
   });
 
+  it('should default protocolTimeout and allow user override', () => {
+    vi.mocked(fs.existsSync).mockReturnValue(false);
+    expect(loadConfig().protocolTimeout).toBe(300000);
+
+    vi.mocked(fs.existsSync).mockReturnValue(true);
+    vi.mocked(fs.readFileSync).mockReturnValue(JSON.stringify({ protocolTimeout: 0 }));
+    expect(loadConfig().protocolTimeout).toBe(0);
+  });
+
   it('should handle partial user config correctly', () => {
     const userConfig = {
       headless: false,