SingularityAI-Dev · antnewman · May 7, 2026 · May 7, 2026 · May 8, 2026 · coderabbitai
diff --git a/package.json b/package.json
@@ -8,6 +8,7 @@
 	],
 	"scripts": {
 		"test": "vitest run",
+		"bench": "vitest run --config packages/core/vitest.perf.config.ts",
 		"lint": "biome check .",
 		"lint:fix": "biome check --write .",
 		"typecheck": "tsc --build tsconfig.build.json",

diff --git a/packages/core/__perf__/README.md b/packages/core/__perf__/README.md
@@ -0,0 +1,73 @@
+# `@logic-md/core` perf assertions
+
+Pre-merge regression assertions on the three core paths most likely to acquire
+silent quadratic behaviour, per the analysis in #46.
+
+## Running
+
+From the repository root:
+
+```bash
+npm run bench
+```
+
+This invokes vitest with [`vitest.perf.config.ts`](../vitest.perf.config.ts),
+which picks up only `**/__perf__/**/*.perf.ts` files and runs them serially in
+a single fork (for stable timings). Default `npm test` does not run the bench
+suite — `*.perf.ts` is outside the default `**/*.test.ts` glob.
+
+## Coverage
+
+| File | Asserts |
+|---|---|
+| [`compiler.perf.ts`](compiler.perf.ts) | `compileWorkflow` on a 200-step linear chain |
+| [`expression.perf.ts`](expression.perf.ts) | `evaluate` × 10,000 calls on the same template against varying contexts |
+| [`dag.perf.ts`](dag.perf.ts) | `resolve` on a 1000-step linear chain |
+
+Linear chains are the worst-case input shape — depth equals node count, which
+maximises the impact of any per-pop or per-level work in the DAG resolver and
+maximises the per-step traversal cost in the compiler.
+
+## Calibration methodology
+
+Thresholds are calibrated against `main` per the methodology agreed in #46:
+
+1. Run the bench on `main` repeatedly across multiple developer-machine
+   sessions with varying background load.
+2. Take the worst observed elapsed time per metric.
+3. Multiply by **1.5** (Math.ceil) for slower-machine headroom.
+4. Round up to a clean number for the assertion threshold.
+
+The +50% headroom is wider than the +25% suggested in the original #46 review,
+based on observed variance on Windows developer machines (single-shot timings
+can vary up to ~3× between quiet and loaded sessions). The bench is opt-in, not
+default-CI, so this trade-off favours stable execution at the cost of slightly
+weaker regression sensitivity. Once the algorithmic fixes in PRs 2-4 land, the
+assertion margin will widen substantially (~100× for the compiler fix), which
+provides a much sharper proof-of-fix signal than the initial calibration.
+
+Each `*.perf.ts` file documents its own calibration data in a header comment so
+that recalibration after a change is auditable. If a fix legitimately reduces
+the workload (e.g. PR 2 in the #46 sequence eliminating the per-step DAG
+re-resolution), the threshold should NOT be tightened in the same PR — leave
+the headroom widening as visible proof of the fix.
+
+## Adding a new bench
+
+1. Create `<name>.perf.ts` next to existing files.
+2. Use `describe` + `test` from `vitest`.
+3. Always include a warm-up call before timed measurement (let v8 optimise the
+   hot path).
+4. Run `node` directly with the same workload 5 times against `main`, capture
+   raw timings, document them in a header comment, and lock the worst × 1.5.
+
+## Why these three?
+
+These are the three concrete candidates surfaced in [#46](../../../../issues/46) — places where the implementation is correct at small scale but algorithmically quadratic+ at scale, currently invisible to all 325 unit tests. The bench suite is the regression net for the full sequence:
+
+- **PR 1 (this scaffold):** establish discipline; assertions pass on main.
+- **PR 2:** compiler fix (compileStep accepting pre-computed dagResult).
+- **PR 3:** expression cache (AST cache in `evaluate`).
+- **PR 4:** DAG sort tightening (eliminate per-pop queue sort and level-filter loop).
+
+After each fix, re-running `npm run bench` shows the assertion margin widening — which IS the proof.
diff --git a/packages/core/__perf__/_helpers.ts b/packages/core/__perf__/_helpers.ts
@@ -0,0 +1,67 @@
+// =============================================================================
+// Perf-test helpers — synthetic spec generators for scaling assertions
+// =============================================================================
+// These are NOT part of the public API. They live under __perf__/ and are only
+// used by the bench suite (`npm run bench`).
+// =============================================================================
+
+import type { LogicSpec, Step, WorkflowContext } from "../types.js";
+
+/**
+ * Generate a `LogicSpec` with `n` steps in a strict linear chain
+ * (step_0 → step_1 → … → step_{n-1}).
+ *
+ * Linear chains are the worst case for several scaling concerns:
+ *   - DAG resolve's level-grouping filter (D = N depths)
+ *   - compileWorkflow's per-step DAG re-resolution (N×(V+E) traversal)
+ *   - Token-budget warnings as the prompt segment grows.
+ */
+export function makeLinearChainSpec(n: number): LogicSpec {
+	if (n < 1) {
+		throw new Error(`makeLinearChainSpec requires n >= 1, got ${n}`);
+	}
+	const steps: Record<string, Step> = {
+		step_0: {
+			description: "first",
+			instructions: "first step in linear chain",
+		},
+	};
+	for (let i = 1; i < n; i++) {
+		steps[`step_${i}`] = {
+			description: `step ${i}`,
+			instructions: `step ${i} in linear chain`,
+			needs: [`step_${i - 1}`],
+		};
+	}
+	return {
+		spec_version: "1.0",
+		name: "linear-chain-perf",
+		steps,
+	};
+}
+
+/**
+ * Just the `steps` map from `makeLinearChainSpec(n)`.
+ * Useful when calling `resolve(steps)` directly.
+ */
+export function makeLinearChainSteps(n: number): Record<string, Step> {
+	const spec = makeLinearChainSpec(n);
+	return spec.steps as Record<string, Step>;
+}
+
+/**
+ * Default `WorkflowContext` for compile-bench measurements.
+ */
+export function makeWorkflowContext(): WorkflowContext {
+	return {
+		currentStep: "step_0",
+		previousOutputs: {},
+		input: {},
+		attemptNumber: 1,
+		branchReason: null,
+		previousFailureReason: null,
+		totalSteps: 0,
+		completedSteps: [],
+		dagLevels: [],
+	};
+}
diff --git a/packages/core/__perf__/compiler.perf.ts b/packages/core/__perf__/compiler.perf.ts
@@ -0,0 +1,60 @@
+// =============================================================================
+// Perf assertion: compileWorkflow scaling
+// =============================================================================
+// Pins the cost of compiling a 200-step linear-chain workflow against current
+// `main`. Linear chains are the worst-case shape for `compileWorkflow` because
+// every `compileStep` call re-resolves the full DAG (Candidate 1 in #46).
+//
+// Chain size of 200 (rather than 1000) keeps the bench under 2 seconds per
+// run; once Candidate 1's fix lands the same workload should drop ~100×, and
+// the assertion margin will widen dramatically — exactly the proof-of-fix
+// signal Rain asked for in his sequencing comment.
+//
+// Threshold calibration methodology (per #46 review, with +50% adjustment
+// noted in the calibration block below):
+//   1. Run on `main` repeatedly across developer-machine sessions with
+//      varying background load.
+//   2. Take the worst observed elapsed time.
+//   3. Multiply by 1.5 (Math.ceil) for slower-machine headroom.
+//   4. Lock that value in as the assertion threshold.
+//
+// Calibration data captured 2026-05-07 on Node v22.18.0 across multiple
+// developer-machine sessions with varying background load:
+//   quiet runs: 746ms, 778ms, 1318ms, 1326ms, 1398ms
+//   loaded runs: 2102ms, 2607ms, 2899ms
+//   worst observed = 2899ms  →  ceil(2899 × 1.5) = 4349ms  →  4500ms (rounded)
+//
+// The +50% headroom (rather than the +25% suggested in the original #46
+// review) reflects observed variance on Windows developer machines under
+// realistic background load. The bench is opt-in (`npm run bench`, NOT
+// default `npm test`), so this trade-off favours stable execution at the
+// cost of slightly weaker regression sensitivity. Once Candidate 1's fix
+// lands, the assertion margin will widen from ~1.5× to ~100×, providing
+// a much sharper proof-of-fix signal.
+// =============================================================================
+
+import { describe, expect, test } from "vitest";
+import { compileWorkflow } from "../index.js";
+import { makeLinearChainSpec, makeWorkflowContext } from "./_helpers.js";
+
+/**
+ * Calibrated threshold for compileWorkflow on a 200-step linear chain.
+ * See header comment for methodology and raw data.
+ */
+const COMPILE_200_STEP_THRESHOLD_MS = 4500;
+
+describe("perf: compileWorkflow scaling", () => {
+	test(`compileWorkflow on 200-step linear chain completes <${COMPILE_200_STEP_THRESHOLD_MS}ms`, () => {
+		const spec = makeLinearChainSpec(200);
+		const ctx = makeWorkflowContext();
+
+		// Warm-up: let v8 optimise the hot path before measurement.
+		compileWorkflow(spec, ctx);
+
+		const t0 = performance.now();
+		compileWorkflow(spec, ctx);
+		const elapsed = performance.now() - t0;
+
+		expect(elapsed).toBeLessThan(COMPILE_200_STEP_THRESHOLD_MS);
+	});
+});
diff --git a/packages/core/__perf__/dag.perf.ts b/packages/core/__perf__/dag.perf.ts
@@ -0,0 +1,43 @@
+// =============================================================================
+// Perf assertion: resolve() scaling on a 1000-step linear chain
+// =============================================================================
+// Pins the cost of topological sort + level grouping on the worst-case DAG
+// shape (linear chain, where depth = N). Catches regressions in the per-pop
+// queue sort, neighbour sort, and level-filter loop in `dag.ts`.
+// Threshold calibrated against current `main` (5 runs, take worst, +25%).
+// =============================================================================
+
+import { describe, expect, test } from "vitest";
+import { resolve } from "../index.js";
+import { makeLinearChainSteps } from "./_helpers.js";
+
+/**
+ * Calibrated threshold for resolve() on a 1000-step linear chain.
+ *
+ * Calibration methodology: multiple runs on `main` across developer-machine
+ * sessions with varying background load; take worst observed, multiply by 1.5
+ * for headroom.
+ *
+ * Calibration data captured 2026-05-07 on Node v22.18.0:
+ *   quiet runs: 117ms, 128ms, 143ms, 152ms, 215ms
+ *   loaded runs: 419ms, 484ms
+ *   worst observed = 484ms  →  ceil(484 × 1.5) = 727ms  →  800ms (rounded)
+ */
+const RESOLVE_1000_STEP_THRESHOLD_MS = 800;
+
+describe("perf: dag.resolve scaling", () => {
+	test(`resolve(1000-step linear chain) completes <${RESOLVE_1000_STEP_THRESHOLD_MS}ms`, () => {
+		const steps = makeLinearChainSteps(1000);
+
+		// Warm-up.
+		const warm = resolve(steps);
+		expect(warm.ok).toBe(true);
+
+		const t0 = performance.now();
+		const r = resolve(steps);
+		const elapsed = performance.now() - t0;
+
+		expect(r.ok).toBe(true);
+		expect(elapsed).toBeLessThan(RESOLVE_1000_STEP_THRESHOLD_MS);
+	});
+});
diff --git a/packages/core/__perf__/expression.perf.ts b/packages/core/__perf__/expression.perf.ts
@@ -0,0 +1,50 @@
+// =============================================================================
+// Perf assertion: evaluate() throughput on repeated expressions
+// =============================================================================
+// Pins the cost of evaluating the same `{{ ... }}` expression 10,000 times
+// against varying contexts. Catches regressions in tokenize/parse hot path
+// (e.g. accidental disabling of an AST cache once one is added in PR 3).
+// Threshold calibrated against current `main` (5 runs, take worst, +25%).
+// =============================================================================
+
+import { describe, expect, test } from "vitest";
+import { evaluate } from "../index.js";
+
+/**
+ * Calibrated threshold for 10,000 evaluate() calls on the same template.
+ *
+ * Calibration methodology: multiple runs on `main` across developer-machine
+ * sessions with varying background load; take worst observed, multiply by 1.5
+ * for headroom. The +50% (rather than the original +25%) reflects observed
+ * variance on Windows developer machines.
+ *
+ * Calibration data captured 2026-05-07 on Node v22.18.0:
+ *   quiet runs: 135ms, 197ms, 234ms, 268ms, 382ms
+ *   loaded runs: 617ms
+ *   worst observed = 617ms  →  ceil(617 × 1.5) = 926ms  →  1000ms (rounded)
+ */
+const EVAL_10K_THRESHOLD_MS = 1000;
+
+describe("perf: evaluate() throughput", () => {
+	test(`evaluate same expression 10,000 times <${EVAL_10K_THRESHOLD_MS}ms`, () => {
+		const tmpl = "{{ output.findings.length > 3 && output.confidence >= 0.6 }}";
+
+		// Warm-up: prime the parser path.
+		for (let i = 0; i < 100; i++) {
+			evaluate(tmpl, { output: { findings: [], confidence: 0 } });
+		}
+
+		const t0 = performance.now();
+		for (let i = 0; i < 10_000; i++) {
+			evaluate(tmpl, {
+				output: {
+					findings: new Array(i % 5),
+					confidence: (i % 100) / 100,
+				},
+			});
+		}
+		const elapsed = performance.now() - t0;
+
+		expect(elapsed).toBeLessThan(EVAL_10K_THRESHOLD_MS);
+	});
+});
diff --git a/packages/core/vitest.perf.config.ts b/packages/core/vitest.perf.config.ts
@@ -0,0 +1,25 @@
+// =============================================================================
+// Vitest config for the bench suite (`npm run bench`)
+// =============================================================================
+// Picks up only `__perf__/**/*.perf.ts`, runs them serially in a single fork
+// for stable timings, and bypasses the default `**/*.test.ts` glob so the
+// bench suite never runs as part of `npm test`.
+// =============================================================================
+
+import { defineConfig } from "vitest/config";
+
+export default defineConfig({
+	test: {
+		include: ["**/__perf__/**/*.perf.ts"],
+		// Serialise execution to minimise cross-test interference on timings.
+		// `pool: "forks"` alone does NOT serialise — in Vitest 4 the option
+		// that guarantees one-file-at-a-time execution is `fileParallelism:
+		// false` at the top of the `test` block. (Pre-Vitest-4 this was
+		// `poolOptions.forks.singleFork`; both `poolOptions` and the per-pool
+		// `singleFork` were removed in the v4 pool rework.)
+		pool: "forks",
+		fileParallelism: false,
+		// 60s ceiling — well above any realistic threshold; only fires on hangs.
+		testTimeout: 60_000,
+	},
+});