From 60aed86342a0a19dede6089507832dff917b83aa Mon Sep 17 00:00:00 2001 From: Andrew Bierman Date: Sun, 17 May 2026 02:01:19 -0600 Subject: [PATCH 01/85] lint(scripts): add no-weak-assertions custom check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds scripts/lint/no-weak-assertions.ts mirroring the no-raw-typeof.ts shape. Catches four coverage-theater patterns in *.test.ts files: - assertion-free-test : it() / test() blocks with no expect() or expect-helper calls (expectUnauthorized, expectJsonResponse, etc.) - only-tobedefined : every direct expect() ends in a non-specific matcher (.toBeDefined / .toBeTruthy / .toBeFalsy / .not.toBeUndefined / .not.toBeNull) AND no expect-helper is called. (.toBeUndefined() and .toBeNull() alone are NOT flagged — they assert specific return values.) - bare-tohavebeencalled : .toHaveBeenCalled() without a matching .toHaveBeenCalledWith / .toHaveBeenCalledTimes in the same block. - large-snapshot : toMatchInlineSnapshot() body > 50 lines. File-level escape hatch: `// no-weak-assertions: disable` in the first 5 lines of a file skips the entire file. Add `bun lint:weak-assertions` and `bun test:scripts` plus a `scripts/vitest.config.ts` for the scripts test suite. The check is NOT wired into `scripts/check-all.ts` yet — running it against the repo surfaces ~27 pre-existing violations (mostly .toHaveBeenCalled() and .toBeDefined()-only blocks). Wiring into check-all.ts will follow once those are addressed in a separate cleanup PR. Includes 16 unit tests for the analyzer covering each rule's positive and negative cases, the disable comment, helper-function detection, and the it.todo / it.skip allowance. --- package.json | 2 + .../lint/__tests__/no-weak-assertions.test.ts | 225 +++++++++++ scripts/lint/no-weak-assertions.ts | 354 ++++++++++++++++++ scripts/vitest.config.ts | 20 + 4 files changed, 601 insertions(+) create mode 100644 scripts/lint/__tests__/no-weak-assertions.test.ts create mode 100644 scripts/lint/no-weak-assertions.ts create mode 100644 scripts/vitest.config.ts diff --git a/package.json b/package.json index 7fd9e3a8a2..81492e2a23 100644 --- a/package.json +++ b/package.json @@ -35,6 +35,8 @@ "lefthook": "lefthook install", "lint": "biome check --write", "lint:custom": "bun run scripts/lint/no-raw-typeof.ts && bun run scripts/lint/no-raw-regex.ts && bun run packages/env/scripts/no-raw-process-env.ts && bun run scripts/lint/no-duplicate-guards.ts && bun run scripts/lint/no-unauth-routes.ts && bun run scripts/lint/check-drizzle-migrations.ts", + "lint:weak-assertions": "bun run scripts/lint/no-weak-assertions.ts", + "test:scripts": "vitest run --config scripts/vitest.config.ts", "lint:strict": "biome check && bun run lint:custom", "lint-unsafe": "biome check --write --unsafe", "mcp": "bun run --cwd packages/mcp dev", diff --git a/scripts/lint/__tests__/no-weak-assertions.test.ts b/scripts/lint/__tests__/no-weak-assertions.test.ts new file mode 100644 index 0000000000..eff2f2af69 --- /dev/null +++ b/scripts/lint/__tests__/no-weak-assertions.test.ts @@ -0,0 +1,225 @@ +import { describe, expect, it } from 'vitest'; +import { analyzeSource, isFileDisabled } from '../no-weak-assertions'; + +const fakeFile = 'apps/test/example.test.ts'; + +describe('no-weak-assertions', () => { + describe('assertion-free-test', () => { + it('flags an it() block with zero expect() or expect-helper calls', () => { + const src = ` + describe('thing', () => { + it('does something', () => { + const x = 1 + 1; + console.log(x); + }); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations).toHaveLength(1); + expect(violations[0]?.rule).toBe('assertion-free-test'); + expect(violations[0]?.line).toBe(3); + }); + + it('does NOT flag when a custom expect-helper is used', () => { + const src = ` + it('rejects unauth', async () => { + const res = await api('/x'); + expectUnauthorized(res); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations.filter((v) => v.rule === 'assertion-free-test')).toHaveLength(0); + }); + + it('does NOT flag it.todo or it.skip blocks', () => { + const src = ` + it.todo('eventually'); + it.skip('not yet', () => { + const x = 1; + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations).toHaveLength(0); + }); + }); + + describe('only-tobedefined', () => { + it('flags a block where every expect() uses .toBeDefined()', () => { + const src = ` + it('returns something', () => { + const x = parse('foo'); + expect(x).toBeDefined(); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations).toHaveLength(1); + expect(violations[0]?.rule).toBe('only-tobedefined'); + }); + + it('flags .toBeTruthy() and .toBeFalsy() blocks', () => { + const src = ` + it('truthy', () => { + expect(x).toBeTruthy(); + }); + it('falsy', () => { + expect(y).toBeFalsy(); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations.filter((v) => v.rule === 'only-tobedefined')).toHaveLength(2); + }); + + it('does NOT flag .toBeUndefined() — that asserts a specific return value', () => { + const src = ` + it('returns undefined when input is missing', () => { + expect(getNotes(item)).toBeUndefined(); + }); + it('returns null when not found', () => { + expect(lookup(id)).toBeNull(); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations).toHaveLength(0); + }); + + it('does NOT flag when at least one expect() uses a specific matcher', () => { + const src = ` + it('returns a valid result', () => { + const x = parse('foo'); + expect(x).toBeDefined(); + expect(x.value).toBe(42); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations).toHaveLength(0); + }); + + it('does NOT flag when an expect-helper is present', () => { + const src = ` + it('returns a valid result', () => { + const x = parse('foo'); + expect(x).toBeDefined(); + expectShape(x); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations).toHaveLength(0); + }); + }); + + describe('bare-tohavebeencalled', () => { + it('flags .toHaveBeenCalled() without .toHaveBeenCalledWith or Times', () => { + const src = ` + it('calls the thing', () => { + doIt(); + expect(spy).toHaveBeenCalled(); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations.filter((v) => v.rule === 'bare-tohavebeencalled')).toHaveLength(1); + }); + + it('does NOT flag when .toHaveBeenCalledWith is present in the same block', () => { + const src = ` + it('calls the thing with the right arg', () => { + doIt('foo'); + expect(spy).toHaveBeenCalled(); + expect(spy).toHaveBeenCalledWith('foo'); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations.filter((v) => v.rule === 'bare-tohavebeencalled')).toHaveLength(0); + }); + + it('does NOT flag when .toHaveBeenCalledTimes is present', () => { + const src = ` + it('calls the thing twice', () => { + doIt(); doIt(); + expect(spy).toHaveBeenCalled(); + expect(spy).toHaveBeenCalledTimes(2); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations.filter((v) => v.rule === 'bare-tohavebeencalled')).toHaveLength(0); + }); + }); + + describe('large-snapshot', () => { + it('flags toMatchInlineSnapshot bodies > 50 lines', () => { + const snapshotBody = `\n${' line\n'.repeat(60)}`; + const src = ` + it('matches snapshot', () => { + expect(big).toMatchInlineSnapshot(\`${snapshotBody}\`); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations.some((v) => v.rule === 'large-snapshot')).toBe(true); + }); + + it('does NOT flag small inline snapshots', () => { + const src = ` + it('matches snapshot', () => { + expect(small).toMatchInlineSnapshot(\`"hello"\`); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations.filter((v) => v.rule === 'large-snapshot')).toHaveLength(0); + }); + }); + + describe('file-level disable comment', () => { + it('skips the entire file when "no-weak-assertions: disable" is in the first 5 lines', () => { + const src = `// no-weak-assertions: disable + it('grandfathered', () => { + expect(x).toBeDefined(); + }); + it('also grandfathered', () => { + // assertion-free + }); + `; + expect(isFileDisabled(src)).toBe(true); + const violations = analyzeSource(fakeFile, src); + expect(violations).toHaveLength(0); + }); + + it('does NOT skip when the disable comment is after line 5', () => { + const src = ` + // line 1 + // line 2 + // line 3 + // line 4 + // line 5 + // no-weak-assertions: disable + it('not grandfathered', () => { + const x = 1; + }); + `; + expect(isFileDisabled(src)).toBe(false); + const violations = analyzeSource(fakeFile, src); + expect(violations.length).toBeGreaterThan(0); + }); + }); + + describe('multiple violations in one file', () => { + it('reports each violation separately', () => { + const src = ` + it('first violation', () => { + const x = 1; + }); + it('second violation', () => { + expect(y).toBeDefined(); + }); + it('third violation', () => { + expect(spy).toHaveBeenCalled(); + }); + `; + const violations = analyzeSource(fakeFile, src); + expect(violations).toHaveLength(3); + expect(violations.map((v) => v.rule).sort()).toEqual([ + 'assertion-free-test', + 'bare-tohavebeencalled', + 'only-tobedefined', + ]); + }); + }); +}); diff --git a/scripts/lint/no-weak-assertions.ts b/scripts/lint/no-weak-assertions.ts new file mode 100644 index 0000000000..9c5c946c20 --- /dev/null +++ b/scripts/lint/no-weak-assertions.ts @@ -0,0 +1,354 @@ +#!/usr/bin/env bun +// +// no-weak-assertions.ts — catches coverage theater in test files. +// +// Walks every `.test.ts` / `.test.tsx` file under apps/* and packages/* and +// flags `it(...)` / `test(...)` blocks that contain one of the following: +// +// • assertion-free-test — zero `expect(` or `expect*(...)` calls in +// the block. Helper assertions whose names +// start with `expect` (e.g. `expectUnauthorized`, +// `expectJsonResponse`) count as assertions. +// • only-tobedefined — every direct `expect(...)` ends in a +// non-specific matcher (`.toBeDefined()`, +// `.toBeTruthy()`, `.toBeFalsy()`, +// `.not.toBeUndefined()`, or +// `.not.toBeNull()`) AND no expect-helper is +// called in the block. `.toBeUndefined()` and +// `.toBeNull()` alone are NOT flagged — they +// assert specific return values. +// • bare-tohavebeencalled — `.toHaveBeenCalled()` present without +// `.toHaveBeenCalledWith(...)` or +// `.toHaveBeenCalledTimes(...)` in the same +// block. +// • large-snapshot — `toMatchInlineSnapshot(...)` body > 50 lines. +// +// These patterns provably hide regressions and inflate coverage without +// proving behaviour. Tests should assert specific values, specific call +// shapes, or both. +// +// Escape hatch: +// // no-weak-assertions: disable +// placed within the file's first 5 lines skips the file entirely. Use +// sparingly — grandfathered tests only. +// +// `it.todo(...)`, `it.skip(...)`, `it.each(...)` are not flagged (they +// either have no body or carry parameterised bodies that this rule does +// not analyse). +// +// Exit code: +// 0 — no violations +// 1 — violations found + +import { readdirSync, readFileSync, statSync } from 'node:fs'; +import { join } from 'node:path'; + +const SCAN_ROOTS = ['apps', 'packages']; +const EXCLUDED_DIRS = new Set([ + 'node_modules', + 'dist', + 'build', + '.next', + '.expo', + '.wrangler', + 'coverage', +]); + +const LARGE_SNAPSHOT_THRESHOLD_LINES = 50; +const DISABLE_COMMENT = 'no-weak-assertions: disable'; + +// Matchers that assert *non-specific* shape — covered tests should usually +// assert a concrete value instead. `.toBeUndefined()` and `.toBeNull()` are +// intentionally NOT in this list: when the function under test is documented +// to return `undefined` / `null`, these are specific assertions, not weak. +const WEAK_MATCHER_PATTERN = + /\.(?:toBeDefined|toBeTruthy|toBeFalsy)\s*\(\s*\)|\.not\.(?:toBeUndefined|toBeNull)\s*\(\s*\)/; + +// Match `it(`, `test(`, `it.only(`, `test.only(`. Skip `.todo`, `.skip`, +// `.each`, `.concurrent`, `.failing` since they don't carry a runnable body +// the rule applies to. +const TEST_OPENER_PATTERN = /\b(?:it|test)(?:\.only)?\s*\(/g; + +export type WeakAssertionRule = + | 'assertion-free-test' + | 'only-tobedefined' + | 'bare-tohavebeencalled' + | 'large-snapshot'; + +export interface Violation { + file: string; + line: number; + rule: WeakAssertionRule; + message: string; +} + +export function isFileDisabled(src: string): boolean { + const head = src.split('\n').slice(0, 5).join('\n'); + return head.includes(DISABLE_COMMENT); +} + +// Locate the matching closing paren / brace, starting from `start` (the +// index of the opening character in `src`). Returns the index of the close, +// or -1 if not found (unbalanced). +function findMatchingClose(src: string, start: number, open: string, close: string): number { + let depth = 0; + let inString: '"' | "'" | '`' | null = null; + let inLineComment = false; + let inBlockComment = false; + let inRegex = false; + for (let i = start; i < src.length; i++) { + const ch = src[i] ?? ''; + const prev = src[i - 1] ?? ''; + const next = src[i + 1] ?? ''; + + if (inLineComment) { + if (ch === '\n') inLineComment = false; + continue; + } + if (inBlockComment) { + if (ch === '*' && next === '/') { + inBlockComment = false; + i++; + } + continue; + } + if (inString) { + if (ch === '\\') { + i++; + continue; + } + if (ch === inString) inString = null; + continue; + } + if (inRegex) { + if (ch === '\\') { + i++; + continue; + } + if (ch === '/') inRegex = false; + continue; + } + + if (ch === '/' && next === '/') { + inLineComment = true; + i++; + continue; + } + if (ch === '/' && next === '*') { + inBlockComment = true; + i++; + continue; + } + if (ch === '"' || ch === "'" || ch === '`') { + inString = ch as '"' | "'" | '`'; + continue; + } + if (ch === '/' && /[(,=:!&|?+\-*%~^[{;]/.test(prev)) { + inRegex = true; + continue; + } + + if (ch === open) depth++; + else if (ch === close) { + depth--; + if (depth === 0) return i; + } + } + return -1; +} + +function lineNumberOf(src: string, index: number): number { + let line = 1; + for (let i = 0; i < index; i++) { + if (src[i] === '\n') line++; + } + return line; +} + +function pushIfNew(violations: Violation[], v: Violation): void { + // Avoid duplicate (file, line, rule) entries when a block contains both + // a snapshot violation and a block-level violation on the same line. + if (violations.some((x) => x.file === v.file && x.line === v.line && x.rule === v.rule)) return; + violations.push(v); +} + +function checkInlineSnapshots(src: string, file: string, violations: Violation[]): void { + const pattern = /\.toMatchInlineSnapshot\s*\(/g; + let match: RegExpExecArray | null = pattern.exec(src); + while (match !== null) { + const openIdx = match.index + match[0].length - 1; + const closeIdx = findMatchingClose(src, openIdx, '(', ')'); + if (closeIdx !== -1) { + const snippet = src.slice(openIdx + 1, closeIdx); + const lineCount = snippet.split('\n').length; + if (lineCount > LARGE_SNAPSHOT_THRESHOLD_LINES) { + pushIfNew(violations, { + file, + line: lineNumberOf(src, match.index), + rule: 'large-snapshot', + message: `inline snapshot is ${lineCount} lines (limit ${LARGE_SNAPSHOT_THRESHOLD_LINES})`, + }); + } + } + match = pattern.exec(src); + } +} + +function checkTestBlocks(src: string, file: string, violations: Violation[]): void { + TEST_OPENER_PATTERN.lastIndex = 0; + let match: RegExpExecArray | null = TEST_OPENER_PATTERN.exec(src); + while (match !== null) { + const openParenIdx = match.index + match[0].length - 1; + const closeParenIdx = findMatchingClose(src, openParenIdx, '(', ')'); + if (closeParenIdx === -1) { + match = TEST_OPENER_PATTERN.exec(src); + continue; + } + + const head = src.slice(openParenIdx, closeParenIdx + 1); + const bodyBraceMatch = /=>\s*\{|function[^(]*\([^)]*\)\s*\{|async\s*\([^)]*\)\s*=>\s*\{/.exec( + head, + ); + if (!bodyBraceMatch) { + match = TEST_OPENER_PATTERN.exec(src); + continue; + } + const bodyBraceIdx = openParenIdx + bodyBraceMatch.index + bodyBraceMatch[0].length - 1; + const bodyCloseIdx = findMatchingClose(src, bodyBraceIdx, '{', '}'); + if (bodyCloseIdx === -1) { + match = TEST_OPENER_PATTERN.exec(src); + continue; + } + const body = src.slice(bodyBraceIdx + 1, bodyCloseIdx); + const startLine = lineNumberOf(src, match.index); + + // Count any function call whose name starts with `expect` — covers both + // bare `expect(` and convention-based helpers like `expectUnauthorized(`, + // `expectJsonResponse(`, `expectForbidden(`, etc., which encapsulate + // assertion logic inside a named helper. + const expectLikeCalls = (body.match(/\bexpect[A-Za-z0-9]*\s*\(/g) ?? []).length; + + if (expectLikeCalls === 0) { + pushIfNew(violations, { + file, + line: startLine, + rule: 'assertion-free-test', + message: 'test block has no expect() or expect-helper calls', + }); + match = TEST_OPENER_PATTERN.exec(src); + continue; + } + + // only-tobedefined: every direct `expect(...)` ends in a weak matcher AND + // there are no expect-helper calls (helpers like expectUnauthorized are + // assumed to assert specific shape internally). + const bareExpectSites = [...body.matchAll(/\bexpect\s*\(/g)]; + const helperExpectSites = expectLikeCalls - bareExpectSites.length; + let weakCount = 0; + for (const m of bareExpectSites) { + const after = body.slice(m.index ?? 0, (m.index ?? 0) + 200); + if (WEAK_MATCHER_PATTERN.test(after)) weakCount++; + } + if ( + helperExpectSites === 0 && + bareExpectSites.length > 0 && + weakCount === bareExpectSites.length + ) { + pushIfNew(violations, { + file, + line: startLine, + rule: 'only-tobedefined', + message: + 'every expect() uses a non-specific matcher (toBeDefined / toBeTruthy / toBeFalsy / .not.toBeUndefined / .not.toBeNull) — assert specific values', + }); + } + + // bare-tohavebeencalled: `.toHaveBeenCalled()` present without + // `.toHaveBeenCalledWith(` or `.toHaveBeenCalledTimes(` in the same block. + const hasBareCalled = /\.toHaveBeenCalled\s*\(\s*\)/.test(body); + const hasArgMatcher = /\.toHaveBeenCalledWith\s*\(|\.toHaveBeenCalledTimes\s*\(/.test(body); + if (hasBareCalled && !hasArgMatcher) { + pushIfNew(violations, { + file, + line: startLine, + rule: 'bare-tohavebeencalled', + message: + '.toHaveBeenCalled() without .toHaveBeenCalledWith(...) or .toHaveBeenCalledTimes(N) — assert the call shape', + }); + } + + match = TEST_OPENER_PATTERN.exec(src); + } +} + +// Analyse a single source string and return the violations it contains. +// `file` is used purely as a label in the returned violations; this function +// does not read from disk and is safe to call from tests with inline source. +export function analyzeSource(file: string, src: string): Violation[] { + if (isFileDisabled(src)) return []; + const violations: Violation[] = []; + checkInlineSnapshots(src, file, violations); + checkTestBlocks(src, file, violations); + return violations; +} + +function isTestFile(name: string): boolean { + return /\.(test|spec)\.(ts|tsx|cts|mts)$/.test(name); +} + +function walkDir(dir: string, relPath: string, files: string[]): void { + let entries: string[]; + try { + entries = readdirSync(dir); + } catch { + return; + } + for (const entry of entries) { + if (EXCLUDED_DIRS.has(entry)) continue; + const full = join(dir, entry); + const rel = `${relPath}/${entry}`; + let isDir = false; + try { + isDir = statSync(full).isDirectory(); + } catch { + continue; + } + if (isDir) { + walkDir(full, rel, files); + } else if (isTestFile(entry)) { + files.push(rel); + } + } +} + +if (import.meta.main) { + const ROOT = join(import.meta.dir, '..', '..'); + const files: string[] = []; + for (const root of SCAN_ROOTS) { + walkDir(join(ROOT, root), root, files); + } + + const violations: Violation[] = []; + for (const file of files) { + let src: string; + try { + src = readFileSync(join(ROOT, file), 'utf-8'); + } catch { + continue; + } + for (const v of analyzeSource(file, src)) violations.push(v); + } + + if (violations.length > 0) { + console.log(`Weak assertion patterns found (${violations.length} violations):\n`); + for (const v of violations) { + console.log(`${v.file}:${v.line}:${v.rule}: ${v.message}`); + } + console.log( + '\nFix by asserting specific values, specific call shapes, or both. See docs/testing.md.', + ); + process.exit(1); + } + + console.log('No weak-assertion patterns found.'); +} diff --git a/scripts/vitest.config.ts b/scripts/vitest.config.ts new file mode 100644 index 0000000000..5e8f9107c9 --- /dev/null +++ b/scripts/vitest.config.ts @@ -0,0 +1,20 @@ +import { resolve } from 'node:path'; +import { defineConfig } from 'vitest/config'; + +/** + * Vitest configuration for the repo-level scripts in `scripts/`. + * + * Run with: bun test:scripts + * + * Custom lint scripts (`scripts/lint/*.ts`) and the coverage ratchet + * (`scripts/lint/coverage-ratchet.ts`) get their own test coverage via + * files under `scripts/lint/__tests__/`. + */ +export default defineConfig({ + test: { + name: 'scripts-unit', + environment: 'node', + globals: true, + include: [resolve(__dirname, '**/__tests__/**/*.test.ts')], + }, +}); From 827f617c1db049bc69852b187cb3c2127d0e2a36 Mon Sep 17 00:00:00 2001 From: Andrew Bierman Date: Tue, 19 May 2026 21:53:50 -0600 Subject: [PATCH 02/85] docs(plans): add 2026-05-19 coverage ratchet + quality gates plan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supersedes the 2026-05-17 plan. The threshold ramp (its U2) was obsoleted by upstream work landed between 2026-05-17 and 2026-05-19 (api/expo/analytics/mcp/overpass are now at 95%+ thresholds with refined exclude lists and added middleware/image/storage tests). The remaining novel work — ratchet (U6) + assertion-strength lint (U9) + docs migration (U1) — survives unchanged in intent; baselines must be re-captured against upstream's current configs. --- ...coverage-ratchet-and-quality-gates-plan.md | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 docs/plans/2026-05-19-001-chore-coverage-ratchet-and-quality-gates-plan.md diff --git a/docs/plans/2026-05-19-001-chore-coverage-ratchet-and-quality-gates-plan.md b/docs/plans/2026-05-19-001-chore-coverage-ratchet-and-quality-gates-plan.md new file mode 100644 index 0000000000..1c7a0b4af3 --- /dev/null +++ b/docs/plans/2026-05-19-001-chore-coverage-ratchet-and-quality-gates-plan.md @@ -0,0 +1,184 @@ +--- +type: plan +status: active +plan_type: chore +title: Coverage ratchet and assertion-strength gates +created: 2026-05-19 +worktree: .worktrees/chore/ramp-test-coverage +branch: chore/ramp-test-coverage +supersedes: docs/plans/2026-05-17-001-chore-test-coverage-ramp-and-ci-gate-plan.md +--- + +# chore: Coverage ratchet and assertion-strength gates + +## Summary + +A 2026-05-17 plan proposed a 9-unit, 4-phase ramp toward 95%+ coverage across the monorepo with a CI gate that blocks regressions. While that plan was being written, upstream `main` independently landed the threshold ramp itself — Vitest configs across `packages/api`, `apps/expo`, `packages/analytics`, `packages/mcp`, and `packages/overpass` now sit at 95/92/97/95 (or close), with refined exclude lists and added unit tests for middleware, image utils, and storage. **U2 of the original plan is therefore obsoleted by upstream.** + +What remains novel and ship-ready is the *enforcement* layer the original plan introduced: + +- **U6** — a coverage **ratchet** (`scripts/lint/coverage-ratchet.ts` + committed `coverage-baselines.json`) that fails CI if any tracked workspace drops below its baseline. Complements Vitest's per-config thresholds: thresholds enforce the floor, the ratchet enforces no-regression toward the tier target. +- **U9** — an **assertion-strength lint** (`scripts/lint/no-weak-assertions.ts`) that catches coverage theater patterns (assertion-free tests, bare `.toBeDefined()`, bare `.toHaveBeenCalled()`, oversized snapshots). +- **U1** — migrate the testing guide off the repo root (`TESTING.md` → `docs/testing.md`, per the "no random md in root" convention) and rewrite around the ratchet + lint, not the obsolete tier ramp. + +Future phases (consolidated coverage workflow / Stryker mutation testing / per-workspace backfills for the still-untested packages) are deferred to follow-up plans. + +--- + +## Problem Frame + +PackRat's Vitest configs now declare strict per-package coverage thresholds (largely 95%+), but the surrounding enforcement is thin: + +1. **No regression gate.** Vitest's `thresholds` block fails the build when a single config's run drops below its declared floor. There is nothing that fails CI when an existing workspace silently slides from 95% to 89% if the threshold is lowered as a "temporary unblock" — exactly the pattern the Elysia migration's PR-2083 history shows (`4c2c00d19 fix(ci): separate API type-check, lower coverage threshold`). A floor that can be edited by the same PR that drops below it is not a gate. +2. **No quality gate beyond line counts.** Coverage rewards *executing* code, not *asserting* on it. The standing failure mode is `expect(x).toBeDefined()` after a parse, or `expect(spy).toHaveBeenCalled()` without arg matching — both fully covered, both regression-blind. +3. **Doc hygiene.** `TESTING.md` lives at the repo root alongside `CLAUDE.md` and `README.md`. The repo convention is that only those last two belong at root; everything else goes under `docs/`. The testing guide itself was already out of date relative to upstream's threshold ramp. + +This plan adds the missing gates and the missing doc move. It does **not** lower or rewrite any existing Vitest threshold. + +--- + +## Scope Boundaries + +### In scope + +- A `scripts/lint/coverage-ratchet.ts` enforcement script + `scripts/lint/coverage-baseline-update.ts` (CI-only baseline bump) + committed `coverage-baselines.json` at the repo root. +- `bun check:coverage` / `bun check:coverage:update` package.json scripts. +- A unit test suite for the ratchet at `scripts/lint/__tests__/coverage-ratchet.test.ts`. +- `scripts/lint/no-weak-assertions.ts` and its unit test suite at `scripts/lint/__tests__/no-weak-assertions.test.ts`. +- `bun lint:weak-assertions` script and a `scripts/vitest.config.ts` for the scripts test suite. +- Migrating `TESTING.md` to `docs/testing.md` with content rewritten around current reality (upstream's 95%+ thresholds plus the new ratchet + lint). + +### Deferred to follow-up work + +- **Consolidated `.github/workflows/coverage.yml` matrix workflow** that runs every Tier A/B/C workspace, posts per-workspace PR comments, and invokes the ratchet. Out of scope for this PR — easier to land on top of the ratchet once the gate is in place. Tracked separately. +- **Stryker mutation testing** on `packages/api/src/{services,middleware,utils}/**` and the nightly workflow. Follow-up. +- **Backfilling tests in currently-untested workspaces** (`apps/{admin,trails,web}`, `packages/{guards,env,app,cli,checks,config,osm-db,osm-import,web-ui,api-client,ui}`). Some of these gained tests via upstream's work; the rest are a separate effort. +- **Wiring `bun lint:weak-assertions` into `scripts/check-all.ts`** as a blocking check. The lint currently surfaces a handful of real findings against the upstream codebase (mostly in `packages/analytics`); wiring into the gate requires a small cleanup PR first. + +### Out of scope + +- Lowering or rewriting any Vitest threshold upstream put in place. +- Adding Codecov / Coveralls integration. +- E2E / visual-regression / mutation testing. + +--- + +## Requirements + +| ID | Requirement | +|---|---| +| R1 | `coverage-baselines.json` lives at the repo root and records, per tracked workspace: `summaryPath`, four-metric baseline, and `recordedAt`. Updates happen via the baseline-update script on green merges to `main` — never manually edited in feature PRs. | +| R2 | `scripts/lint/coverage-ratchet.ts` exits non-zero on any metric dropping below the baseline (modulo `_epsilon` to absorb v8 jitter), on a missing summary file, or on a malformed summary. | +| R3 | The ratchet's analysis logic is testable in isolation. `scripts/lint/__tests__/coverage-ratchet.test.ts` covers happy path, regressions, missing summaries, malformed summaries, multi-metric regressions, and baseline parsing. | +| R4 | `scripts/lint/no-weak-assertions.ts` flags the four documented coverage-theater patterns (assertion-free tests, `only-tobedefined`, `bare-tohavebeencalled`, `large-snapshot`) and respects a file-level `// no-weak-assertions: disable` escape hatch. | +| R5 | The lint's analysis logic is testable. `scripts/lint/__tests__/no-weak-assertions.test.ts` covers each rule's positive/negative cases, the disable comment, expect-helper detection, and multi-violation files. | +| R6 | `bun check:coverage` runs the ratchet. `bun check:coverage:update` runs the baseline-update script. `bun lint:weak-assertions` runs the assertion lint. `bun test:scripts` runs both unit suites. | +| R7 | `TESTING.md` no longer exists at the repo root. `docs/testing.md` is the single canonical testing guide. `CLAUDE.md`, `README.md`, `copilot-instructions.md`, and any solutions doc that linked to the old path point to the new one. | +| R8 | Baselines for U6 are captured from real coverage runs against current upstream configs (not from the obsolete numbers in the superseded plan). | + +--- + +## Key Technical Decisions + +| Decision | Choice | Why | +|---|---|---| +| Threshold authority | **Keep upstream's existing Vitest thresholds; do not touch them.** The ratchet adds a second layer of enforcement on top. | Upstream already ramped to 95%+. Touching their numbers reopens decisions that were settled in PRs the merged into `main` between 2026-05-17 and 2026-05-19. | +| Ratchet implementation | **In-repo Bun script + committed `coverage-baselines.json`** | No external service, no new secrets, zero dependencies beyond the test runs that already happen. Mirrors `scripts/lint/no-duplicate-deps.ts` shape. | +| Baseline update flow | **CI-only on `main` post-merge** | Auto-commit baseline bumps after green runs so the floor only moves up. PRs cannot edit `coverage-baselines.json` to silently lower the gate. | +| Epsilon | **0.5 percentage points** | V8 coverage instrumentation has small run-to-run variance on identical code (observed ~0.16% on `packages/analytics` functions metric). Epsilon absorbs this without making the gate meaningless. | +| Assertion-strength rule strictness | **Flag only genuinely weak matchers** (`toBeDefined`, `toBeTruthy`, `toBeFalsy`, `.not.toBe{Undefined,Null}`). `toBeUndefined()` and `toBeNull()` alone are NOT flagged — they assert specific return values. | Avoids false positives on tests that legitimately assert "this returns null". | +| Helper-assertion detection | **Any call to `expect[A-Z][A-Za-z0-9]*(` counts as an assertion** (e.g., `expectUnauthorized(res)`, `expectJsonResponse(res)`). | `packages/api/test/` uses this convention extensively. Treating helpers as black-box assertions avoids flagging them as `assertion-free-test`. | +| Lint gate enable | **Add the command, defer wiring into `check-all.ts`** | Surfaces ~handful of real findings in current upstream code. Wiring into the blocking check requires a cleanup PR first; the script ships so contributors can run it manually until then. | +| Docs location | **`docs/testing.md`** | Per the "no random md in root" convention — `CLAUDE.md` and `README.md` are the only root markdown files. | + +--- + +## Implementation Units + +### U1. Migrate `TESTING.md` → `docs/testing.md` + +- **Goal:** Honor the "no random md in root" convention. Rewrite the testing guide around current reality (upstream's 95%+ thresholds + the ratchet + the lint). +- **Requirements:** R7 +- **Dependencies:** none +- **Files:** + - `TESTING.md` (delete) + - `docs/testing.md` (new — moved + rewritten) + - `CLAUDE.md` (update Testing section: link to new path, summarize ratchet + lint) + - `README.md` (badge link → `/docs/testing.md`) + - `copilot-instructions.md` (testing section: point at `docs/testing.md`, mention new scripts) + - `docs/solutions/ui-bugs/android-textinput-keyboard-focus-loss.md` (cross-ref link) +- **Approach:** The original `TESTING.md` content remains useful for patterns. The "Test Statistics" block at the bottom is out of date; replace it with a "Coverage Tier Model" matrix reflecting upstream's actual thresholds (api/expo/analytics/mcp at ~95/92/97/95). Add new sections for the ratchet and the assertion-strength lint. +- **Test scenarios:** Test expectation: none — documentation move with no behaviour change. Verify by grep: `rg -n 'TESTING\.md' .` returns no matches outside of `docs/plans/`. +- **Verification:** `TESTING.md` does not exist at root; `docs/testing.md` opens and renders; every `TESTING.md` reference in code, docs, and configs is updated. + +--- + +### U6. Coverage ratchet + baseline file + +- **Goal:** Add the regression-blocking gate. Vitest thresholds enforce the floor in each workspace; the ratchet ensures the floor cannot quietly slide down across PRs. +- **Requirements:** R1, R2, R3, R6, R8 +- **Dependencies:** none structurally; baselines must be captured from current upstream configs before commit +- **Files:** + - `coverage-baselines.json` (new — repo root) + - `scripts/lint/coverage-ratchet.ts` (new) + - `scripts/lint/coverage-baseline-update.ts` (new — CI-only) + - `scripts/lint/__tests__/coverage-ratchet.test.ts` (new — 13 scenarios) + - `package.json` (add `check:coverage` and `check:coverage:update`) +- **Approach:** + - Each workspace baseline carries: `summaryPath`, `tier`, four metric percentages, `recordedAt`. + - The ratchet reads `coverage-baselines.json` and each workspace's `coverage-summary.json`, compares per-metric with epsilon 0.5, and exits 0/1. + - Missing summary file → exit 1 ("silent skipping is exactly the regression mode this script exists to prevent"). + - Malformed summary (missing required `total` fields) → exit 1. + - The baseline-update script bumps numbers upward when current > baseline + epsilon; never lowers. Designed for `main`-only auto-commit, not for PR-time manual updates. + - Initial baselines captured fresh from `bun run --cwd test:coverage` against upstream's configs. +- **Patterns to follow:** `scripts/lint/no-duplicate-deps.ts` for CLI shape; existing `scripts/lint/__tests__/` once U9 lands for the test layout. +- **Test scenarios:** Per the ratchet test file: + - `compareWorkspace` returns `ok` / `improvement` / `regression` based on metric deltas. + - Tolerates noise below epsilon (default 0.5). + - Rejects drops just above epsilon. + - Reports multiple regressions in one workspace. + - `runRatchet` fails on missing or invalid summaries. + - `loadBaseline` parses workspace entries, honors `_epsilon`, falls back to default, and skips malformed entries. +- **Verification:** `bun check:coverage` runs cleanly on the captured baselines; `bun test:scripts` includes the 13 ratchet tests. + +--- + +### U9. Assertion-strength lint + +- **Goal:** Catch coverage theater (assertion-free tests, weak matchers, oversized snapshots, untyped mock calls) at lint time. +- **Requirements:** R4, R5, R6 +- **Dependencies:** none — cherry-picked from prior Phase 1 work +- **Files:** + - `scripts/lint/no-weak-assertions.ts` + - `scripts/lint/__tests__/no-weak-assertions.test.ts` (16 scenarios) + - `scripts/vitest.config.ts` (for `bun test:scripts`) + - `package.json` (add `lint:weak-assertions`, `test:scripts`) +- **Approach:** See Key Technical Decisions for the matcher-strictness and helper-detection rules. +- **Patterns to follow:** `scripts/lint/no-raw-typeof.ts` for shape. +- **Test scenarios:** Each of the four rules: positive case (flags), negative case (does not flag), edge cases (`it.todo`, helper assertions, disable comment, multi-violation file). +- **Verification:** `bun test:scripts` passes; `bun lint:weak-assertions` runs in under 5 seconds across the repo and surfaces only the small set of known-current findings. + +--- + +## Risk Analysis & Mitigation + +| Risk | Likelihood | Impact | Mitigation | +|---|---|---|---| +| Captured baselines too aggressive → first PR red | Medium | Medium | Epsilon 0.5 absorbs v8 jitter. Baselines captured from clean coverage runs, not from older numbers. | +| Lint flags too many existing tests | Confirmed (handful) | Low | Lint command exists but is NOT wired into the blocking `check-all.ts` until a separate cleanup PR. File-level `// no-weak-assertions: disable` escape hatch is available for grandfathered tests if needed. | +| Baseline file becomes a merge-conflict magnet | Low | Low | Updates happen only on `main` via the post-merge auto-commit. PR-level edits to `coverage-baselines.json` are not the workflow. | +| Future Vitest threshold change diverges from baseline | Low | Medium | Both layers gate independently. Vitest threshold is per-config; ratchet is per-baseline. A drop will trip whichever has a stricter floor — that's the right behaviour. Document the dual-layer model in `docs/testing.md`. | + +--- + +## Verification Strategy + +- **U1**: `rg -n 'TESTING\.md' .` returns no matches outside `docs/plans/`. `docs/testing.md` exists and renders. `bun check` passes. +- **U6**: `bun check:coverage` exits 0 on committed baselines; intentionally tweaking a baseline number downward makes a fresh coverage run trip the ratchet. `bun test:scripts` includes 13 ratchet tests, all passing. +- **U9**: `bun test:scripts` includes 16 assertion-lint tests, all passing. `bun lint:weak-assertions` runs in under 5 seconds and produces a stable list of findings. + +--- + +## Origin + +This plan supersedes `docs/plans/2026-05-17-001-chore-test-coverage-ramp-and-ci-gate-plan.md`. That document remains in place as historical context — its U2 (threshold ramp) was made obsolete by upstream work between 2026-05-17 and 2026-05-19, while its U1/U6/U9 carry forward unchanged in intent (only the baseline numbers were re-captured against the updated configs). From af2f382436c09cca007aca674f30ab49f75669b0 Mon Sep 17 00:00:00 2001 From: Andrew Bierman Date: Tue, 19 May 2026 21:56:25 -0600 Subject: [PATCH 03/85] docs(testing): migrate TESTING.md to docs/testing.md Move the testing guide out of the repo root per the "no random md in root" convention (only CLAUDE.md + README.md belong there). The new docs/testing.md is rewritten around current reality: - Per-workspace Vitest thresholds (95%+ across packages/api, apps/expo, packages/mcp; 80% on packages/{analytics,overpass}; 100% on packages/units). - The coverage ratchet (docs reference; the script itself lands in the next commit). - The assertion-strength lint (docs the four rules + escape hatch). - The two-layer enforcement model: Vitest thresholds enforce the floor, the ratchet enforces no-regression toward target. Update CLAUDE.md with a Testing Policy summary linking to the new doc. Update copilot-instructions.md, README.md, and the android-textinput solutions doc that referenced the old path. Delete the dead test:api-client:types script that points at a non-existent vitest config. --- CLAUDE.md | 14 +- README.md | 2 +- TESTING.md | 453 ------------------ copilot-instructions.md | 12 +- .../android-textinput-keyboard-focus-loss.md | 2 +- docs/testing.md | 315 ++++++++++++ package.json | 1 - 7 files changed, 336 insertions(+), 463 deletions(-) delete mode 100644 TESTING.md create mode 100644 docs/testing.md diff --git a/CLAUDE.md b/CLAUDE.md index e86a7939f6..c035ff2bf8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -43,9 +43,13 @@ bun format # Biome format --write bun check # Biome check (no auto-fix, CI mode) bun check-types # tsc --noEmit -# Testing -bun test:api:unit # API unit tests (Vitest + Cloudflare pool) -bun test:expo # Expo tests (Vitest) +# Testing — see docs/testing.md for the full policy +bun test:api:unit # API unit tests (Vitest, Node env, deps mocked) +bun test:expo # Expo pure-TS tests (Vitest) +bun test:mcp # MCP package tests +bun test:scripts # scripts/lint analyzer tests (ratchet + assertion lint) +bun check:coverage # Coverage ratchet — fails on regression vs coverage-baselines.json +bun lint:weak-assertions # Catches assertion-free tests, bare .toBeDefined / .toHaveBeenCalled, oversized snapshots # Dependencies bun install # Install all workspaces (takes 120s+, never cancel) @@ -56,6 +60,10 @@ bun fix:deps # manypkg auto-fix dependency issues bun bump # Bump monorepo version ``` +## Testing Policy (summary) + +PackRat enforces coverage at two layers: each workspace's `vitest.config.ts` declares per-metric thresholds (mostly 95%+; `packages/units` 100%, `packages/{analytics,overpass}` 80%), and a **coverage ratchet** (`bun check:coverage` against `coverage-baselines.json`) blocks any PR that lowers a workspace's coverage. An **assertion-strength lint** (`bun lint:weak-assertions`) flags coverage-theater patterns (assertion-free tests, bare `.toBeDefined()`, bare `.toHaveBeenCalled()`, oversized snapshots). `packages/api` integration tests still run (`api-tests.yml`) but are not coverage-counted — V8 instrumentation is unsupported under the Cloudflare Workers pool. Full policy and patterns: **`docs/testing.md`**. + ## Code Style Enforced by **Biome 2.0** via lefthook pre-commit hook: diff --git a/README.md b/README.md index dbb2b34341..ecf9620d72 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ So pack your bags, grab your friends, and get ready for your next adventure with [![view - Documentation](https://img.shields.io/badge/view-Documentation-blue?style=for-the-badge)](/docs/ "Go to project documentation") -[![view - Testing Guide](https://img.shields.io/badge/view-Testing_Guide-green?style=for-the-badge)](/TESTING.md "Go to testing documentation") +[![view - Testing Guide](https://img.shields.io/badge/view-Testing_Guide-green?style=for-the-badge)](/docs/testing.md "Go to testing documentation") diff --git a/TESTING.md b/TESTING.md deleted file mode 100644 index 74b5939779..0000000000 --- a/TESTING.md +++ /dev/null @@ -1,453 +0,0 @@ -# Unit Testing Guide for PackRat - -This document outlines testing standards and patterns used in the PackRat codebase. - -## Overview - -PackRat uses **Vitest** as its primary testing framework across both API and Expo layers. This guide demonstrates the patterns established in our unit test suite. - ---- - -## Testing Infrastructure - -### API Layer (packages/api) - -**Configuration Files:** -- `vitest.config.ts` - Full integration tests with PostgreSQL + Cloudflare Workers -- `vitest.unit.config.ts` - Pure unit tests with mocked dependencies (recommended for most unit tests) - -**Commands:** -```bash -# From packages/api -bun test # Full integration tests (requires Docker) -bun test:unit # Unit tests only (no database) -bun test:unit:coverage # Unit tests with coverage report - -# From monorepo root -bun test:api:unit -``` - -**Coverage Configuration:** -- **Provider:** v8 -- **Reports:** text, lcov, html -- **Directory:** `packages/api/coverage/unit/` -- **Target:** 80%+ coverage for critical paths - -### Expo Layer (apps/expo) - -**Configuration File:** -- `vitest.config.ts` - Node environment for pure utility functions - -**Commands:** -```bash -# From apps/expo -bun test # Run utility tests -bun test:coverage # With coverage report - -# From monorepo root -bun test:expo -``` - -**Coverage Configuration:** -- **Provider:** v8 -- **Reports:** text, lcov, html -- **Directory:** `apps/expo/coverage/unit/` -- **Target:** 75%+ coverage for utility functions - -**Note:** Currently limited to pure utility functions. React Native hooks and components require additional setup (e.g., @testing-library/react-native). - ---- - -## Test Patterns - -### Pattern 1: Service Tests with Mocked Dependencies - -**Example:** `/packages/api/src/services/__tests__/catalogService.test.ts` - -```typescript -import { beforeEach, describe, expect, it, vi } from 'vitest'; -import { CatalogService } from '../catalogService'; -import * as embeddingService from '@packrat/api/services/embeddingService'; - -// Module-level mocks (hoisted) -vi.mock('@packrat/api/db', () => ({ - createDb: vi.fn(), - createDbClient: vi.fn(), -})); - -vi.mock('@packrat/api/services/embeddingService', () => ({ - generateEmbedding: vi.fn(), - generateManyEmbeddings: vi.fn(), -})); - -// Test suite -describe('CatalogService', () => { - let service: CatalogService; - - beforeEach(() => { - vi.clearAllMocks(); - service = new CatalogService(makeEnv(), false); - }); - - describe('vectorSearch', () => { - beforeEach(() => { - vi.mocked(embeddingService.generateEmbedding) - .mockResolvedValue(new Array(1536).fill(0.1)); - }); - - it('returns empty result for empty query string', async () => { - const result = await service.vectorSearch('', 10, 0); - - expect(result).toEqual({ - items: [], - total: 0, - limit: 10, - offset: 0, - nextOffset: 10, - }); - expect(embeddingService.generateEmbedding).not.toHaveBeenCalled(); - }); - }); -}); -``` - -**Key Points:** -- Use `vi.mock()` for module-level mocks (these are hoisted) -- Import mocked modules for type-safe access (e.g., `import * as embeddingService`) -- Use `vi.mocked()` for type-safe mock assertions -- Clear mocks in `beforeEach()` for test isolation - -### Pattern 2: API Service Tests with Fetch Mocking - -**Example:** `/packages/api/src/services/__tests__/weatherService.test.ts` - -```typescript -import { beforeEach, describe, expect, it, vi } from 'vitest'; -import { WeatherService } from '../weatherService'; - -describe('WeatherService', () => { - let service: WeatherService; - let fetchMock: ReturnType; - - beforeEach(() => { - vi.clearAllMocks(); - mockContext = makeMockContext(); - service = new WeatherService(mockContext); - - // Mock global fetch - fetchMock = vi.fn(); - global.fetch = fetchMock; - }); - - describe('getWeatherForLocation', () => { - it('returns formatted weather data for valid location', async () => { - const mockResponse = { - main: { temp: 72.5, humidity: 65 }, - weather: [{ main: 'Clear' }], - wind: { speed: 8.3 }, - }; - - fetchMock.mockResolvedValue({ - ok: true, - json: async () => mockResponse, - }); - - const result = await service.getWeatherForLocation('San Francisco'); - - expect(result.temperature).toBe(73); // Rounded - expect(result.conditions).toBe('Clear'); - }); - }); -}); -``` - -**Key Points:** -- Mock `global.fetch` in `beforeEach()` for fresh state -- Use `mockResolvedValue` for successful responses -- Test both success and error paths -- Verify API was called with correct parameters - -### Pattern 3: Pure Utility Function Tests - -**Example:** `/apps/expo/features/packs/utils/__tests__/convertToGrams.test.ts` - -```typescript -import { describe, expect, it } from 'vitest'; -import { convertToGrams } from '../convertToGrams'; - -describe('convertToGrams', () => { - describe('metric conversions', () => { - it('returns same value for grams', () => { - expect(convertToGrams(100, 'g')).toBe(100); - expect(convertToGrams(0, 'g')).toBe(0); - expect(convertToGrams(1, 'g')).toBe(1); - }); - - it('converts kilograms to grams correctly', () => { - expect(convertToGrams(1, 'kg')).toBe(1000); - expect(convertToGrams(2.5, 'kg')).toBe(2500); - }); - }); - - describe('edge cases', () => { - it('handles zero weight', () => { - expect(convertToGrams(0, 'kg')).toBe(0); - }); - - it('returns original value for unknown units', () => { - expect(convertToGrams(100, 'invalid')).toBe(100); - }); - }); -}); -``` - -**Key Points:** -- No mocking needed for pure functions -- Group related tests with nested `describe()` blocks -- Test edge cases (zero, negative, invalid input) -- Use `toBe()` for exact values, `toBeCloseTo()` for floating point -- Test real-world scenarios to ensure practical correctness - ---- - -## Best Practices - -### 1. Test Organization - -```typescript -describe('ServiceName', () => { - // Setup - let service: ServiceName; - - beforeEach(() => { - // Reset mocks and create fresh instances - }); - - describe('methodName', () => { - // Group related tests - - describe('when condition', () => { - // Nested context-specific tests - }); - }); -}); -``` - -### 2. Mock Isolation - -```typescript -beforeEach(() => { - vi.clearAllMocks(); // Reset all mock state - // Re-create service instances - // Set default mock return values -}); -``` - -### 3. Input Validation Tests - -Always test: -- ✅ Valid inputs (happy path) -- ✅ Invalid inputs (error paths) -- ✅ Edge cases (empty, null, undefined, zero, negative) -- ✅ Boundary conditions (min/max values) - -### 4. Floating Point Comparisons - -```typescript -// ❌ Don't use exact equality for floats -expect(convertToGrams(1, 'oz')).toBe(28.3495); - -// ✅ Use toBeCloseTo() with appropriate precision -expect(convertToGrams(1, 'oz')).toBeCloseTo(28.3495, 4); -``` - -### 5. Async Testing - -```typescript -// Test async functions -it('handles async operation', async () => { - const result = await service.fetchData(); - expect(result).toBeDefined(); -}); - -// Test error handling -it('throws on invalid input', async () => { - await expect(service.process(null)).rejects.toThrow('Invalid input'); -}); -``` - -### 6. Mock Configuration Patterns - -```typescript -// Default behavior for all tests in describe block -beforeEach(() => { - mockFunction.mockResolvedValue(defaultValue); -}); - -// Override for specific test -it('handles special case', async () => { - mockFunction.mockResolvedValueOnce(specialValue); - // ... -}); -``` - ---- - -## Coverage Guidelines - -### What to Test (Priority Order) - -1. **Critical Business Logic** - - Payment processing - - User authentication - - Data validation - - Core algorithms - -2. **Public APIs** - - All exported functions - - All route handlers - - Service methods - -3. **Edge Cases** - - Null/undefined handling - - Empty collections - - Boundary values - - Invalid inputs - -4. **Error Paths** - - Exception handling - - Validation errors - - Network failures - - Database errors - -### What NOT to Test - -- Third-party library internals -- Simple getters/setters with no logic -- Generated code -- Configuration files -- Type definitions - ---- - -## Running Tests in CI - -### API Tests (GitHub Actions) - -```yaml -- name: Run API Unit Tests - run: | - cd packages/api - bun test:unit --coverage -``` - -### Expo Tests (GitHub Actions) - -```yaml -- name: Run Expo Tests - run: | - cd apps/expo - bun test --coverage -``` - -### Coverage Reports - -Coverage reports are generated in: -- API: `packages/api/coverage/unit/` -- Expo: `apps/expo/coverage/unit/` - -Open `index.html` to view detailed coverage reports locally. - ---- - -## Troubleshooting - -### "Cannot access before initialization" Error - -**Problem:** Trying to use a variable declared after `vi.mock()` - -**Solution:** Import the mocked module after the mock declaration - -```typescript -// ❌ Won't work - hoisting issue -const mockFn = vi.fn(); -vi.mock('./module', () => ({ fn: mockFn })); - -// ✅ Works - import after mock -vi.mock('./module', () => ({ fn: vi.fn() })); -import * as module from './module'; -// Use vi.mocked(module.fn) in tests -``` - -### Mock Not Resetting Between Tests - -**Problem:** Mock state persists across tests - -**Solution:** Always call `vi.clearAllMocks()` in `beforeEach()` - -```typescript -beforeEach(() => { - vi.clearAllMocks(); // Resets all mock history and implementations -}); -``` - -### Floating Point Precision Errors - -**Problem:** `expect(0.1 + 0.2).toBe(0.3)` fails due to floating point arithmetic - -**Solution:** Use `toBeCloseTo()` with appropriate precision - -```typescript -expect(0.1 + 0.2).toBeCloseTo(0.3, 10); // 10 decimal places -``` - ---- - -## Resources - -- [Vitest Documentation](https://vitest.dev/) -- [Testing Best Practices](https://testingjavascript.com/) -- [Mocking in Vitest](https://vitest.dev/guide/mocking.html) -- [Coverage Configuration](https://vitest.dev/guide/coverage.html) - ---- - -## Test Statistics (Current) - -### API Layer -- **Test Files:** 8 -- **Tests:** 101 passing -- **Coverage Target:** 80%+ - -### Expo Layer -- **Test Files:** 8 -- **Tests:** 93 passing (excluding pre-existing failures) -- **Coverage Target:** 75%+ - -### Recent Additions -- ✅ `CatalogService` - Vector search, batch operations, input validation -- ✅ `WeatherService` - API calls, error handling, data transformations -- ✅ `convertToGrams` - Unit conversions, edge cases, real-world scenarios -- ✅ `convertFromGrams` - Reverse conversions, precision handling - ---- - -## Contributing - -When adding new features: - -1. **Write tests first** (TDD approach) or alongside implementation -2. **Aim for 80%+ coverage** for new code -3. **Test all code paths** including error cases -4. **Use existing patterns** from this guide -5. **Update this document** if introducing new patterns - -When fixing bugs: - -1. **Write a failing test** that reproduces the bug -2. **Fix the bug** until the test passes -3. **Verify** no regressions with full test suite - ---- - -*Last Updated: 2026-04-01* diff --git a/copilot-instructions.md b/copilot-instructions.md index f52facf997..b158ea8fc2 100644 --- a/copilot-instructions.md +++ b/copilot-instructions.md @@ -88,10 +88,14 @@ cd apps/guides && bun dev - NEVER CANCEL: Takes ~1.4 seconds to start, set timeout to 30+ seconds - Runs on `http://localhost:3001` (if 3000 is taken) -#### **Testing** -- **API Unit Tests**: `bun test:api:unit` -- NEVER CANCEL: Takes ~5 seconds -- **Expo Tests**: `bun test:expo` -- runs Expo/React Native unit tests -- Tests run sequentially (`fileParallelism: false` in `packages/api/vitest.unit.config.ts`) to avoid database deadlocks +#### **Testing** — see `docs/testing.md` for the full policy +- **API Unit Tests**: `bun test:api:unit` -- Node env, deps mocked. Runtime varies with suite size +- **Expo Tests**: `bun test:expo` -- Vitest, pure-TS modules only (no native imports) +- **MCP Tests**: `bun test:mcp` +- **Scripts Tests**: `bun test:scripts` -- analyzer tests for the coverage ratchet and assertion lint +- **Coverage ratchet**: `bun check:coverage` -- compares each tracked workspace's `coverage/[unit/]coverage-summary.json` against `coverage-baselines.json` at the repo root. Fails CI on regression +- **Assertion-strength lint**: `bun lint:weak-assertions` -- catches assertion-free tests, bare `.toBeDefined()`, bare `.toHaveBeenCalled()`, oversized inline snapshots +- **Integration tests** (`bun run --cwd packages/api test`): require Docker (Postgres + neon-wsproxy), run sequentially (`fileParallelism: false`) to avoid database deadlocks. NOT coverage-counted (V8 unsupported under Cloudflare Workers pool) - Tests expect environment variables to be configured (see `.env.example`) #### **Build Commands** diff --git a/docs/solutions/ui-bugs/android-textinput-keyboard-focus-loss.md b/docs/solutions/ui-bugs/android-textinput-keyboard-focus-loss.md index 7af27b181c..1933e67f11 100644 --- a/docs/solutions/ui-bugs/android-textinput-keyboard-focus-loss.md +++ b/docs/solutions/ui-bugs/android-textinput-keyboard-focus-loss.md @@ -325,7 +325,7 @@ const inputRef = useRef(null); ## Cross References - **Architecture**: [CLAUDE.md](../../../CLAUDE.md#L79-L96) - Mobile app architecture patterns -- **Testing**: [TESTING.md](../../../TESTING.md#L57-L61) - Mobile component testing patterns +- **Testing**: [testing.md](../../testing.md) - Mobile component testing patterns - **Component Patterns**: Enhanced component pattern can be applied to other third-party UI components ## Verification diff --git a/docs/testing.md b/docs/testing.md new file mode 100644 index 0000000000..4d53ad8a37 --- /dev/null +++ b/docs/testing.md @@ -0,0 +1,315 @@ +# PackRat Testing Guide + +PackRat uses **Vitest** across every workspace that runs tests. This document is the source of truth for: + +- the **per-workspace coverage thresholds** that each Vitest config enforces +- the **coverage ratchet** that gates PRs in CI against regression +- the **assertion-strength lint** that catches coverage theater +- per-pattern testing conventions for services, fetch mocking, and pure utilities + +The current numbers below reflect the state of the configs on `main`. The policy that produced them is tracked in `docs/plans/2026-05-19-001-chore-coverage-ratchet-and-quality-gates-plan.md` (and its 2026-05-17 predecessor). + +--- + +## Coverage Thresholds — Two Layers + +PackRat gates coverage at **two layers** that fail builds for different reasons: + +1. **Vitest per-config thresholds** (declared in each workspace's `vitest.config.ts` / `vitest.unit.config.ts`) fail that workspace's coverage run when *its own* numbers drop below the floor. +2. **The coverage ratchet** (`scripts/lint/coverage-ratchet.ts` + `coverage-baselines.json`) fails the build when *any* tracked workspace drops below the baseline recorded for it on the last green `main`. The ratchet defends the threshold itself — if a PR lowers a Vitest threshold and the coverage drops accordingly, the Vitest gate passes but the ratchet does not. + +Current per-workspace thresholds (all four metrics: lines / branches / functions / statements): + +| Workspace | Lines | Branches | Functions | Statements | +|---|---:|---:|---:|---:| +| `packages/api` (unit suite) | 95 | 92 | 97 | 95 | +| `apps/expo` | 95 | 92 | 97 | 95 | +| `packages/mcp` | 95 | 90 | 95 | 95 | +| `packages/analytics` | 80 | 80 | 85 | 80 | +| `packages/overpass` | 80 | 70 | 80 | 80 | +| `packages/units` | 100 | 100 | 100 | 100 | + +`packages/api` integration tests (the `@cloudflare/vitest-pool-workers` suite in `vitest.config.ts`) are **not** counted toward coverage. V8 coverage is unsupported under the Workers pool and the Istanbul path has an open upstream regression. The unit suite (`vitest.unit.config.ts`) is the coverage source of truth for that workspace. Integration tests still run in `api-tests.yml`. + +Untracked (no coverage threshold today): `apps/{admin,trails,web,landing,guides}`, `packages/{cli,osm-db,osm-import,web-ui,api-client,ui,guards,env,app,checks,config}`. These are deferred to follow-up plans. + +--- + +## Coverage Ratchet + +Every PR is gated by a ratchet that fails CI if any workspace's coverage drops below the baseline in `coverage-baselines.json` (committed at the repo root). + +```bash +# Local check — reads each workspace's coverage/[unit/]coverage-summary.json +# and compares to coverage-baselines.json. Exits 1 on any regression. +bun check:coverage +``` + +On a green push to `main`, the consolidated coverage workflow (deferred to a follow-up plan) auto-commits any baseline improvements back to `coverage-baselines.json` via: + +```bash +bun check:coverage:update +``` + +The baseline only ever moves up. There is no manual edit step in the normal flow. + +To run coverage for a single workspace: + +```bash +bun run --cwd packages/api test:unit:coverage +bun run --cwd apps/expo test:coverage +bun run --cwd packages/mcp test --coverage +bun run --cwd packages/analytics test --coverage +bun run --cwd packages/overpass test --coverage +bun run --cwd packages/units test --coverage +``` + +To run the unit suite for the scripts themselves: + +```bash +bun test:scripts +``` + +When a workspace's coverage genuinely improves, the ratchet's output reports the improvement and prints what the baseline-update script would commit — but day-to-day you don't apply it by hand: CI does it on merge to `main`. + +--- + +## Assertion-Strength Lint + +`scripts/lint/no-weak-assertions.ts` walks every `*.test.ts` / `*.test.tsx` file under `apps/*` and `packages/*` and flags four coverage-theater patterns: + +| Rule | Flags | +|---|---| +| `assertion-free-test` | `it(...)` / `test(...)` blocks with zero `expect(...)` calls. Helper assertions (any call whose name starts with `expect`, e.g. `expectUnauthorized(res)`, `expectJsonResponse(res)`) count as assertions and prevent this rule from firing. | +| `only-tobedefined` | `it(...)` blocks whose only assertions are `.toBeDefined()`, `.toBeTruthy()`, `.toBeFalsy()`, `.not.toBeUndefined()`, or `.not.toBeNull()`. **`.toBeUndefined()` and `.toBeNull()` alone are NOT flagged** — they assert specific return values. | +| `bare-tohavebeencalled` | `.toHaveBeenCalled()` without a matching `.toHaveBeenCalledWith(...)` or `.toHaveBeenCalledTimes(N)` in the same block. | +| `large-snapshot` | `toMatchInlineSnapshot(...)` body > 50 lines. | + +Run with: + +```bash +bun lint:weak-assertions +``` + +File-level escape hatch: `// no-weak-assertions: disable` in the first 5 lines of a file skips the entire file. Use sparingly — grandfathered tests only. + +--- + +## Test Patterns + +### Pattern 1 — Service tests with mocked dependencies + +```ts +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { CatalogService } from '../catalogService'; +import * as embeddingService from '@packrat/api/services/embeddingService'; + +vi.mock('@packrat/api/db', () => ({ + createDb: vi.fn(), + createDbClient: vi.fn(), +})); + +vi.mock('@packrat/api/services/embeddingService', () => ({ + generateEmbedding: vi.fn(), + generateManyEmbeddings: vi.fn(), +})); + +describe('CatalogService', () => { + let service: CatalogService; + + beforeEach(() => { + vi.clearAllMocks(); + service = new CatalogService(makeEnv(), false); + }); + + describe('vectorSearch', () => { + beforeEach(() => { + vi.mocked(embeddingService.generateEmbedding).mockResolvedValue( + new Array(1536).fill(0.1), + ); + }); + + it('returns empty result for empty query string', async () => { + const result = await service.vectorSearch('', 10, 0); + expect(result).toEqual({ + items: [], + total: 0, + limit: 10, + offset: 0, + nextOffset: 10, + }); + expect(embeddingService.generateEmbedding).not.toHaveBeenCalled(); + }); + }); +}); +``` + +Reference: `packages/api/src/services/__tests__/catalogService.test.ts` + +Key points: +- `vi.mock()` for module-level mocks (hoisted to the top of the file). +- `import * as service` then `vi.mocked(service.fn)` for type-safe mock assertions. +- `vi.clearAllMocks()` in `beforeEach()` for test isolation. + +### Pattern 2 — API service tests with fetch mocking + +```ts +beforeEach(() => { + vi.clearAllMocks(); + fetchMock = vi.fn(); + global.fetch = fetchMock; +}); + +it('returns formatted weather data for valid location', async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => ({ main: { temp: 72.5, humidity: 65 }, weather: [{ main: 'Clear' }] }), + }); + const result = await service.getWeatherForLocation('San Francisco'); + expect(result.temperature).toBe(73); + expect(result.conditions).toBe('Clear'); +}); +``` + +Reference: `packages/api/src/services/__tests__/weatherService.test.ts` + +### Pattern 3 — Pure utility function tests + +```ts +import { describe, expect, it } from 'vitest'; +import { convertToGrams } from '../convertToGrams'; + +describe('convertToGrams', () => { + describe('metric conversions', () => { + it('returns same value for grams', () => { + expect(convertToGrams(100, 'g')).toBe(100); + expect(convertToGrams(0, 'g')).toBe(0); + }); + + it('converts kilograms to grams correctly', () => { + expect(convertToGrams(1, 'kg')).toBe(1000); + expect(convertToGrams(2.5, 'kg')).toBe(2500); + }); + }); + + describe('edge cases', () => { + it('handles zero weight', () => { + expect(convertToGrams(0, 'kg')).toBe(0); + }); + + it('returns original value for unknown units', () => { + expect(convertToGrams(100, 'invalid')).toBe(100); + }); + }); +}); +``` + +Reference: `apps/expo/features/packs/utils/__tests__/convertToGrams.test.ts` + +Floating-point comparisons: + +```ts +// Avoid exact equality for floats +expect(convertToGrams(1, 'oz')).toBeCloseTo(28.3495, 4); +``` + +### Pattern 4 — Integration tests (`packages/api/test/*.test.ts`) + +Run with `bun test` from `packages/api/`. Requires Docker (Postgres + neon-wsproxy via `docker-compose.test.yml`). Auth helpers live in `packages/api/test/utils/test-helpers.ts` — use `apiWithAuth`, `apiWithAdmin`, `apiWithApiKey`, never construct sessions by hand. + +Test fixtures must seed users through `userService.createUser`. Do not write new integration tests that `db.insert(users).values(...)` directly. + +--- + +## What to Test (Priority Order) + +For every feature-bearing implementation unit, include scenarios from each category that applies: + +1. **Happy path** — core functionality with expected inputs and outputs. +2. **Edge cases** — boundary values, empty inputs, nullish states, concurrent access. +3. **Error paths** — invalid input, downstream service failures, timeout behavior, permission denials. +4. **Integration** — behaviors that mocks alone will not prove (callback chains, middleware, multi-layer interactions). + +Avoid testing: +- Third-party library internals. +- Pure getters/setters with no logic. +- Generated code (drizzle migrations, OpenAPI types). +- Configuration files. +- Pure type definitions. + +--- + +## Commands + +```bash +# Per-workspace coverage +bun test:api:unit # packages/api unit suite (Node env, all deps mocked) +bun test:expo # apps/expo pure-TS tests +bun test:mcp # packages/mcp +bun run --cwd packages/units test +bun run --cwd packages/overpass test +bun run --cwd packages/analytics test + +# Integration (requires Docker) +bun run --cwd packages/api test # full pool-workers integration suite + +# Coverage gates +bun check:coverage # ratchet against coverage-baselines.json +bun lint:weak-assertions # custom lint over test files + +# Scripts test suite (ratchet + lint analyzer) +bun test:scripts +``` + +Coverage reports for each workspace: +- `packages/api/coverage/unit/index.html` +- `apps/expo/coverage/unit/index.html` +- `packages/mcp/coverage/index.html` +- `packages/analytics/coverage/index.html` +- `packages/overpass/coverage/index.html` +- `packages/units/coverage/index.html` + +--- + +## Troubleshooting + +### "Cannot access before initialization" in test files + +`vi.mock()` calls are hoisted to the top of the file by Vitest. Variables declared after the hoisted mock cannot be referenced inside it. + +```ts +// Won't work +const mockFn = vi.fn(); +vi.mock('./module', () => ({ fn: mockFn })); + +// Works +vi.mock('./module', () => ({ fn: vi.fn() })); +import * as module from './module'; +// Use vi.mocked(module.fn) inside tests +``` + +### Mock not resetting between tests + +Always call `vi.clearAllMocks()` in `beforeEach()`. Without it, call histories leak across tests. + +### Floating-point precision errors + +```ts +expect(0.1 + 0.2).toBeCloseTo(0.3, 10); // 10 decimal places +``` + +### Coverage ratchet fails locally but passes in CI + +Coverage outputs are workspace-local. Make sure you ran `--coverage` for the workspace that's failing — the ratchet treats a missing `coverage-summary.json` as a regression on purpose (silent skipping is exactly the mode the gate exists to prevent). + +### Lint flags a legitimate test as `assertion-free-test` + +Helpers whose names start with `expect` count as assertions. If your helper is named differently (e.g., `assertResponseShape(res)`), the lint will not see it. Either rename to `expectShape(res)` or add the file-level `// no-weak-assertions: disable` comment. + +--- + +## Resources + +- [Vitest Documentation](https://vitest.dev/) +- [Cloudflare Vitest pool — known issues](https://developers.cloudflare.com/workers/testing/vitest-integration/known-issues/) (why integration tests are not coverage-instrumented) +- The plan that established the ratchet + lint policy: `docs/plans/2026-05-19-001-chore-coverage-ratchet-and-quality-gates-plan.md` diff --git a/package.json b/package.json index 81492e2a23..63187e5b92 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,6 @@ "mcp": "bun run --cwd packages/mcp dev", "mcp:deploy": "bun run --cwd packages/mcp deploy", "test:api:unit": "vitest run --config packages/api/vitest.unit.config.ts", - "test:api-client:types": "vitest run --config packages/api-client/vitest.config.ts", "test:e2e:android": "bash .github/scripts/e2e.sh android", "test:e2e:ios": "bash .github/scripts/e2e.sh ios", "test:expo": "vitest run --config apps/expo/vitest.config.ts", From b4bf4e03be8d4352edd5659a26366d393840a9ce Mon Sep 17 00:00:00 2001 From: Andrew Bierman Date: Tue, 19 May 2026 21:57:18 -0600 Subject: [PATCH 04/85] ci(coverage): add ratchet script and baseline file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The coverage ratchet is the in-repo gate that fails CI if any tracked workspace's coverage drops below its baseline (modulo a small epsilon to absorb v8 instrumentation noise). Pairs with the per-config Vitest thresholds that upstream landed — thresholds enforce the floor, the ratchet enforces no-regression toward the tier target and prevents the "lower threshold to unblock, never re-raise" pattern. New files: - coverage-baselines.json : committed source of truth at repo root. Keyed by workspace path; records the summaryPath, tier, four-metric floor, and recordedAt. Initial baselines captured from actual coverage runs on 2026-05-19: packages/api 98.31 / 95.43 / 100 / 98.31 apps/expo 97.36 / 95.00 / 100 / 97.36 packages/mcp 98.87 / 98.38 / 100 / 98.87 packages/analytics 84.48 / 83.33 / 89.13 / 84.48 packages/overpass 100 / 95.65 / 100 / 100 packages/units 100 / 100 / 100 / 100 Epsilon 0.5 absorbs v8 jitter. - scripts/lint/coverage-ratchet.ts : the gate. Exit 1 on any regression, missing summary, or malformed summary. Exit 0 on equality / improvement. - scripts/lint/coverage-baseline-update.ts : CI-only on `main` — bumps baselines upward after a green run. Never lowers. - scripts/lint/__tests__/coverage-ratchet.test.ts : 13 tests covering the compare logic, missing/invalid summaries, epsilon noise tolerance, multi-metric regressions, and loadBaseline parsing. New package.json scripts: - bun check:coverage : run the ratchet against current summaries - bun check:coverage:update : bump baselines (used by the consolidated coverage CI workflow once it lands) NOT wired into scripts/check-all.ts in this commit — running the ratchet requires per-workspace coverage data to exist on disk first. The consolidated coverage CI workflow (follow-up plan) runs all matrix coverage jobs before invoking the ratchet; locally, contributors run `bun check:coverage` explicitly after a coverage suite. --- coverage-baselines.json | 58 +++++ package.json | 2 + .../lint/__tests__/coverage-ratchet.test.ts | 223 +++++++++++++++++ scripts/lint/coverage-baseline-update.ts | 99 ++++++++ scripts/lint/coverage-ratchet.ts | 232 ++++++++++++++++++ 5 files changed, 614 insertions(+) create mode 100644 coverage-baselines.json create mode 100644 scripts/lint/__tests__/coverage-ratchet.test.ts create mode 100644 scripts/lint/coverage-baseline-update.ts create mode 100644 scripts/lint/coverage-ratchet.ts diff --git a/coverage-baselines.json b/coverage-baselines.json new file mode 100644 index 0000000000..4e1fcdc037 --- /dev/null +++ b/coverage-baselines.json @@ -0,0 +1,58 @@ +{ + "_comment": "Coverage ratchet baselines — see docs/testing.md and scripts/lint/coverage-ratchet.ts. Each entry is a workspace's floor: a PR that drops any metric below the baseline (modulo epsilon) fails CI. CI on main auto-bumps these numbers upward via scripts/lint/coverage-baseline-update.ts after a green run.", + "_epsilon": 0.5, + "packages/api": { + "summaryPath": "packages/api/coverage/unit/coverage-summary.json", + "tier": "B", + "lines": 98.31, + "branches": 95.43, + "functions": 100, + "statements": 98.31, + "recordedAt": "2026-05-19" + }, + "apps/expo": { + "summaryPath": "apps/expo/coverage/unit/coverage-summary.json", + "tier": "C", + "lines": 97.36, + "branches": 95, + "functions": 100, + "statements": 97.36, + "recordedAt": "2026-05-19" + }, + "packages/mcp": { + "summaryPath": "packages/mcp/coverage/coverage-summary.json", + "tier": "B", + "lines": 98.87, + "branches": 98.38, + "functions": 100, + "statements": 98.87, + "recordedAt": "2026-05-19" + }, + "packages/analytics": { + "summaryPath": "packages/analytics/coverage/coverage-summary.json", + "tier": "B", + "lines": 84.48, + "branches": 83.33, + "functions": 89.13, + "statements": 84.48, + "recordedAt": "2026-05-19" + }, + "packages/overpass": { + "summaryPath": "packages/overpass/coverage/coverage-summary.json", + "tier": "A", + "lines": 100, + "branches": 95.65, + "functions": 100, + "statements": 100, + "recordedAt": "2026-05-19" + }, + "packages/units": { + "summaryPath": "packages/units/coverage/coverage-summary.json", + "tier": "A", + "lines": 100, + "branches": 100, + "functions": 100, + "statements": 100, + "recordedAt": "2026-05-19" + } +} diff --git a/package.json b/package.json index 63187e5b92..fa6ec0d984 100644 --- a/package.json +++ b/package.json @@ -16,6 +16,8 @@ "check:casts:strict": "bun run --cwd packages/checks check:casts:strict", "check:catalog": "bun scripts/lint/no-duplicate-deps.ts", "check:circular": "bun scripts/lint/no-circular-deps.ts", + "check:coverage": "bun run scripts/lint/coverage-ratchet.ts", + "check:coverage:update": "bun run scripts/lint/coverage-baseline-update.ts", "check:deps": "manypkg check", "check:magic-strings": "bun run --cwd packages/checks check:magic-strings", "check:package-json": "bun scripts/format/sort-package-json.ts --check", diff --git a/scripts/lint/__tests__/coverage-ratchet.test.ts b/scripts/lint/__tests__/coverage-ratchet.test.ts new file mode 100644 index 0000000000..f07b0aa326 --- /dev/null +++ b/scripts/lint/__tests__/coverage-ratchet.test.ts @@ -0,0 +1,223 @@ +import { describe, expect, it } from 'vitest'; +import { + type CoverageSummary, + compareWorkspace, + loadBaseline, + runRatchet, + type WorkspaceBaseline, +} from '../coverage-ratchet'; + +function makeBaseline(overrides: Partial = {}): WorkspaceBaseline { + return { + summaryPath: 'pkg/coverage/coverage-summary.json', + tier: 'A', + lines: 80, + branches: 70, + functions: 90, + statements: 80, + recordedAt: '2026-05-17', + ...overrides, + }; +} + +function makeSummary(pct: number): CoverageSummary { + return { + total: { + lines: { pct }, + branches: { pct }, + functions: { pct }, + statements: { pct }, + }, + }; +} + +function makeMixedSummary( + lines: number, + branches: number, + functions: number, + statements: number, +): CoverageSummary { + return { + total: { + lines: { pct: lines }, + branches: { pct: branches }, + functions: { pct: functions }, + statements: { pct: statements }, + }, + }; +} + +describe('compareWorkspace', () => { + it('passes when every metric matches or exceeds the baseline', () => { + const result = compareWorkspace('pkg', makeBaseline(), makeMixedSummary(85, 75, 95, 85), 0.5); + expect(result.status).toBe('improvement'); + expect(result.regressions).toBeUndefined(); + }); + + it('passes (status=ok) when every metric is exactly the baseline', () => { + const result = compareWorkspace('pkg', makeBaseline(), makeMixedSummary(80, 70, 90, 80), 0.5); + expect(result.status).toBe('ok'); + }); + + it('flags regression when one metric drops more than epsilon', () => { + const result = compareWorkspace( + 'pkg', + makeBaseline({ branches: 70 }), + makeMixedSummary(80, 65, 90, 80), + 0.5, + ); + expect(result.status).toBe('regression'); + expect(result.regressions).toEqual([{ metric: 'branches', before: 70, after: 65 }]); + }); + + it('tolerates noise below epsilon (default 0.5)', () => { + // baseline 80.0 vs current 79.7 — within epsilon, not a regression. + const result = compareWorkspace('pkg', makeBaseline(), makeMixedSummary(79.7, 70, 90, 80), 0.5); + expect(result.status).toBe('ok'); + }); + + it('rejects drops just above epsilon', () => { + // baseline 80.0 vs current 79.4 — drop of 0.6 > epsilon 0.5. + const result = compareWorkspace('pkg', makeBaseline(), makeMixedSummary(79.4, 70, 90, 80), 0.5); + expect(result.status).toBe('regression'); + expect(result.regressions?.[0]?.metric).toBe('lines'); + }); + + it('reports multiple regressions in one workspace', () => { + const result = compareWorkspace('pkg', makeBaseline(), makeMixedSummary(60, 50, 70, 60), 0.5); + expect(result.status).toBe('regression'); + expect(result.regressions).toHaveLength(4); + }); +}); + +describe('runRatchet', () => { + it('passes when every workspace meets its baseline', () => { + const baseline = { + 'packages/a': makeBaseline({ + summaryPath: 'a/coverage-summary.json', + lines: 80, + branches: 70, + functions: 90, + statements: 80, + }), + 'packages/b': makeBaseline({ + summaryPath: 'b/coverage-summary.json', + lines: 60, + branches: 50, + functions: 70, + statements: 60, + }), + }; + const summaries: Record = { + 'a/coverage-summary.json': makeMixedSummary(85, 75, 95, 85), // beats packages/a + 'b/coverage-summary.json': makeMixedSummary(70, 60, 80, 70), // beats packages/b + }; + const report = runRatchet(baseline, 0.5, (path) => summaries[path] ?? null); + expect(report.passed).toBe(true); + }); + + it('fails when any workspace regresses', () => { + const baseline = { + 'packages/a': makeBaseline({ + summaryPath: 'a/coverage-summary.json', + lines: 80, + branches: 70, + functions: 90, + statements: 80, + }), + 'packages/b': makeBaseline({ + summaryPath: 'b/coverage-summary.json', + lines: 60, + branches: 50, + functions: 70, + statements: 60, + }), + }; + const summaries: Record = { + 'a/coverage-summary.json': makeSummary(85), + 'b/coverage-summary.json': makeSummary(40), + }; + const report = runRatchet(baseline, 0.5, (path) => summaries[path] ?? null); + expect(report.passed).toBe(false); + const failed = report.checks.find((c) => c.workspace === 'packages/b'); + expect(failed?.status).toBe('regression'); + }); + + it('fails when a workspace has no coverage summary on disk', () => { + const baseline = { + 'packages/a': makeBaseline({ + summaryPath: 'missing/coverage-summary.json', + }), + }; + const report = runRatchet(baseline, 0.5, () => null); + expect(report.passed).toBe(false); + expect(report.checks[0]?.status).toBe('missing-summary'); + }); + + it('fails when the summary file is missing required total metrics', () => { + const baseline = { + 'packages/a': makeBaseline({ summaryPath: 'a/coverage-summary.json' }), + }; + const malformed = { total: { lines: { pct: 80 } } } as unknown as CoverageSummary; + const report = runRatchet(baseline, 0.5, (path) => + path === 'a/coverage-summary.json' ? malformed : null, + ); + expect(report.passed).toBe(false); + expect(report.checks[0]?.status).toBe('invalid-summary'); + }); +}); + +describe('loadBaseline', () => { + it('parses workspace entries and ignores comment keys', () => { + const json = JSON.stringify({ + _comment: 'ignored', + _epsilon: 0.3, + 'packages/a': { + summaryPath: 'a/x.json', + tier: 'A', + lines: 80, + branches: 70, + functions: 90, + statements: 80, + recordedAt: '2026-05-17', + }, + }); + const { baseline, epsilon } = loadBaseline(json); + expect(epsilon).toBe(0.3); + expect(Object.keys(baseline)).toEqual(['packages/a']); + expect(baseline['packages/a']?.lines).toBe(80); + }); + + it('falls back to default epsilon when not specified', () => { + const json = JSON.stringify({ + 'packages/a': { + summaryPath: 'a/x.json', + tier: 'A', + lines: 80, + branches: 70, + functions: 90, + statements: 80, + recordedAt: '2026-05-17', + }, + }); + const { epsilon } = loadBaseline(json); + expect(epsilon).toBe(0.05); + }); + + it('skips entries that look malformed', () => { + const json = JSON.stringify({ + 'packages/a': { summaryPath: 'a/x.json' }, // missing metric fields + 'packages/b': { + summaryPath: 'b/x.json', + tier: 'A', + lines: 80, + branches: 70, + functions: 90, + statements: 80, + recordedAt: '2026-05-17', + }, + }); + const { baseline } = loadBaseline(json); + expect(Object.keys(baseline)).toEqual(['packages/b']); + }); +}); diff --git a/scripts/lint/coverage-baseline-update.ts b/scripts/lint/coverage-baseline-update.ts new file mode 100644 index 0000000000..56c52744e1 --- /dev/null +++ b/scripts/lint/coverage-baseline-update.ts @@ -0,0 +1,99 @@ +#!/usr/bin/env bun +// +// coverage-baseline-update.ts — bumps coverage-baselines.json upward. +// +// For every workspace in `coverage-baselines.json`, read its current +// `coverage-summary.json` and update the baseline metric if (and only if) +// the current value is higher. Never lowers a baseline — that's what the +// ratchet is for. +// +// Designed to run on `main` post-merge from CI via: +// `bun scripts/lint/coverage-baseline-update.ts` +// followed by an auto-commit of `coverage-baselines.json`. Do not invoke +// this from PR workflows — it would silently move the floor up before the +// PR's coverage drops below it. +// +// Exit code: +// 0 — file updated (or no changes needed) +// 1 — fatal error (missing baseline file, malformed summaries) +// +// Honours the same `_epsilon` value the ratchet uses — improvements smaller +// than epsilon are ignored so we don't churn the baseline on v8 jitter. + +import { existsSync, readFileSync, writeFileSync } from 'node:fs'; +import { dirname, join } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { + type BaselineFile, + type CoverageSummary, + loadBaseline, + METRICS, + type Metric, + type WorkspaceBaseline, +} from './coverage-ratchet'; + +interface Bump { + workspace: string; + metric: Metric; + before: number; + after: number; +} + +if (import.meta.main) { + const HERE = dirname(fileURLToPath(import.meta.url)); + const ROOT = join(HERE, '..', '..'); + const BASELINE_PATH = join(ROOT, 'coverage-baselines.json'); + + if (!existsSync(BASELINE_PATH)) { + console.error(`coverage-baselines.json not found at ${BASELINE_PATH}`); + process.exit(1); + } + const raw = readFileSync(BASELINE_PATH, 'utf-8'); + const parsed = JSON.parse(raw) as BaselineFile; + const { baseline, epsilon } = loadBaseline(raw); + + const today = new Date().toISOString().slice(0, 10); + const bumps: Bump[] = []; + + for (const [workspace, entry] of Object.entries(baseline)) { + const abs = join(ROOT, entry.summaryPath); + if (!existsSync(abs)) { + console.warn(`Skipping ${workspace} — no summary at ${entry.summaryPath}`); + continue; + } + let summary: CoverageSummary; + try { + summary = JSON.parse(readFileSync(abs, 'utf-8')) as CoverageSummary; + } catch (err) { + console.warn(`Skipping ${workspace} — malformed summary: ${(err as Error).message}`); + continue; + } + const next: WorkspaceBaseline = { ...entry }; + let changed = false; + for (const metric of METRICS) { + const before = entry[metric]; + const after = summary.total[metric].pct; + if (after - before > epsilon) { + next[metric] = after; + changed = true; + bumps.push({ workspace, metric, before, after }); + } + } + if (changed) { + next.recordedAt = today; + parsed[workspace] = next; + } + } + + if (bumps.length === 0) { + console.log('Coverage baselines: no improvements above epsilon. File unchanged.'); + process.exit(0); + } + + writeFileSync(BASELINE_PATH, `${JSON.stringify(parsed, null, 2)}\n`, 'utf-8'); + console.log(`Coverage baselines: bumped ${bumps.length} metric(s).`); + for (const b of bumps) { + console.log(` ${b.workspace} ${b.metric}: ${b.before.toFixed(2)}% → ${b.after.toFixed(2)}%`); + } + process.exit(0); +} diff --git a/scripts/lint/coverage-ratchet.ts b/scripts/lint/coverage-ratchet.ts new file mode 100644 index 0000000000..12ea09f8ff --- /dev/null +++ b/scripts/lint/coverage-ratchet.ts @@ -0,0 +1,232 @@ +#!/usr/bin/env bun +// +// coverage-ratchet.ts — enforces per-workspace coverage baselines. +// +// Reads `coverage-baselines.json` at the repo root, then for each workspace +// entry reads its coverage summary (vitest's `coverage-summary.json`, +// emitted by the `json-summary` reporter) and compares each metric (lines / +// branches / functions / statements) to the baseline. +// +// A regression on any metric fails the run. A workspace that's in the +// baseline but missing a coverage summary also fails — silent skipping is +// exactly the mode the ratchet exists to prevent. +// +// Exit code: +// 0 — every baseline metric met or exceeded +// 1 — at least one regression (or missing summary) +// +// Coverage *improvements* are reported but never required to update the +// baseline locally; the `coverage-baseline-update.ts` script handles that +// on the main branch via CI. + +import { existsSync, readFileSync } from 'node:fs'; +import { dirname, join } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +export const METRICS = ['lines', 'branches', 'functions', 'statements'] as const; +export type Metric = (typeof METRICS)[number]; + +export interface WorkspaceBaseline { + summaryPath: string; + tier: 'A' | 'B' | 'C'; + lines: number; + branches: number; + functions: number; + statements: number; + recordedAt: string; +} + +export interface BaselineFile { + _comment?: string; + _epsilon?: number; + [workspace: string]: WorkspaceBaseline | string | number | undefined; +} + +export interface CoverageSummary { + total: { + lines: { pct: number }; + branches: { pct: number }; + functions: { pct: number }; + statements: { pct: number }; + }; +} + +export interface RatchetCheck { + workspace: string; + status: 'ok' | 'regression' | 'improvement' | 'missing-summary' | 'invalid-summary'; + before?: Record; + after?: Record; + regressions?: Array<{ metric: Metric; before: number; after: number }>; + message?: string; +} + +export interface RatchetReport { + checks: RatchetCheck[]; + passed: boolean; +} + +const DEFAULT_EPSILON = 0.05; + +function isBaselineEntry(v: unknown): v is WorkspaceBaseline { + if (v === null || typeof v !== 'object') return false; + const e = v as Record; + return ( + typeof e.summaryPath === 'string' && + typeof e.lines === 'number' && + typeof e.branches === 'number' && + typeof e.functions === 'number' && + typeof e.statements === 'number' + ); +} + +export function loadBaseline(baselineJson: string): { + baseline: Record; + epsilon: number; +} { + const parsed = JSON.parse(baselineJson) as BaselineFile; + const epsilon = typeof parsed._epsilon === 'number' ? parsed._epsilon : DEFAULT_EPSILON; + const baseline: Record = {}; + for (const [key, value] of Object.entries(parsed)) { + if (key.startsWith('_')) continue; + if (isBaselineEntry(value)) baseline[key] = value; + } + return { baseline, epsilon }; +} + +export function compareWorkspace( + workspace: string, + baseline: WorkspaceBaseline, + summary: CoverageSummary, + epsilon: number, +): RatchetCheck { + const before: Record = { + lines: baseline.lines, + branches: baseline.branches, + functions: baseline.functions, + statements: baseline.statements, + }; + const after: Record = { + lines: summary.total.lines.pct, + branches: summary.total.branches.pct, + functions: summary.total.functions.pct, + statements: summary.total.statements.pct, + }; + const regressions: Array<{ metric: Metric; before: number; after: number }> = []; + let improved = false; + for (const metric of METRICS) { + const drop = before[metric] - after[metric]; + if (drop > epsilon) { + regressions.push({ metric, before: before[metric], after: after[metric] }); + } else if (after[metric] - before[metric] > epsilon) { + improved = true; + } + } + if (regressions.length > 0) { + return { workspace, status: 'regression', before, after, regressions }; + } + if (improved) { + return { workspace, status: 'improvement', before, after }; + } + return { workspace, status: 'ok', before, after }; +} + +export function runRatchet( + baseline: Record, + epsilon: number, + readSummary: (path: string) => CoverageSummary | null, +): RatchetReport { + const checks: RatchetCheck[] = []; + for (const [workspace, entry] of Object.entries(baseline)) { + const summary = readSummary(entry.summaryPath); + if (summary === null) { + checks.push({ + workspace, + status: 'missing-summary', + message: `no coverage summary at ${entry.summaryPath} — run the workspace's coverage script before the ratchet`, + }); + continue; + } + if ( + typeof summary?.total?.lines?.pct !== 'number' || + typeof summary?.total?.branches?.pct !== 'number' || + typeof summary?.total?.functions?.pct !== 'number' || + typeof summary?.total?.statements?.pct !== 'number' + ) { + checks.push({ + workspace, + status: 'invalid-summary', + message: `coverage summary at ${entry.summaryPath} is missing required total metrics`, + }); + continue; + } + checks.push(compareWorkspace(workspace, entry, summary, epsilon)); + } + const passed = checks.every((c) => c.status === 'ok' || c.status === 'improvement'); + return { checks, passed }; +} + +function fmtPct(n: number): string { + return `${n.toFixed(2)}%`; +} + +function renderReport(report: RatchetReport): string { + const lines: string[] = []; + const DIVIDER = '─'.repeat(60); + lines.push('\nCoverage Ratchet'); + lines.push(DIVIDER); + for (const c of report.checks) { + if (c.status === 'ok') { + lines.push(`✅ ${c.workspace} — baseline met`); + } else if (c.status === 'improvement') { + lines.push(`📈 ${c.workspace} — coverage improved (CI on main will bump baseline)`); + if (c.before && c.after) { + for (const m of METRICS) { + if (c.after[m] - c.before[m] > 0.05) { + lines.push(` ${m}: ${fmtPct(c.before[m])} → ${fmtPct(c.after[m])}`); + } + } + } + } else if (c.status === 'regression' && c.regressions) { + lines.push(`❌ ${c.workspace} — REGRESSION:`); + for (const r of c.regressions) { + lines.push(` ${r.metric}: ${fmtPct(r.before)} → ${fmtPct(r.after)}`); + } + } else if (c.status === 'missing-summary' || c.status === 'invalid-summary') { + lines.push(`❌ ${c.workspace} — ${c.message}`); + } + } + lines.push(DIVIDER); + if (report.passed) { + lines.push('All workspaces ≥ baseline.'); + } else { + lines.push('One or more workspaces regressed. Run the workspace coverage'); + lines.push('script locally to reproduce, add tests for the affected files,'); + lines.push('and commit until the ratchet passes. See docs/testing.md.'); + } + return lines.join('\n'); +} + +if (import.meta.main) { + const HERE = dirname(fileURLToPath(import.meta.url)); + const ROOT = join(HERE, '..', '..'); + const BASELINE_PATH = join(ROOT, 'coverage-baselines.json'); + + if (!existsSync(BASELINE_PATH)) { + console.error(`coverage-baselines.json not found at ${BASELINE_PATH}`); + process.exit(1); + } + const { baseline, epsilon } = loadBaseline(readFileSync(BASELINE_PATH, 'utf-8')); + + const report = runRatchet(baseline, epsilon, (relPath) => { + const abs = join(ROOT, relPath); + if (!existsSync(abs)) return null; + try { + return JSON.parse(readFileSync(abs, 'utf-8')) as CoverageSummary; + } catch { + return null; + } + }); + + console.log(renderReport(report)); + process.exit(report.passed ? 0 : 1); +} From a72d6d409c0c77c1f0ce3db7bb381f627b73fcfc Mon Sep 17 00:00:00 2001 From: Andrew Bierman Date: Tue, 19 May 2026 21:57:29 -0600 Subject: [PATCH 05/85] chore(package-json): re-sort scripts after Phase 1 additions bun format:package-json moves check:coverage / check:coverage:update and lint:weak-assertions / test:scripts into alphabetical position within the scripts block. No behaviour change. --- package.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index fa6ec0d984..5efb6cfca9 100644 --- a/package.json +++ b/package.json @@ -37,9 +37,8 @@ "lefthook": "lefthook install", "lint": "biome check --write", "lint:custom": "bun run scripts/lint/no-raw-typeof.ts && bun run scripts/lint/no-raw-regex.ts && bun run packages/env/scripts/no-raw-process-env.ts && bun run scripts/lint/no-duplicate-guards.ts && bun run scripts/lint/no-unauth-routes.ts && bun run scripts/lint/check-drizzle-migrations.ts", - "lint:weak-assertions": "bun run scripts/lint/no-weak-assertions.ts", - "test:scripts": "vitest run --config scripts/vitest.config.ts", "lint:strict": "biome check && bun run lint:custom", + "lint:weak-assertions": "bun run scripts/lint/no-weak-assertions.ts", "lint-unsafe": "biome check --write --unsafe", "mcp": "bun run --cwd packages/mcp dev", "mcp:deploy": "bun run --cwd packages/mcp deploy", @@ -51,6 +50,7 @@ "test:guides": "vitest run --config apps/guides/vitest.config.ts", "test:landing": "vitest run --config apps/landing/vitest.config.ts", "test:mcp": "bun run --cwd packages/mcp test", + "test:scripts": "vitest run --config scripts/vitest.config.ts", "trails": "bun run --cwd apps/trails dev", "web": "bun run --cwd apps/web dev" }, From d76bd361062506661ec52fa0fe4a60c6969985e4 Mon Sep 17 00:00:00 2001 From: Andrew Bierman Date: Tue, 19 May 2026 22:01:53 -0600 Subject: [PATCH 06/85] ci(coverage): add matrix-driven coverage workflow with ratchet gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces unit-tests.yml with a single Coverage workflow that runs coverage for every tracked workspace in a matrix, posts per-workspace PR comments via davelosert/vitest-coverage-report-action, aggregates the summaries, and runs `bun check:coverage` as a hard gate against regression. Workflow shape: coverage (matrix) → run vitest --coverage per workspace │ upload coverage-summary.json artifact ▼ post PR comment with delta ratchet → download all summaries run `bun check:coverage` against coverage-baselines.json fails on any regression / missing summary │ ▼ (main only) bump-baseline → re-run check:coverage:update auto-commit any baseline improvements Plus a scripts-tests job that runs `bun test:scripts` (16 lint analyzer tests + 13 ratchet tests) on every PR. Tracked workspaces in the matrix: packages/api (unit), apps/expo, packages/mcp, packages/analytics, packages/overpass, packages/units. fail-fast: false so one workspace's regression doesn't mask another's. Path filters are loose by design — any change under apps/** or packages/** triggers the matrix. The narrow path filters in the previous unit-tests.yml were how 12 workspaces ended up untracked in CI. The matrix's own job-level conditions skip work cheaply when nothing to run. Permissions: contents:write for the main-only baseline auto-commit; pull-requests:write for the coverage report comments. The PR-time coverage step explicitly guards on \`github.event_name == 'pull_request'\` so push events don't try to comment on nothing. Deletes unit-tests.yml — fully subsumed by the new workflow. Coordinate the branch-protection required-check rename from "Unit Tests" → "Coverage / Coverage Ratchet" before merging. --- .github/workflows/coverage.yml | 264 +++++++++++++++++++++++++++++++ .github/workflows/unit-tests.yml | 99 ------------ 2 files changed, 264 insertions(+), 99 deletions(-) create mode 100644 .github/workflows/coverage.yml delete mode 100644 .github/workflows/unit-tests.yml diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml new file mode 100644 index 0000000000..ac20064a3a --- /dev/null +++ b/.github/workflows/coverage.yml @@ -0,0 +1,264 @@ +name: Coverage + +on: + push: + branches: ["main", "development"] + paths: + - "package.json" + - "bun.lock" + - "apps/**" + - "packages/**" + - "scripts/lint/coverage-ratchet.ts" + - "scripts/lint/coverage-baseline-update.ts" + - "scripts/lint/no-weak-assertions.ts" + - "scripts/vitest.config.ts" + - "coverage-baselines.json" + - ".github/workflows/coverage.yml" + pull_request: + branches: ["**"] + paths: + - "package.json" + - "bun.lock" + - "apps/**" + - "packages/**" + - "scripts/lint/coverage-ratchet.ts" + - "scripts/lint/coverage-baseline-update.ts" + - "scripts/lint/no-weak-assertions.ts" + - "scripts/vitest.config.ts" + - "coverage-baselines.json" + - ".github/workflows/coverage.yml" + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: write # for baseline auto-commit on main + pull-requests: write # for vitest-coverage-report-action comments + +jobs: + # One coverage run per tracked workspace. Uploads the coverage-summary.json + # as an artifact for the ratchet job to aggregate. + coverage: + name: Coverage (${{ matrix.name }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - name: packages/api + artifact_slug: packages-api + test_command: bun run --cwd packages/api test:unit:coverage + summary_path: packages/api/coverage/unit/coverage-summary.json + final_path: packages/api/coverage/unit/coverage-final.json + vite_config_path: ./vitest.unit.config.ts + working_directory: ./packages/api + - name: apps/expo + artifact_slug: apps-expo + test_command: bun run --cwd apps/expo test:coverage + summary_path: apps/expo/coverage/unit/coverage-summary.json + final_path: apps/expo/coverage/unit/coverage-final.json + vite_config_path: ./vitest.config.ts + working_directory: ./apps/expo + - name: packages/mcp + artifact_slug: packages-mcp + test_command: bun run --cwd packages/mcp test --coverage + summary_path: packages/mcp/coverage/coverage-summary.json + final_path: packages/mcp/coverage/coverage-final.json + vite_config_path: ./vitest.config.ts + working_directory: ./packages/mcp + - name: packages/analytics + artifact_slug: packages-analytics + test_command: bun run --cwd packages/analytics test --coverage + summary_path: packages/analytics/coverage/coverage-summary.json + final_path: packages/analytics/coverage/coverage-final.json + vite_config_path: ./vitest.config.ts + working_directory: ./packages/analytics + - name: packages/overpass + artifact_slug: packages-overpass + test_command: bun run --cwd packages/overpass test --coverage + summary_path: packages/overpass/coverage/coverage-summary.json + final_path: packages/overpass/coverage/coverage-final.json + vite_config_path: ./vitest.config.ts + working_directory: ./packages/overpass + - name: packages/units + artifact_slug: packages-units + test_command: bun run --cwd packages/units test --coverage + summary_path: packages/units/coverage/coverage-summary.json + final_path: packages/units/coverage/coverage-final.json + vite_config_path: ./vitest.config.ts + working_directory: ./packages/units + steps: + - uses: actions/checkout@v6 + + - uses: oven-sh/setup-bun@v2 + with: + bun-version: latest + + - name: Install dependencies + env: + PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN: ${{ secrets.PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN }} + run: bun install --frozen-lockfile + + - name: Run coverage for ${{ matrix.name }} + run: ${{ matrix.test_command }} + + - name: Report coverage on PR + if: always() && github.event_name == 'pull_request' + uses: davelosert/vitest-coverage-report-action@v2 + with: + name: ${{ matrix.name }} + json-summary-path: ./${{ matrix.summary_path }} + json-final-path: ./${{ matrix.final_path }} + vite-config-path: ${{ matrix.vite_config_path }} + working-directory: ${{ matrix.working_directory }} + + - name: Upload coverage summary artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: coverage-summary-${{ matrix.artifact_slug }} + path: ${{ matrix.summary_path }} + if-no-files-found: error + retention-days: 7 + + # Aggregate every workspace's coverage-summary.json and run the ratchet. + # Fails the workflow if any workspace dropped below its baseline. + ratchet: + name: Coverage Ratchet + runs-on: ubuntu-latest + needs: coverage + if: always() + steps: + - uses: actions/checkout@v6 + + - uses: oven-sh/setup-bun@v2 + with: + bun-version: latest + + - name: Install dependencies + env: + PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN: ${{ secrets.PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN }} + run: bun install --frozen-lockfile + + - name: Download all coverage summaries + uses: actions/download-artifact@v4 + with: + pattern: coverage-summary-* + path: artifacts + + - name: Restore summaries to their workspace paths + run: | + set -euo pipefail + # Each artifact arrives as artifacts/coverage-summary-/ + # actions/download-artifact@v4 unzips a single-file artifact into a directory + # named after the artifact, preserving the source file's basename. + # Copy each back to its expected location so coverage-baselines.json's + # summaryPath entries resolve. + declare -A targets=( + [packages-api]=packages/api/coverage/unit/coverage-summary.json + [apps-expo]=apps/expo/coverage/unit/coverage-summary.json + [packages-mcp]=packages/mcp/coverage/coverage-summary.json + [packages-analytics]=packages/analytics/coverage/coverage-summary.json + [packages-overpass]=packages/overpass/coverage/coverage-summary.json + [packages-units]=packages/units/coverage/coverage-summary.json + ) + for slug in "${!targets[@]}"; do + target="${targets[$slug]}" + src_dir="artifacts/coverage-summary-${slug}" + if [ ! -d "$src_dir" ]; then + echo "::warning::missing artifact for $slug — coverage job may have failed" + continue + fi + mkdir -p "$(dirname "$target")" + # Find the single JSON file inside (path may be flat or preserved). + src_file=$(find "$src_dir" -name 'coverage-summary.json' | head -n1) + if [ -z "$src_file" ]; then + echo "::warning::no coverage-summary.json inside artifacts/coverage-summary-${slug}" + continue + fi + cp "$src_file" "$target" + echo "restored $slug → $target" + done + + - name: Run coverage ratchet + run: bun check:coverage + + # On a green push to main, auto-bump coverage-baselines.json upward. + # Never runs on PRs — PRs cannot edit the baseline file silently. + bump-baseline: + name: Bump Coverage Baselines + runs-on: ubuntu-latest + needs: ratchet + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + steps: + - uses: actions/checkout@v6 + with: + # Need full token to push the auto-commit back to main. + token: ${{ secrets.GITHUB_TOKEN }} + + - uses: oven-sh/setup-bun@v2 + with: + bun-version: latest + + - name: Install dependencies + env: + PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN: ${{ secrets.PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN }} + run: bun install --frozen-lockfile + + - name: Download all coverage summaries + uses: actions/download-artifact@v4 + with: + pattern: coverage-summary-* + path: artifacts + + - name: Restore summaries to their workspace paths + run: | + set -euo pipefail + declare -A targets=( + [packages-api]=packages/api/coverage/unit/coverage-summary.json + [apps-expo]=apps/expo/coverage/unit/coverage-summary.json + [packages-mcp]=packages/mcp/coverage/coverage-summary.json + [packages-analytics]=packages/analytics/coverage/coverage-summary.json + [packages-overpass]=packages/overpass/coverage/coverage-summary.json + [packages-units]=packages/units/coverage/coverage-summary.json + ) + for slug in "${!targets[@]}"; do + target="${targets[$slug]}" + src_dir="artifacts/coverage-summary-${slug}" + if [ ! -d "$src_dir" ]; then + continue + fi + mkdir -p "$(dirname "$target")" + src_file=$(find "$src_dir" -name 'coverage-summary.json' | head -n1) + if [ -n "$src_file" ]; then + cp "$src_file" "$target" + fi + done + + - name: Compute baseline updates + run: bun check:coverage:update + + - name: Commit baseline updates + uses: stefanzweifel/git-auto-commit-action@v6 + with: + commit_message: "chore(coverage): bump baselines after green main" + file_pattern: coverage-baselines.json + + # The scripts test suite — verifies the ratchet and assertion-lint analyzers + # themselves on every PR that touches them or their tests. + scripts-tests: + name: Scripts Tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: oven-sh/setup-bun@v2 + with: + bun-version: latest + - name: Install dependencies + env: + PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN: ${{ secrets.PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN }} + run: bun install --frozen-lockfile + - name: Run scripts test suite + run: bun test:scripts diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml deleted file mode 100644 index 3faee06299..0000000000 --- a/.github/workflows/unit-tests.yml +++ /dev/null @@ -1,99 +0,0 @@ -name: Unit Tests - -on: - push: - branches: ["main", "development"] - paths: - - "package.json" - - "bun.lock" - - "packages/api/src/**" - - "packages/api/package.json" - - "packages/api/vitest.unit.config.ts" - - "apps/expo/package.json" - - "apps/expo/vitest.config.ts" - - "apps/expo/utils/**" - - "apps/expo/lib/utils/**" - - "apps/expo/features/**/utils/**" - - ".github/workflows/unit-tests.yml" - pull_request: - branches: ["**"] - paths: - - "package.json" - - "bun.lock" - - "packages/api/src/**" - - "packages/api/package.json" - - "packages/api/vitest.unit.config.ts" - - "apps/expo/package.json" - - "apps/expo/vitest.config.ts" - - "apps/expo/utils/**" - - "apps/expo/lib/utils/**" - - "apps/expo/features/**/utils/**" - - ".github/workflows/unit-tests.yml" - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - pull-requests: write - -jobs: - api-unit-tests: - name: API Unit Tests - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v6 - - - uses: oven-sh/setup-bun@v2 - with: - bun-version: latest - - - name: Install dependencies - env: - PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN: ${{ secrets.PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN }} - run: bun install --frozen-lockfile - - - name: Run API unit tests - run: bun run --cwd packages/api test:unit:coverage - - - name: Report API coverage - if: always() - uses: davelosert/vitest-coverage-report-action@v2 - with: - name: API Unit Tests Coverage - json-summary-path: ./coverage/unit/coverage-summary.json - json-final-path: ./coverage/unit/coverage-final.json - vite-config-path: ./vitest.unit.config.ts - working-directory: ./packages/api - - expo-unit-tests: - name: Expo Unit Tests - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v6 - - - uses: oven-sh/setup-bun@v2 - with: - bun-version: latest - - - name: Install dependencies - env: - PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN: ${{ secrets.PACKRAT_NATIVEWIND_UI_GITHUB_TOKEN }} - run: bun install --frozen-lockfile - - - name: Run Expo unit tests - run: bun run --cwd apps/expo test:coverage - - - name: Report Expo coverage - if: always() - uses: davelosert/vitest-coverage-report-action@v2 - with: - name: Expo Unit Tests Coverage - json-summary-path: ./coverage/unit/coverage-summary.json - json-final-path: ./coverage/unit/coverage-final.json - vite-config-path: ./vitest.config.ts - working-directory: ./apps/expo From 491ebb2b1898c9ca812e13810f8489ee2dee15de Mon Sep 17 00:00:00 2001 From: Andrew Bierman Date: Tue, 19 May 2026 22:04:32 -0600 Subject: [PATCH 07/85] fix(ci): coverage report action path doubling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vitest-coverage-report-action step joins working-directory with json-summary-path internally, so passing a repo-relative summary path produced './apps/expo/apps/expo/coverage/unit/coverage-summary.json' and failed every matrix job with ENOENT. The test commands themselves ran successfully and the artifacts uploaded — only the PR comment step errored. Split the matrix into two path sets: - summary_path / final_path : repo-relative (artifact upload + ratchet) - summary_relative / final_relative : working-directory-relative (action) --- .github/workflows/coverage.yml | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index ac20064a3a..989fa81e72 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -47,11 +47,17 @@ jobs: fail-fast: false matrix: include: + # `summary_path` / `final_path` are repo-relative (used by artifact + # upload + ratchet restore). `summary_relative` / `final_relative` + # are relative to `working_directory` (used by the coverage report + # action, which joins them with working_directory internally). - name: packages/api artifact_slug: packages-api test_command: bun run --cwd packages/api test:unit:coverage summary_path: packages/api/coverage/unit/coverage-summary.json final_path: packages/api/coverage/unit/coverage-final.json + summary_relative: ./coverage/unit/coverage-summary.json + final_relative: ./coverage/unit/coverage-final.json vite_config_path: ./vitest.unit.config.ts working_directory: ./packages/api - name: apps/expo @@ -59,6 +65,8 @@ jobs: test_command: bun run --cwd apps/expo test:coverage summary_path: apps/expo/coverage/unit/coverage-summary.json final_path: apps/expo/coverage/unit/coverage-final.json + summary_relative: ./coverage/unit/coverage-summary.json + final_relative: ./coverage/unit/coverage-final.json vite_config_path: ./vitest.config.ts working_directory: ./apps/expo - name: packages/mcp @@ -66,6 +74,8 @@ jobs: test_command: bun run --cwd packages/mcp test --coverage summary_path: packages/mcp/coverage/coverage-summary.json final_path: packages/mcp/coverage/coverage-final.json + summary_relative: ./coverage/coverage-summary.json + final_relative: ./coverage/coverage-final.json vite_config_path: ./vitest.config.ts working_directory: ./packages/mcp - name: packages/analytics @@ -73,6 +83,8 @@ jobs: test_command: bun run --cwd packages/analytics test --coverage summary_path: packages/analytics/coverage/coverage-summary.json final_path: packages/analytics/coverage/coverage-final.json + summary_relative: ./coverage/coverage-summary.json + final_relative: ./coverage/coverage-final.json vite_config_path: ./vitest.config.ts working_directory: ./packages/analytics - name: packages/overpass @@ -80,6 +92,8 @@ jobs: test_command: bun run --cwd packages/overpass test --coverage summary_path: packages/overpass/coverage/coverage-summary.json final_path: packages/overpass/coverage/coverage-final.json + summary_relative: ./coverage/coverage-summary.json + final_relative: ./coverage/coverage-final.json vite_config_path: ./vitest.config.ts working_directory: ./packages/overpass - name: packages/units @@ -87,6 +101,8 @@ jobs: test_command: bun run --cwd packages/units test --coverage summary_path: packages/units/coverage/coverage-summary.json final_path: packages/units/coverage/coverage-final.json + summary_relative: ./coverage/coverage-summary.json + final_relative: ./coverage/coverage-final.json vite_config_path: ./vitest.config.ts working_directory: ./packages/units steps: @@ -109,8 +125,8 @@ jobs: uses: davelosert/vitest-coverage-report-action@v2 with: name: ${{ matrix.name }} - json-summary-path: ./${{ matrix.summary_path }} - json-final-path: ./${{ matrix.final_path }} + json-summary-path: ${{ matrix.summary_relative }} + json-final-path: ${{ matrix.final_relative }} vite-config-path: ${{ matrix.vite_config_path }} working-directory: ${{ matrix.working_directory }} From c4b01d52cbf13870697cc547a6216e37b2fc22b6 Mon Sep 17 00:00:00 2001 From: Andrew Bierman Date: Wed, 20 May 2026 00:47:30 -0600 Subject: [PATCH 08/85] docs(etl): pivot remediation plan from Queues+outbox to Workflows Mark the 2026-05-19 audit-remediation plan as superseded and replace it with a Workflows-based plan that natively provides the durable-step + idempotency + retry + state semantics the prior plan reconstructed manually on top of Queues + Postgres. Audit findings about CSV correctness, validator hardening, observability, retention, and the operational runbook carry into the new plan; the queue-as-state-machine subplot is dropped. Net unit count drops from 15 to 9. Also includes the underlying audit (docs/audits/2026-05-16-etl-audit.md) that grounds both plans. --- docs/audits/2026-05-16-etl-audit.md | 183 +++ ...fix-etl-pipeline-audit-remediation-plan.md | 1062 +++++++++++++++++ ...x-etl-pipeline-workflows-migration-plan.md | 769 ++++++++++++ 3 files changed, 2014 insertions(+) create mode 100644 docs/audits/2026-05-16-etl-audit.md create mode 100644 docs/plans/2026-05-19-001-fix-etl-pipeline-audit-remediation-plan.md create mode 100644 docs/plans/2026-05-20-001-fix-etl-pipeline-workflows-migration-plan.md diff --git a/docs/audits/2026-05-16-etl-audit.md b/docs/audits/2026-05-16-etl-audit.md new file mode 100644 index 0000000000..84f1d69449 --- /dev/null +++ b/docs/audits/2026-05-16-etl-audit.md @@ -0,0 +1,183 @@ +# ETL Pipeline Audit — 2026-05-16 + +## Summary + +The catalog ETL pipeline works end-to-end and has been hardened through a recent series of fixes (OOM, CPU-time budget, atomic counters, byte-range chunking), but it is not production-ready: chunking + a single shared `jobId` produces double-counted `totalProcessed`, mis-marks jobs `completed` after the first chunk finishes, and lacks any dead-letter / retry policy at the queue layer. Catastrophic per-message failures silently swallow errors in `processQueueBatch` (`try/catch` with `console.error` only), so the queue happily acks bad chunks. The retry endpoint also re-queues only the original object key, ignoring multi-chunk jobs entirely. + +**Top 3 risks**: (1) cross-chunk job-status race (any one chunk's completion marks the entire job `completed`), (2) consumer swallows errors so failed messages never retry/DLQ, (3) retry endpoint and stuck-job sweep are incompatible with byte-range chunking. + +## Architecture + +``` +POST /api/catalog/etl ── api-key auth + │ body: { filename, chunks[], source, scraperRevision } + ▼ +1. INSERT etl_jobs (status='running') +2. For each objectKey: R2.head() → split into 20 MB byte-range chunks +3. queueCatalogETL → ETL_QUEUE.sendBatch (one message per chunk, same jobId) + +ETL_QUEUE (max_batch_size=1, max_concurrency=1) + ▼ +processQueueBatch + ▼ +processCatalogETL ── per chunk + ├── R2.get(key, {range}) ── stream body + ├── if non-first chunk: GET first 4 KB → extract header → inject; skip partial row + ├── csv-parse stream w/ backpressure (parser.write returns false → wait 'drain') + ├── yield every 100 rows (setTimeout(1)) + ├── flush at BATCH_SIZE=100 rows: + │ valid → processValidItemsBatch → mergeBySku → embeddings → catalogService.upsert → updateEtlJobProgress + │ invalid → processLogsBatch → invalid_item_logs → updateEtlJobProgress + └── on success: UPDATE etl_jobs SET status='completed' ◀── PROBLEM with multi-chunk jobs + on throw: UPDATE etl_jobs SET status='failed' + rethrow +``` + +Counters are atomic per call (`COALESCE(col, 0) + n` in SQL). Job rows are not. + +## Findings + +### [P0] Multi-chunk jobs are marked `completed` after the first chunk finishes +- **What**: All chunks for a single source file share one `jobId`; each chunk independently sets `status='completed'` on success. +- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:188-191`; chunks created in `packages/api/src/routes/catalog/index.ts:182-200`. +- **Why it matters**: A 100 MB file becomes 5 chunks → 5 messages → the first message to finish flips the job to `completed`, even though 80% of rows haven't been processed yet. The dashboard, `success_rate`, and any downstream check ("is the catalog refresh done?") fire prematurely. Subsequent chunks continue to mutate `totalProcessed/totalValid/totalInvalid`, so the row reads as `completed` with rising counters. +- **Recommendation**: Track per-chunk completion. Two options: (a) add a `chunks_total` and `chunks_completed` column; only set `completed` when `chunks_completed = chunks_total`. (b) give each chunk its own jobId and group by a parent `batch_id`. Option (a) is the smaller change. + +### [P0] `processQueueBatch` swallows errors — failed chunks never retry or DLQ +- **What**: Per-message exceptions are caught and logged but never rethrown; CF Queues auto-acks every message in the batch. +- **Where**: `packages/api/src/services/etl/queue.ts:50-60`. +- **Why it matters**: A transient DB error, OpenAI 429, or R2 read failure permanently loses the chunk. The job is marked `failed` (good) but the message is acked (bad) — there is no retry, no dead-letter queue, and `wrangler.jsonc` does not declare a `dead_letter_queue` or `max_retries`. Combined with the multi-chunk issue above, a single failure can corrupt the job state while other chunks succeed and mark it `completed`. +- **Recommendation**: Rethrow in the catch (or call `message.retry()` explicitly on the specific message). Add `dead_letter_queue` and `max_retries: 3` to the ETL queue consumer in `wrangler.jsonc:76-82`. Process messages with `for...of` calling `message.ack()` / `message.retry()` explicitly so partial-batch semantics are correct even though `max_batch_size=1` today. + +### [P1] Retry endpoint discards multi-chunk structure +- **What**: `POST /admin/etl/:jobId/retry` re-queues exactly one chunk built from `v2/${source}/${filename}` with no chunking. +- **Where**: `packages/api/src/routes/admin/analytics/catalog.ts:434-450`. +- **Why it matters**: If the original job was chunked (20 MB+ files), retry blasts the entire file at one Worker invocation, blowing past the 300s CPU-time limit that prompted the chunking work in the first place. Result: retries of any large failed job silently re-fail. +- **Recommendation**: Re-run the same R2.head + chunk-split logic the producer endpoint uses (lines 182-200). Extract that to a shared helper so both call sites stay in sync. + +### [P1] Stuck-job sweep is wall-clock based and incompatible with serial chunked jobs +- **What**: `POST /admin/etl/reset-stuck` flips any job in `running` for >30 min to `failed`. +- **Where**: `packages/api/src/routes/admin/analytics/catalog.ts:384-403`. +- **Why it matters**: With `max_concurrency=1` and 20 MB chunks each consuming most of a 300s CPU budget, a 500 MB file produces 25 chunks at up to ~5 minutes each → comfortably past 30 minutes. Healthy long jobs will be marked `failed`. The trigger should be "no progress for N minutes" (e.g., `totalProcessed` unchanged), not "started >30 min ago". +- **Recommendation**: Add `lastProgressAt` updated on each `updateEtlJobProgress` call; sweep on `lastProgressAt < now - 15min`. Or check `completedAt IS NULL AND startedAt < now - 2h` for the absolute floor. + +### [P1] First-chunk header injection assumes the first 4 KB contains a complete header +- **What**: For non-first chunks, the parser fetches `bytes 0-4095` and uses `headerText.split('\n')[0]` as the header row. +- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:53-58`. +- **Why it matters**: If the header row exceeds 4 KB (wide CSVs with 30+ columns and long names — possible here given the catalog schema has 25+ fields), `split('\n')[0]` returns a *truncated* header, so `fieldMap` silently maps the last column wrong. There is also no validation that the slice actually contained a newline before `byteEnd=4095`. +- **Recommendation**: Loop the range request (or use a streaming `until newline` reader). At minimum, throw if no `\n` appears in the first 4 KB so the failure is loud, not silent. + +### [P1] Partial-row skip can drop a valid full row when chunk boundary lands on a newline +- **What**: `skipPartialRow` discards everything up to and including the first `\n` after `byteStart`. If `byteStart` happens to be the first byte *after* a newline (i.e., the previous chunk's last byte is `\n`), the producing chunk processed the full row, and this chunk correctly starts on a row boundary — but the skip logic still throws away the first whole row. +- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:95-108`. +- **Why it matters**: Off-by-one row drop at every chunk boundary in worst case (data loss, not just dup). For 25-chunk file → potentially 24 lost catalog items. No test covers the boundary-aligned case. +- **Recommendation**: When splitting chunks at line 195-196 of `routes/catalog/index.ts`, do not split on arbitrary 20 MB offsets — peek at R2 with a short range request and align `byteEnd` to a newline so the skip logic is unnecessary, *or* skip only when the previous byte (range `byteStart-1`) was non-newline. + +### [P1] CSV row spanning chunk boundary is never reassembled +- **What**: A row beginning before `byteEnd` and ending after will be cut in half. The producing chunk parses a truncated row (likely fails validation); the next chunk discards the tail. +- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:95-108` (skip logic), `routes/catalog/index.ts:182-200` (chunk creation). +- **Why it matters**: Every chunk boundary loses (or invalidates) one row. Symptom would be `totalInvalid` rising by ~N per N-chunk job, with field-shaped errors. Severity depends on row width vs 20 MB. +- **Recommendation**: Same as above — align chunk boundaries to row boundaries in the producer. Alternatively, the producing chunk should fetch ~64 KB beyond `byteEnd` to complete its final row, and the next chunk skip logic stays. + +### [P2] `console.log`/`console.error` only — no structured logging, no Sentry +- **What**: Every log uses `console.log` with emoji prefixes; no Sentry integration in ETL paths despite Sentry being a documented monitoring tool. +- **Where**: All ETL files; verified by `grep -rn "Sentry|captureException" packages/api/src/services/etl/` → no results. Same applies to `packages/api/src/`. +- **Why it matters**: A stuck job cannot be debugged without paging through CF Workers logs by hand. No correlation IDs (other than jobId), no per-chunk structured fields (`byteStart`, `rowsProcessed`, `elapsed_ms`), no error categorization. Failures in `processLogsBatch` are caught and `console.error`-ed without rethrow (`packages/api/src/services/etl/processLogsBatch.ts:25-27`) — invalid logs can fail to write and nobody knows. +- **Recommendation**: Add a thin logger (`logger.info({ jobId, chunk: { byteStart, byteEnd }, event: 'chunk_start' })`). Call `Sentry.captureException(err, { tags: { jobId, objectKey } })` in the `processCatalogETL` catch block. + +### [P2] `processLogsBatch` swallows DB errors silently +- **What**: Catch logs to console and returns normally — caller has no idea logs were dropped. +- **Where**: `packages/api/src/services/etl/processLogsBatch.ts:25-27`. +- **Why it matters**: Invalid-item logs are the *only* forensic record of what failed validation. If the INSERT fails (Neon hiccup, payload size, FK violation), we lose visibility forever. The `updateEtlJobProgress` call is also inside the try, so `totalInvalid`/`totalProcessed` will be undercounted. +- **Recommendation**: Rethrow. Let the outer ETL catch flip the job to `failed` — the alternative is silent data quality erosion. + +### [P2] Embedding failure path silently drops embeddings without marking it +- **What**: When `generateManyEmbeddings` throws, items are upserted with `embedding=undefined` (i.e., NULL) but the job still reports as fully successful. +- **Where**: `packages/api/src/services/etl/processValidItemsBatch.ts:52-63`. +- **Why it matters**: No metric distinguishes "successful with embeddings" from "successful but degraded". The `/admin/embeddings` route reports coverage but cannot attribute the drop to a specific job. A backfill is required to recover, and there is no automatic re-queue. +- **Recommendation**: Add a `totalEmbeddingFailures` column on `etl_jobs`, increment it in the fallback path, and surface in the admin dashboard. Optionally enqueue the affected SKUs into `EMBEDDINGS_QUEUE` from the fallback for automatic backfill. + +### [P2] `parser.end()` is called inside a fire-and-forget IIFE — errors are unhandled +- **What**: The async writer is invoked as `(async () => { ... })()` with no `.catch()`. Any stream read error or `parser.write` throw becomes an unhandled rejection. +- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:89-117`. +- **Why it matters**: In CF Workers, unhandled rejections can terminate the isolate. More commonly the outer `for await (const record of parser)` loop will just hang on a stalled parser if the writer rejected. The job will sit in `running` until the stuck-job sweep notices. +- **Recommendation**: Wrap in an explicit promise: `const writerPromise = (async () => { ... })().catch(err => parser.destroy(err));` and `await writerPromise` after the `for await` loop. Surface the error to the outer catch. + +### [P2] `setTimeout(resolve, 1)` every 100 rows is a fragile yield mechanism +- **What**: Used to yield to event loop / give GC a chance. +- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:120`. +- **Why it matters**: `setTimeout` consumes wall-clock budget. Workers have a 30s wall-clock per invocation (separate from `cpu_ms`). At 1ms × 600 yields per 60k-row chunk = 0.6s — fine today, but the comment mentions a previous "per-row yield hits the CF Worker wall-clock limit". The thresholds are tightly coupled and undocumented. +- **Recommendation**: Replace with `await scheduler.yield()` (CF supports it) or `await new Promise(setImmediate)`-equivalent. Add a unit test that verifies a 100k-row CSV completes within wall-clock. + +### [P2] `BATCH_SIZE = 100` is exported but reads inconsistent with comment/runtime +- **What**: `processCatalogEtl.ts:13` exports `BATCH_SIZE = 100`. The catalog OpenAI embedding API supports 1000+ per call, so this is conservative; meanwhile the queue's `batchSize` for `sendBatch` is hard-coded at 100 (`queue.ts:17`) for an unrelated reason (max batch size from CF). Reusing the symbol `100` for two different concepts is fragile. +- **Where**: `processCatalogEtl.ts:13`, `queue.ts:17`. +- **Recommendation**: Rename to `ITEM_FLUSH_BATCH_SIZE` and `CF_QUEUE_BATCH_SIZE`, hoist both to a shared constants file. + +### [P3] `mergeItemsBySku` logs change diff on every merge — unbounded console output +- **What**: Logs a `🔄 Merged SKU` line for every SKU collision with every changed field. +- **Where**: `packages/api/src/services/etl/mergeItemsBySku.ts:34-48`. +- **Why it matters**: On a 500 MB CSV with many duplicate SKUs across chunks, this can produce millions of log lines, polluting CF logs and possibly hitting `logpush` quotas. +- **Recommendation**: Aggregate into a single per-batch summary or gate behind a debug flag. + +### [P3] Validator: no URL scheme check, no length limits, no SKU charset rules +- **What**: `isValidUrl` allows any `new URL()`-parseable input (e.g., `mailto:`, `javascript:`, `file:`). +- **Where**: `packages/api/src/services/etl/CatalogItemValidator.ts:60-67`. +- **Why it matters**: `productUrl` is rendered in the mobile app and on the guides site. A scraper bug could inject `javascript:` URLs that survive to the UI. +- **Recommendation**: Restrict to `http:`/`https:`. Add length caps (`name` ≤ 500, `description` ≤ 50k, `sku` matches `[A-Za-z0-9_.\-/]+`). + +### [P3] Soft-delete is not handled by the upsert +- **What**: `catalogItems` has no `deletedAt` column (verified — grep returns nothing). CLAUDE.md notes "Soft deletes for all user content" but catalog items are scraper-controlled, so this may be intentional. However, an item that disappears from the source CSV is never marked unavailable. +- **Where**: `packages/api/src/db/schema.ts:132-215`; `packages/api/src/services/catalogService.ts:337-407`. +- **Why it matters**: The catalog grows monotonically. Discontinued products keep their `availability` from the last successful upsert. There is no "items present in last job but not in this one → mark out-of-stock" reconciliation. +- **Recommendation**: After a successful ETL, run `UPDATE catalog_items SET availability='OutOfStock' WHERE NOT EXISTS (SELECT 1 FROM catalog_item_etl_jobs WHERE catalog_item_id = id AND etl_job_id IN (last N jobs for this source))`. Or accept the limitation and document it. + +### [P3] No invalid-items retention policy +- **What**: `invalid_item_logs` grows forever; no TTL/sweep. +- **Where**: `packages/api/src/db/schema.ts:481-490`. +- **Why it matters**: Each bad row stores `raw_data` as JSONB plus an `errors` array — a single bad upload can write hundreds of MB to Neon. +- **Recommendation**: Add a scheduled task (or CF Cron Trigger) to drop logs >90 days. + +### [P3] No runbook / deploy docs +- **What**: No `docs/runbooks/etl.md`. `grep "etl|ETL"` in `README.md`/`docs/` returns only stale plan files. +- **Where**: N/A (missing). +- **Recommendation**: Write a 1-page runbook: how to trigger an ETL, how to inspect queue depth (`wrangler queues list/info packrat-etl-queue`), how to retry a failed job, how to drain the queue (`wrangler queues consumer remove`), how to interpret `success_rate`. Reference admin endpoints `/admin/etl/*`. + +## Test Coverage Gaps + +Tests cover the happy path with mocked R2 and globally-mocked DB. The following are **not** tested: + +- **Byte-range chunk processing** — no test sets `byteStart`/`byteEnd` in the message. The injected-header fetch, partial-row skip, and boundary off-by-ones (P1 above) are entirely uncovered. +- **Multi-message job (same jobId, multiple chunks)** — no integration test exercises the "two chunks complete sequentially" path, so the P0 premature-completion bug is invisible to CI. +- **Header > 4 KB** — see P1 finding. +- **Row spanning chunk boundary** — see P1 finding. +- **Embedding service failure path** — `processValidItemsBatch.test` mocks the rejection but does not assert that items were upserted without embeddings (the actual fallback behavior). +- **`processLogsBatch` DB failure** — no test for the swallowed-error case. +- **Backpressure** — `parser.write` returning `false` and waiting on `'drain'` is not unit-testable with the current mock (whole CSV emitted in one chunk). +- **Yield/wall-clock budget** — no test asserts a 100k-row CSV completes under wall-clock. +- **`processQueueBatch`** — no direct test; the per-message catch-and-swallow (P0) is untested. +- **Retry endpoint** — no integration test verifies the retry produces a new running job and a queue send. +- **Stuck-job sweep** — no test for the 30-minute cutoff. +- **Concurrent updates to same job row** — no race-condition test (e.g., two batches calling `updateEtlJobProgress` interleaved). Atomicity at the SQL level is good but a parallel-batch test would lock it in. +- **`mergeItemsBySku` cross-chunk SKU collisions** — merging happens within a single batch; SKUs duplicated across batches (or across chunks) hit the DB upsert path, not the merge path. No test for that. +- **Header injection — wrong column ordering** — what if the source CSV has a BOM, or quoted headers with commas inside? + +## Production Readiness Checklist + +- [ ] Multi-chunk job completion tracked correctly (chunks_total / chunks_completed columns) — addresses P0 #1 +- [ ] Queue consumer rethrows on per-message failure; DLQ + max_retries configured in `wrangler.jsonc` — addresses P0 #2 +- [ ] Retry endpoint chunks large files the same way the producer does — addresses P1 #1 +- [ ] Stuck-job sweep keyed on `lastProgressAt`, not `startedAt` — addresses P1 #2 +- [ ] Chunk boundaries aligned to row boundaries in the producer (or reassembly in the consumer) — addresses P1 #3 and P1 #4 +- [ ] Header injection validates first 4 KB contains a `\n`; tested with wide CSV — addresses P1 #5 +- [ ] Sentry integration in ETL paths with `jobId`/`objectKey` tags — addresses P2 #1 +- [ ] `processLogsBatch` rethrows on DB failure — addresses P2 #2 +- [ ] Embedding fallback tracked via counter and visible in admin dashboard — addresses P2 #3 +- [ ] Writer IIFE error attached to outer flow — addresses P2 #4 +- [ ] Yield mechanism uses `scheduler.yield()` and has a wall-clock test — addresses P2 #5 +- [ ] Rename ambiguous `BATCH_SIZE` constants — addresses P2 #6 +- [ ] `mergeItemsBySku` summary log instead of per-SKU — addresses P3 #1 +- [ ] Validator enforces `http(s):` scheme and length caps — addresses P3 #2 +- [ ] Discontinued-item reconciliation strategy chosen and documented — addresses P3 #3 +- [ ] `invalid_item_logs` retention policy — addresses P3 #4 +- [ ] Runbook checked in at `docs/runbooks/etl.md` — addresses P3 #5 +- [ ] Test coverage added for all gaps listed above diff --git a/docs/plans/2026-05-19-001-fix-etl-pipeline-audit-remediation-plan.md b/docs/plans/2026-05-19-001-fix-etl-pipeline-audit-remediation-plan.md new file mode 100644 index 0000000000..cf8f434675 --- /dev/null +++ b/docs/plans/2026-05-19-001-fix-etl-pipeline-audit-remediation-plan.md @@ -0,0 +1,1062 @@ +--- +title: "fix: ETL pipeline audit remediation" +type: fix +status: superseded +supersededBy: docs/plans/2026-05-20-001-fix-etl-pipeline-workflows-migration-plan.md +supersededReason: "Pivoted execution engine from Cloudflare Queues + outbox to Cloudflare Workflows on 2026-05-20. Workflows natively provides the durable-step + idempotency + retry + state semantics that ~8 of the 15 units in this plan were manually reconstructing. The audit findings about CSV correctness, validator hardening, observability, retention, and runbook remain real and carry into the successor plan; the queue-as-state-machine subplot is dropped." +date: 2026-05-19 +deepened: 2026-05-19 +origin: docs/audits/2026-05-16-etl-audit.md +--- + +# fix: ETL pipeline audit remediation + +## Summary + +Remediate the catalog ETL pipeline against every finding in the 2026-05-16 audit (2 P0, 5 P1, 6 P2, 3 P3), correct two stale assumptions the audit made about Cloudflare runtime APIs, add bucket-vs-job reconciliation (both an admin-triggered tool and automatic post-job verification), and add a "re-ingest from the top" recovery path for jobs the buggy stuck-job sweep has already corrupted. Delivered as one master plan in four sequenced phases — schema + P0 blockers first, then chunking correctness, then observability + reconciliation, then hardening + runbook. + +--- + +## Problem Frame + +The pipeline ingests scraper CSVs from R2 (`packrat-scrapy-bucket`) into Neon Postgres via a Cloudflare Queue consumer. It is currently silently incorrect: live prod admin data (192 runs / 74 failed = 38% failure rate) shows seven large jobs from 2026-05-14 marked `failed` with identical `completedAt` timestamps — the wall-clock-based stuck-job sweep firing on healthy long jobs — while the dashboard reports `successRate: 100%` on those same failed jobs. Audit `docs/audits/2026-05-16-etl-audit.md` enumerates the structural causes: a single shared `jobId` across byte-range chunks lets the first finishing chunk flip the parent job to `completed`, per-message exceptions are swallowed (no DLQ, no retry), byte-range chunk boundaries silently drop or invalidate rows that span them, retries discard chunking entirely, and there is no Sentry / structured logging anywhere in the ETL path. + +The user's stated concern — *"some [data] is missing or falsely labeling as success"* — is corroborated on both ends: `completed` jobs can be premature (P0 #1), and `failed` jobs can be false failures (P1 #2). Either way the catalog count `totalItemsIngested: 304,431` cannot currently be trusted. + +--- + +## Requirements + +- R1. **No chunk causes premature job completion.** A multi-chunk job transitions to `completed` only when every chunk has succeeded. +- R2. **Per-message queue failures retry and ultimately DLQ.** No exception thrown by chunk processing is silently swallowed. +- R3. **Stuck-job sweep is progress-based, not wall-clock-based.** Healthy long-running jobs are not falsely marked `failed`. +- R4. **Chunk boundaries do not drop or invalidate rows.** Every row in the source CSV is processed exactly once. +- R5. **Retry / repair endpoints chunk the same way the producer does.** Retrying a large file does not single-shot it. +- R6. **CSV header injection for non-first chunks is correct or fails loudly.** No silent column misalignment. +- R7. **Every ETL job has post-ingestion verification.** R2 row count is compared to `totalProcessed` and the result is observable; significant deltas are surfaced. +- R8. **Operators can trigger a "from scratch" repair of any historical job** without invoking the original producer endpoint. +- R9. **Failures emit Sentry events with structured context.** Operators can debug a stuck job without paging through raw Worker logs. +- R10. **Embedding-fallback degradation is observable.** A job that completed without embeddings is distinguishable from a fully-successful one. +- R11. **Validator rejects unsafe URLs and oversize fields.** Mobile/web cannot be tricked into rendering `javascript:` URLs from the catalog. +- R12. **`invalid_item_logs` retention is bounded.** A bad upload cannot fill Neon storage indefinitely. +- R13. **A documented runbook exists for ETL operations.** A new on-caller can trigger / inspect / retry / drain without reading source. +- R14. **Test coverage exists for every behavior in R1–R12.** Specifically including the cases the global queue-mock in `packages/api/test/setup.ts` currently hides. + +--- + +## Scope Boundaries + +- The plan does not raise `max_concurrency` above 1 for the ETL queue. Concurrency bump is blocked on per-chunk idempotency keys that this plan introduces; the actual bump is a follow-up after this lands and bakes. +- The plan does not add a DLQ to the embeddings queue. ETL queue DLQ only. +- The plan does not migrate or rewrite the existing `etl_jobs` row data for the 7 historical jobs falsely marked `failed`. The repair-from-scratch endpoint introduced in U6 is the mechanism operators will use; the actual recovery run is operational, not a code unit. +- The plan does not change the producer endpoint's authentication, the source CSV schema, or the scraper revision pinning. +- The plan does not introduce a new ETL Worker — the current `packages/api` Elysia Worker continues to host both the HTTP routes and the queue consumer. +- The plan does not address `apps/landing` / `apps/guides` / `apps/expo` consumers of catalog data even when bucket-vs-job reconciliation finds drift. Surfacing inconsistencies is in scope; downstream cache invalidation is not. + +### Deferred to Follow-Up Work + +- **Concurrency bump on `packrat-etl-queue` consumer**: separate PR after this plan ships and per-chunk idempotency is verified in production for ≥2 weeks. +- **Embeddings-queue DLQ + retry policy**: separate plan; same shape as ETL DLQ work in U3, but a distinct surface. +- **Catalog reconciliation across multiple historical jobs**: only per-job reconciliation is in scope. Historical cross-source rollup ("did we lose 5% of the catalog last quarter?") is a separate analytics workstream. +- **Soft-delete / discontinued-item reconciliation** (audit P3 #3): documented as accepted limitation in the runbook (catalog is scraper-controlled, not user content). A future plan can add `availability='OutOfStock'` reconciliation if business requirements emerge. +- **CLI subcommand surface in `packages/cli/src/commands/admin/etl.ts`**: U12 wires the new admin endpoints into the existing CLI command file. Broader CLI ergonomics work is out of scope. + +--- + +## Context & Research + +### Relevant Code and Patterns + +- **Producer endpoint:** `packages/api/src/routes/catalog/index.ts:229-293` — `POST /catalog/etl`, R2 head + 20 MB chunking at `:253-271`. Chunk creation logic to extract into a shared helper used by U6. +- **Queue producer:** `packages/api/src/services/etl/queue.ts:6-41` — `queueCatalogETL`; uses `sendBatch` with `batchSize: 100` (CF queue per-call cap). +- **Queue consumer dispatch:** `packages/api/src/services/etl/queue.ts:43-61` — `processQueueBatch` with the swallowed catch at `:50-60`. **This is the core P0 #2 surface.** +- **Per-chunk processor:** `packages/api/src/services/etl/processCatalogEtl.ts` — header injection (`:50-58`), partial-row skip (`:95-108`), batch flush (`:120-187`), per-chunk completion (`:188-191`), per-chunk failure (`:201-204`). +- **Atomic counter pattern (mirror this):** `packages/api/src/services/etl/updateEtlJobProgress.ts:16-23` — `sql\`COALESCE(${col}, 0) + ${n}\``. New `chunks_completed` / `total_embedding_failures` increments use the same idiom; the "set status=completed when chunks_completed+1 == chunks_total" branch uses a single `UPDATE ... SET ... WHERE` with a `CASE` expression in the same transaction. +- **Embeddings queue pattern (mirror this):** `packages/api/src/services/catalogService.ts:461-507` — consumer rethrows on failure so CF Queue retries fire. ETL consumer must adopt the same shape. +- **Admin routing pattern:** `packages/api/src/routes/admin/index.ts:117-237` mounts the admin prefix; `:230-237` enforces `adminAuthGuard` on every sub-route. New endpoints in `packages/api/src/routes/admin/analytics/catalog.ts` inherit the guard. +- **R2 access (S3-API not Workers binding):** `packages/api/src/services/r2-bucket.ts:193-360` — `R2BucketService({ env, bucketType: 'catalog' })` wraps `@aws-sdk/client-s3` against the R2 S3 endpoint. `r2.head(key)` and `r2.get(key, { range: { offset, length } })` are the surface. Range format `bytes=offset-(offset+length-1)` at `:675-691`. +- **Schema location:** `packages/db/src/schema.ts:446-510` — `etlJobs`, `invalidItemLogs`, `catalogItemEtlJobs`, status enum at `:460`. **Audit cites a stale path (`packages/api/src/db/schema.ts`); the file was extracted into the `packages/db` package — see merge `b14f4dbd5`.** +- **Drizzle migration location:** `packages/api/drizzle/NNNN_.sql` + `meta/NNNN_snapshot.json` + `_journal.json`. Latest is `0047_cute_bloodscream.sql`; new migrations land at `0048` and `0049` (split per Drizzle Kit's enum-add constraint). Generated via `bun run --cwd packages/api db:generate`. Custom linter at `scripts/lint/check-drizzle-migrations.ts` runs in `lint:custom`. +- **Existing ETL integration test:** `packages/api/test/etl.test.ts` — mocks `R2BucketService` per-test, uses real Postgres via wsproxy at `localhost:5434`. Setup at `packages/api/test/setup.ts:535-572` globally mocks both `queueCatalogETL` and `processQueueBatch` (lines `:544-551`) — this is precisely *why* the per-message swallow in P0 #2 is invisible to CI today, and U14 must un-mock to cover it. +- **Wrangler config:** `packages/api/wrangler.jsonc:65-89` (prod queues) and `:161-194` (dev). Currently `max_batch_size: 1, max_concurrency: 1`, **no `dead_letter_queue`, no `max_retries`** on either consumer. Queue routing handler at `packages/api/src/index.ts:109-124`. +- **Admin CLI surface:** `packages/cli/src/commands/admin/etl.ts` already exists. New endpoints in U6 and U12 add corresponding subcommands. + +### Institutional Learnings + +- `docs/solutions/` has no prior ETL, Cloudflare Queues, R2 byte-range, or Sentry-in-Workers learnings — only an unrelated Better Auth CLI note and an Android UI bug. This remediation is greenfield from an institutional-knowledge standpoint, which makes it a strong `/ce-compound` target after each phase ships. + +### External References + +- **Cloudflare Queues — ack/retry semantics:** `message.ack()` / `message.retry({ delaySeconds })` / `ackAll()` / `retryAll()` documented at . Throwing fails the un-acked remainder of the batch. `retryDelaySeconds` max is 24h per . +- **Cloudflare Queues — DLQ:** `dead_letter_queue` (string name) + `max_retries` (default 3, max 100) in the consumer block per . +- **Cloudflare Workers Scheduler:** Only `scheduler.wait(ms)` is documented at . **`scheduler.yield()` does not exist** — the audit P2 #5 recommendation is wrong on this. Use `await scheduler.wait(0)` instead. +- **Wall-clock limit:** Queue consumer wall-clock cap is **15 minutes**, not 30 seconds, per . The audit's "30 s wall-clock" framing under P2 #5 is stale. +- **Sentry on Cloudflare:** Prefer the first-party `@sentry/cloudflare` over toucan-js. Wrap via `Sentry.withSentry(optsFn, { fetch, queue })` per . Queue instrumentation guidance at . +- **Drizzle enum-add limitation:** `ALTER TYPE … ADD VALUE` inside the same transaction as code that uses the new value fails. Split migrations. Tracked at . +- **R2 range reads with AWS SDK:** R2's S3 API fully supports the `Range` header — `GetObjectCommand({ Range: 'bytes=0-1023' })` behaves identically to S3 per . + +--- + +## Key Technical Decisions + +- **Track chunk completion via two new columns (`chunks_total`, `chunks_completed`) on the existing `etl_jobs` row, gated by a per-chunk idempotency table `etl_job_chunks(job_id, chunk_index, completed_at)` with PK on `(job_id, chunk_index)`.** Rationale: even at `max_concurrency: 1` today, Cloudflare Queues are *at-least-once* — a chunk whose DB writes succeed but whose ack is lost will be redelivered, which would double-increment a naive `chunks_completed = chunks_completed + 1` and either crash through `chunks_total` or transition the job to `completed` while a sibling chunk is still pending. The idempotency table makes the increment a deterministic side-effect of `INSERT … ON CONFLICT (job_id, chunk_index) DO NOTHING RETURNING 1`; the counter only bumps when the insert created a new row. This was originally scoped as a follow-up under "Deferred" but the deepening pass surfaced it as a correctness prerequisite — pulled forward into U1/U2. +- **No new `partial` enum value on `etl_job_status`.** Embedding-fallback degradation is observable via `total_embedding_failures > 0` on a `completed` row. Adding an enum value would force the audit P2 #3 split into two migrations (Drizzle Kit limitation) and complicate every admin filter without observable benefit. +- **Use `@sentry/cloudflare` (first-party), not toucan-js as the audit suggested.** Toucan still works but is no longer the recommended Sentry path on Workers as of 2026. `withSentry({ fetch, queue })` wraps both entry points in one call; no manual `waitUntil` plumbing needed. +- **Use `await scheduler.wait(0)` for yielding, not the non-existent `scheduler.yield()`.** Audit P2 #5 is corrected here. +- **Stuck-job sweep keyed on `last_progress_at < now() - interval '15 minutes'` AND `status = 'running'`,** not on `started_at`. The 15-min figure derives from the actual CF Queue consumer wall-clock cap (15 min), not the audit's stale 30 s/30 min framing. With per-chunk progress updates writing `last_progress_at`, any chunk making real progress is safe; only truly stalled jobs flip to `failed`. +- **Row-boundary alignment happens in the producer**, not the consumer. The producer's `r2.head(key)` flow does an extra small range read on each chunk-end region (e.g., 64 KB) to find the last `\n` and emits chunks with newline-aligned `byteEnd`. This eliminates both the partial-row skip bug (P1 #4) and the row-spanning-chunk bug (P1 #5) in one place. Consumer's `skipPartialRow` logic is removed. +- **CSV header re-read with bounded loop, not a fixed 4 KB slice.** For non-first chunks, the consumer fetches `[0, 4096)`, and if no `\n` appears, expands to `[0, 16384)`, then `[0, 65536)`. If still no newline, throw — header is malformed. Eliminates P1 #3 silent column misalignment. +- **Per-chunk idempotency key is `(jobId, chunkIndex)`** — added to `CatalogETLMessage`. Even though `max_concurrency: 1` means de-facto serialization today, threading the key now unblocks the future concurrency bump without another migration. +- **DLQ is a dedicated new queue `packrat-etl-dlq`** with a minimal consumer that captures the failure to Sentry, persists a row to a new `etl_dlq_events` table for forensics, and acks. The DLQ does *not* attempt to re-process — it's an event sink + visibility tool. +- **Reconciliation runs as both a manual admin endpoint and an automatic post-job step, with the automatic step on its own queue.** Manual endpoint stays synchronous (operator-explicit, scoped to one job). Automatic step is dispatched as a queue message to a new `packrat-etl-reconcile-queue` on the final-chunk completion transition, *not* via `ctx.waitUntil` — `waitUntil` shares the queue invocation's wall-clock budget, which for a multi-GB CSV exceeds the 15-min cap when added on top of the chunk's own processing time. The reconcile consumer streams the file in 100 MB byte-range windows with progress checkpointed to a transient column so retries resume. The consumer's `INSERT … RETURNING` includes `verified_at IS NULL` as an idempotency gate so a redelivered reconcile message is a no-op. Warning threshold remains `> max(10, ceil(0.01 * total_processed))`. +- **Repair-from-scratch endpoint creates a NEW `etl_jobs` row and links it to the old via a new nullable `superseded_by_job_id` column with `ON DELETE SET NULL` and a paired `superseded_at timestamp`.** No mutation of the old row's counters — preserves audit trail and lets the dashboard show "originally failed, repaired by job X". `ON DELETE SET NULL` (not `CASCADE`) so deleting one row never silently nukes a chain of repair attempts. A CHECK constraint prevents self-reference (`superseded_by_job_id != id`). The runbook procedure (U15) requires verifying R2 source presence + ETag match before invoking repair, so an overwritten source cannot silently re-ingest the wrong file. +- **Structured logger lives at `packages/api/src/utils/logger.ts`** as a thin wrapper around `console.*` for now, accepting a `LogContext` (jobId, chunkIndex, r2Key, etc.) and emitting JSON-prefixed lines. Sentry breadcrumbs piggyback on the same call surface. Not a full logger framework — that's a separate decision. + +--- + +## Open Questions + +### Resolved During Planning + +- **Should the chunk completion track go on `etl_jobs` columns alone, or be paired with a per-chunk idempotency table?** Resolved during deepening: both. `etl_jobs.{chunks_total, chunks_completed}` are the counters; `etl_job_chunks(job_id, chunk_index)` is the idempotency gate that makes the increment safe under at-least-once delivery. See Key Technical Decisions. +- **Should embedding-fallback get a new enum value `partial`?** Resolved: no — use `total_embedding_failures` counter on a `completed` row. +- **Toucan-js or `@sentry/cloudflare`?** Resolved: `@sentry/cloudflare`. See External References. +- **Wall-clock budget for the stuck-job sweep cutoff?** Resolved: `last_progress_at < now() - interval '15 minutes'`, matching the actual queue-consumer wall-clock cap. +- **Should the row-boundary alignment happen in producer or consumer?** Resolved: producer. Single source of truth for chunk boundaries. +- **Should auto-reconcile use `ctx.waitUntil` or its own queue?** Resolved during deepening: dedicated queue (`packrat-etl-reconcile-queue`) with resumable byte-range streaming. `waitUntil` shares the chunk consumer's wall-clock budget, which fails at multi-GB files. +- **Should the DLQ consumer's INSERT + status UPDATE be transactional?** Resolved during deepening: yes, single `db.transaction()`. Same for the sweep's UPDATE + sentinel-event INSERT. +- **Should the migration split into 0048a/0048b/0048c?** Resolved during deepening: no — at ~200 rows, the single-migration approach is fine. Splitting becomes correct when `etl_jobs` exceeds ~100k rows, and the migration header carries a comment to revisit at that scale. + +### Deferred to Implementation + +- **Exact Drizzle migration sequencing within Phase 1.** All six columns + the partial index + the new `etl_dlq_events` table can land in a single migration `0048` since none touch the enum. Whether to split `superseded_by_job_id` (added later in U6) into its own migration `0049` or include it in `0048` is decided at U1 implementation. Either way the enum stays untouched in this plan. +- **`@sentry/cloudflare` instrumentation depth for the queue consumer.** The exact `Sentry.startSpan` attributes per queue message (some attributes are conventional, some are CF-specific) get finalized when U8 lands. +- **Sentry sampling rate** for the queue consumer. Default to `tracesSampleRate: 0.1` and tune in production; not a plan-time decision. +- **Exact threshold for "significant" reconciliation delta** that triggers a Sentry warning vs informational event. Default: `> max(10, ceil(0.01 * total_processed))` rows of delta. Tunable in production. +- **Cron schedule for `invalid_item_logs` retention sweep.** Daily at 09:00 UTC unless ops has a quieter window. + +--- + +## High-Level Technical Design + +> *This illustrates the intended approach and is directional guidance for review, not implementation specification. The implementing agent should treat it as context, not code to reproduce.* + +```text +Producer ─── POST /catalog/etl ──┐ + │ + ▼ + ┌─────────────────────────────────────────────┐ + │ chunkCsvForR2(key) (NEW shared helper) │ + │ 1. r2.head(key) -> size │ + │ 2. for each 20 MB window: │ + │ peek (next 64 KB) to find last '\n' │ + │ emit chunk with byteEnd = newline-1 │ + │ 3. tag each chunk: { jobId, chunkIndex, │ + │ chunksTotal, byteRange } + └─────────────────────────────────────────────┘ + │ + INSERT etl_jobs + (status='running', + chunks_total=N, + chunks_completed=0) + │ + ETL_QUEUE.sendBatch(chunks) + │ + ▼ + ┌─────────────────────────────────────────────┐ + │ processQueueBatch (REWRITE) │ + │ for message of batch: │ + │ try { │ + │ processCatalogETL(msg) │ + │ message.ack() │ + │ } catch (err) { │ + │ Sentry.captureException(err, {...}) │ + │ message.retry({ delaySeconds: 30 }) │ + │ } │ + └─────────────────────────────────────────────┘ + │ + ▼ + ┌─────────────────────────────────────────────┐ + │ processCatalogETL (per chunk) │ + │ r2.get(key, range) -> stream │ + │ if chunkIndex > 0: re-fetch header │ + │ (expand 4K→16K→64K, throw if no '\n') │ + │ parse rows (csv-parse, backpressure) │ + │ per 100 rows: scheduler.wait(0) │ + │ flush valid -> processValidItemsBatch │ + │ (embedding fallback increments │ + │ total_embedding_failures atomically) │ + │ flush invalid -> processLogsBatch │ + │ (now RETHROWS on DB failure) │ + │ on success: │ + │ UPDATE etl_jobs │ + │ SET chunks_completed = chunks_completed+1, + │ last_progress_at = now(), │ + │ status = CASE │ + │ WHEN chunks_completed+1 │ + │ = chunks_total │ + │ THEN 'completed' │ + │ ELSE status │ + │ END │ + │ if completed (in same txn): │ + │ enqueue ReconcileMessage to │ + │ packrat-etl-reconcile-queue │ + └─────────────────────────────────────────────┘ + │ + (on completion transition) + ▼ + ┌─────────────────────────────────────────────┐ + │ processReconcileBatch │ + │ reconcileJob(jobId, resumeFromByte=0): │ + │ if verified_at IS NOT NULL: ack │ + │ stream 100 MB byte-range windows │ + │ checkpoint to │ + │ verified_row_count_partial │ + │ if budget low: throw ResumeError │ + │ (consumer re-enqueues) │ + │ on EOF: UPDATE verified_at, count │ + │ if delta > threshold: Sentry warning │ + └─────────────────────────────────────────────┘ + │ + (on any thrown error after retries) + ▼ + packrat-etl-dlq + │ + ▼ + ┌─────────────────────────────────────────────┐ + │ dlqConsumer │ + │ Sentry.captureException │ + │ INSERT etl_dlq_events │ + │ ack │ + └─────────────────────────────────────────────┘ + +Background (CF Cron): + stuck-job sweep: status='running' AND last_progress_at < now()-15min + -> status='failed', emit Sentry warning + invalid-log retention: DELETE FROM invalid_item_logs WHERE created_at < now()-90d +``` + +--- + +## Implementation Units + +### U1. Schema migration: chunk tracking, idempotency table, progress timestamp, embedding failures, reconciliation columns, DLQ events table, constraint hardening + +**Goal:** Add the columns, tables, indexes, and constraints that the rest of the plan reads and writes. Lands first so every subsequent unit can compile and migrate against a known schema. Single migration `0048` is acceptable at the current ~200-row scale of `etl_jobs`; splitting into multiple migrations is unnecessary engineering at this size (revisit if `etl_jobs` exceeds ~100k rows). + +**Requirements:** R1, R3, R7, R8, R10 + +**Dependencies:** None + +**Files:** +- Modify: `packages/db/src/schema.ts` (add columns to `etlJobs`; add new `etlJobChunks` table; add new `etlDlqEvents` table; add UNIQUE constraint to `catalogItemEtlJobs`; export all) +- Create: `packages/api/drizzle/0048_etl_chunking_and_observability.sql` +- Create: `packages/api/drizzle/meta/0048_snapshot.json` (generated) +- Modify: `packages/api/drizzle/meta/_journal.json` (generated) +- Test: `packages/api/test/db-schema-etl.test.ts` (new — schema smoke test asserting columns exist with expected defaults; uses the existing Docker Postgres wsproxy setup at `localhost:5434`) + +**Approach:** +- Columns added to `etl_jobs`: + - `chunks_total integer` (nullable — single-chunk legacy jobs leave it null) + - `chunks_completed integer DEFAULT 0 NOT NULL` + - `last_progress_at timestamp` (nullable initially; backfilled to `started_at` for legacy rows in the same migration) + - `total_embedding_failures integer DEFAULT 0 NOT NULL` + - `verified_at timestamp` (nullable) + - `verified_row_count integer` (nullable) + - `verified_row_count_partial integer` (nullable — checkpoint for resumable reconcile in U10) + - `superseded_by_job_id text` (nullable, FK to `etl_jobs.id` `ON DELETE SET NULL`) + - `superseded_at timestamp` (nullable — paired with `superseded_by_job_id` so the timeline survives even after FK cleanup) + - `source_etag text` (nullable — captured on producer insert from `r2.head(objectKey).etag`; U6's repair endpoint uses this for failure-closed source verification) + - `source_last_modified timestamp` (nullable — same capture; redundant with etag but cheap) +- CHECK constraints on `etl_jobs`: + - `etl_jobs_chunks_completed_lte_total CHECK (chunks_total IS NULL OR chunks_completed <= chunks_total)` — fail loudly on over-count. + - `etl_jobs_no_self_supersede CHECK (superseded_by_job_id IS NULL OR superseded_by_job_id <> id)` — prevent self-referential repair loop. +- New indexes on `etl_jobs`: + - Partial: `etl_jobs_running_progress_idx` on `(status, last_progress_at)` `WHERE status = 'running'` — for the U5 stuck-job sweep. + - Partial: `etl_jobs_unverified_idx` on `(verified_at)` `WHERE status = 'completed' AND verified_at IS NULL` — for the U10 watchdog scan. + - `etl_jobs_superseded_by_idx` on `(superseded_by_job_id)` — for the admin dashboard's "is this job superseded?" lookup. +- New table `etl_job_chunks` (per-chunk idempotency, see Key Technical Decisions): + - `job_id text NOT NULL` (FK to `etl_jobs.id` `ON DELETE CASCADE`) + - `chunk_index integer NOT NULL` + - `completed_at timestamp DEFAULT now() NOT NULL` + - `PRIMARY KEY (job_id, chunk_index)` +- New table `etl_dlq_events`: `id text PK`, `job_id text` (FK, nullable, `ON DELETE SET NULL`), `chunk_index integer`, `message_body jsonb`, `error_message text`, `error_stack text`, `attempts integer`, `source text` (one of `consumer`, `sweep`; defaults to `consumer`), `created_at timestamp DEFAULT now() NOT NULL`. Index on `created_at`. +- Modification to `catalog_item_etl_jobs`: add `UNIQUE (catalog_item_id, etl_job_id)` so a redelivered chunk's upsert can use `ON CONFLICT DO NOTHING` and not produce duplicate provenance rows. +- Backfill: `UPDATE etl_jobs SET last_progress_at = started_at WHERE last_progress_at IS NULL`. Safe — `etl_jobs` is ~200 rows; sub-100ms on Neon. +- Drizzle generator: `bun run --cwd packages/api db:generate` then verify the SQL file matches the design. **Verify Drizzle Kit emits `DEFAULT 0 NOT NULL` literally in the SQL** — Drizzle sometimes drops the SQL-side default and keeps only the JS-side, which would break inserts from in-flight old workers during a rolling deploy. **Do NOT touch the `etl_job_status` enum in this migration** — no new enum value is needed (see Key Technical Decisions). +- Drizzle Kit does not auto-emit `CONCURRENTLY` for indexes. At 200 rows the index build is instant so `CONCURRENTLY` is nice-to-have, not blocking. If the table grows >100k rows before this lands, hand-edit the generated SQL to use `CREATE INDEX CONCURRENTLY IF NOT EXISTS` and split each index into its own statement-breakpoint block. + +**Patterns to follow:** +- Existing `etl_jobs` definition at `packages/db/src/schema.ts:460-479` for column shape and import style. +- Migration `0027_past_madrox.sql` (added `scraper_revision` + index) for the "add column + partial index" pattern. +- `scripts/lint/check-drizzle-migrations.ts` runs in `lint:custom`; the new migration must pass it. + +**Test scenarios:** +- Happy path: After migration runs against a populated test DB, all 8 new `etl_jobs` columns are present with the documented defaults; `etl_job_chunks` and `etl_dlq_events` exist; the three new partial/normal indexes are queryable (`EXPLAIN SELECT ... WHERE status='running' ...` uses the running-progress index; the unverified index serves the watchdog). +- Happy path: `INSERT INTO etl_job_chunks (job_id, chunk_index) VALUES ('j1', 0)` succeeds; a duplicate insert returns no row via `ON CONFLICT DO NOTHING RETURNING 1` and the table still contains exactly one row. +- Edge case: Legacy rows have `chunks_total = NULL` and `last_progress_at` backfilled to `started_at`. +- Edge case: `chunks_completed DEFAULT 0` is correctly applied to existing rows (verify with a row that has `chunks_completed = 0` post-migration). The generated SQL must literally include `DEFAULT 0 NOT NULL` — assert via SQL `information_schema.columns`. +- Edge case: `UNIQUE (catalog_item_id, etl_job_id)` on `catalog_item_etl_jobs` prevents a duplicate-insert (returns conflict). +- Error path: Attempting to insert a row with `chunks_completed > chunks_total` violates the CHECK constraint and errors clearly. +- Error path: Attempting to set `superseded_by_job_id = id` violates the no-self-supersede CHECK. +- Error path: Re-running the migration on an already-migrated DB is a no-op (Drizzle's migration log handles this; smoke-test the up/down via `bun run --cwd packages/api db:migrate`). +- Edge case: Down-migration cleanly drops the new columns/tables on a DB with no Phase 2+ data. **Once Phase 2 ships and writes start landing in the new columns, the migration is forward-only** — document in the migration header comment. + +**Verification:** +- `bun run --cwd packages/api db:migrate` applies cleanly against a fresh Docker Postgres + against a Postgres seeded with current-prod-shape `etl_jobs` rows. +- `bun lint:custom` passes on the new migration. +- `bun test:api:unit` includes the new schema test and it passes. + +--- + +### U2. P0 #1 fix: chunk-completion lifecycle in producer + consumer + +**Goal:** A multi-chunk job's `status` transitions to `completed` only after every chunk has finished. Premature completion eliminated. + +**Requirements:** R1 + +**Dependencies:** U1 + +**Files:** +- Modify: `packages/api/src/routes/catalog/index.ts` (producer endpoint sets `chunks_total` on `etl_jobs` insert and tags each `CatalogETLMessage` with `chunkIndex` and `chunksTotal`) +- Modify: `packages/api/src/services/etl/types.ts` (extend `CatalogETLMessage.data` with `chunkIndex: number` and `chunksTotal: number`; `byteStart`/`byteEnd` remain) +- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (rewrite the `:188-191` success-path UPDATE to use the `CASE` expression that flips status only when `chunks_completed + 1 = chunks_total`; also update `last_progress_at` on every counter write) +- Modify: `packages/api/src/services/etl/updateEtlJobProgress.ts` (include `last_progress_at: sql\`now()\`` in the update set so every progress write refreshes the sweep timestamp) +- Test: `packages/api/test/etl-chunk-completion.test.ts` (new) + +**Approach:** +- Producer: compute `chunks` first, then `INSERT etl_jobs (..., chunks_total) VALUES (..., ${chunks.length})` — a single round-trip including `chunks_total`. Then `sendBatch` with each message carrying `chunkIndex` 0..N-1 and `chunksTotal: N`. Setting `chunks_total` in the initial INSERT (rather than a separate follow-up UPDATE) eliminates a window where a chunk consumer could observe `chunks_total IS NULL` and silently fail the `chunks_completed + 1 = chunks_total` CASE comparison. +- Consumer success path runs inside a single Drizzle `db.transaction()`: + 1. `INSERT INTO etl_job_chunks (job_id, chunk_index) VALUES ($1, $2) ON CONFLICT (job_id, chunk_index) DO NOTHING RETURNING 1` — the idempotency gate. If no row returned, this is a redelivery; skip the increment, ack the message, return. + 2. If the insert created a row, run the atomic UPDATE: `UPDATE etl_jobs SET chunks_completed = chunks_completed + 1, last_progress_at = now(), status = CASE WHEN chunks_completed + 1 = chunks_total THEN 'completed' ELSE status END, completed_at = CASE WHEN chunks_completed + 1 = chunks_total THEN now() ELSE completed_at END WHERE id = $1 AND status = 'running' RETURNING status, chunks_completed, chunks_total`. + 3. The `WHERE status = 'running'` gate prevents clobbering a row the U5 sweep has already flipped to `failed` (status-flip-flop hazard). + 4. If the returned row shows the transition to `completed`, *and* this transaction was the one that created the chunk-row in step 1, send a message to `packrat-etl-reconcile-queue` (see U10) for the auto-reconcile. +- On per-chunk failure: the consumer no longer flips the parent job to `failed` immediately. Instead it lets the message throw / retry. The parent job only flips to `failed` via (a) DLQ consumer when retries are exhausted, or (b) the stuck-job sweep (U5). +- Single-chunk legacy jobs: when `chunks_total IS NULL`, the `etl_job_chunks` insert still gates the increment; legacy rows backfilled to `chunks_total = 1` migrate cleanly. Backwards-compatible with any in-flight legacy messages. +- The CHECK constraint `chunks_completed <= chunks_total` from U1 is the loud-failure safety net — if the idempotency gate ever leaks (e.g., a code bug bypasses the chunk-table insert), the next `UPDATE` errors with a constraint violation rather than silently corrupting the counter. + +**Patterns to follow:** +- Atomic SQL update idiom at `packages/api/src/services/etl/updateEtlJobProgress.ts:16-23`. +- Drizzle transaction shape: `await db.transaction(async (tx) => { ... })`. + +**Test scenarios:** +- Happy path: 5-chunk job; chunks 0..3 complete successfully → status remains `running` with `chunks_completed = 4`; chunk 4 completes → status flips to `completed`, `completed_at` set, `etl_job_chunks` has 5 rows. +- Happy path (idempotency): Chunk 2 succeeds, ack lost, CF redelivers → second attempt's `INSERT … ON CONFLICT DO NOTHING RETURNING` returns no row → increment is skipped → `chunks_completed` increments exactly once over the two deliveries. +- Edge case: Chunks complete out of order (chunk 3 finishes before chunk 1) → status flips only when all five have incremented; the `etl_job_chunks` rows record actual completion order. +- Edge case: Single-chunk legacy job (`chunks_total = 1`) → flips to `completed` on its one success; `etl_job_chunks` has 1 row. +- Edge case: Sweep flips job to `failed` mid-flight; the next chunk's UPDATE `WHERE … AND status = 'running'` returns zero rows → transaction sees the conflict, logs warning, lets the operator route to repair-from-scratch. +- Error path: One chunk throws; other chunks succeed → parent job stays `running` while CF Queue retries the failed chunk; if retries exhaust, DLQ consumer (U3) handles state transition. +- Error path: CHECK constraint trips (hypothetical leaked-idempotency bug) → UPDATE errors loudly, chunk retries, no silent corruption. +- Integration: With `R2BucketService` mocked to return a small CSV split into 3 chunks via `byteRange`, the full producer→queue→consumer cycle ends in exactly one `status=completed` transition for the parent job AND exactly one reconcile message enqueued. +- Integration (idempotency at scale): Replay every chunk message twice → `etl_job_chunks` has exactly `chunks_total` rows, counters match, status = `completed`. + +**Verification:** +- Re-running `etl.test.ts` plus the new test under `bun test:api` shows no `status='completed'` write until `chunks_completed = chunks_total`. +- A manual prod-shape replay (`POST /catalog/etl` against the dev Worker with a CSV that produces ≥3 chunks) shows the dashboard's `successRate` remain at the running state until all chunks finish. + +--- + +### U3. P0 #2 fix: explicit ack/retry + DLQ wiring + +**Goal:** No per-message exception is silently swallowed. Failures retry; exhausted retries land in a dedicated DLQ that emits Sentry events and persists for forensics. + +**Requirements:** R2, R9 + +**Dependencies:** U1 (for `etl_dlq_events` table) + +**Files:** +- Modify: `packages/api/src/services/etl/queue.ts` (rewrite `processQueueBatch` for explicit per-message ack/retry; remove the swallow at `:50-60`) +- Create: `packages/api/src/services/etl/processDlqEvent.ts` (DLQ consumer; INSERT into `etl_dlq_events`, capture Sentry exception, ack) +- Modify: `packages/api/src/index.ts` (extend the `queue()` switch at `:109-124` with arms for `packrat-etl-dlq` and `packrat-etl-dlq-dev`) +- Modify: `packages/api/wrangler.jsonc` (declare `packrat-etl-dlq` and `packrat-etl-dlq-dev` as producer + consumer; add `dead_letter_queue: "packrat-etl-dlq"` and `max_retries: 3` to the ETL consumer block at `:78-82` and dev equivalent at `:178-182`) +- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (when a chunk's processing throws, also UPDATE `last_progress_at` and increment a transient `last_error_at` if useful — see Approach for trade-off; primary work is removing the per-chunk `status='failed'` write at `:201-204` since the DLQ consumer is now responsible for state transition) +- Test: `packages/api/test/etl-queue-retry.test.ts` (new — covers the global-mock blind spot in `setup.ts:544-551`) + +**Approach:** +- Rewrite `processQueueBatch`: + ```text + for (const message of batch.messages) { + try { + await processCatalogETL({ message: message.body, env }); + message.ack(); + } catch (err) { + logger.error('etl.chunk.failed', { jobId, chunkIndex, err }); + Sentry.captureException(err, { tags: { jobId, chunkIndex, r2Key }, contexts: { queue: { messageId: message.id, attempts: message.attempts } } }); + message.retry({ delaySeconds: 30 }); + } + } + ``` + (Sentry wiring lives in U8; in U3 the call sites are added as no-ops that U8 fills in.) +- DLQ consumer reads from `packrat-etl-dlq` and, inside a single `db.transaction()`, performs: (1) `INSERT INTO etl_dlq_events (… source = 'consumer')` capturing `{ jobId, chunkIndex, message_body, error_message, error_stack, attempts }`, (2) `UPDATE etl_jobs SET status = 'failed', completed_at = now() WHERE id = $1 AND status = 'running'` — the `WHERE status = 'running'` clause is the no-op gate that prevents racing the U5 sweep. `Sentry.captureException` fires *before* the transaction (so the event survives even if the DB transaction rolls back) with tags `{ jobId, chunkIndex, r2Key }`. The `error_stack` field is contractually free of raw CSV row data — only structural error messages — to avoid accidental PII capture (documented at the call site). +- Wrangler config additions: + ```text + // producer + { "queue": "packrat-etl-dlq", "binding": "ETL_DLQ" } + // consumer + { "queue": "packrat-etl-dlq", "max_batch_size": 10, "max_batch_timeout": 30 } + // on the existing ETL consumer: + "dead_letter_queue": "packrat-etl-dlq", + "max_retries": 3 + ``` + Same shape applied to `*-dev` queues. +- The removal of the per-chunk `status='failed'` write at `processCatalogEtl.ts:201-204` is critical — leaving it would race with the DLQ consumer's state transition. +- `processCatalogETL` rethrows on any internal failure (it already does); no behavioral change other than the consumer's catch now retries instead of swallowing. + +**Patterns to follow:** +- Embeddings consumer pattern at `packages/api/src/services/catalogService.ts:461-507` for the rethrow shape. +- Existing `queue()` dispatch at `packages/api/src/index.ts:109-124` for the new DLQ arm. + +**Test scenarios:** +- Happy path: Single message processes successfully → `message.ack()` called exactly once; no retry; no DLQ row. +- Error path: Transient throw (simulated R2 5xx) → first call: `message.retry({ delaySeconds: 30 })` and no DLQ; second call succeeds → ack. Total DLQ rows = 0. +- Error path: Permanent throw (4 attempts all fail) → exhausts `max_retries: 3` → message routed to `packrat-etl-dlq` → DLQ consumer inserts row in `etl_dlq_events` with `attempts = 4`, captures Sentry, flips `etl_jobs.status = 'failed'`. +- Integration: Un-mock `processQueueBatch` (override `setup.ts:544-551` per-file with `vi.doUnmock`) and exercise the real consumer against an in-memory queue stub. +- Edge case: Two messages in a batch, first throws and second succeeds (this should not happen at `max_batch_size: 1` but the code path supports it) → first retries, second acks; no cross-contamination of state. + +**Verification:** +- New test passes with the per-message catch removed; passes with the catch present too (so the test actually proves the new behavior). +- `bun test:api` overall still green. +- Inspecting `packrat-etl-dlq` queue depth in `wrangler queues info packrat-etl-dlq-dev` after a forced failure shows zero (because the DLQ consumer drains immediately). + +--- + +### U4. Sweep cleanup: remove the broken wall-clock stuck-job sweep before U5 replaces it + +**Goal:** Take the existing `POST /admin/etl/reset-stuck` endpoint out of production rotation before U5's progress-based replacement lands, to stop new false-failures while the rest of Phase 2 ships. + +**Requirements:** R3 + +**Dependencies:** None (independent of U1; this is a code removal) + +**Files:** +- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (remove or guard the `POST /admin/etl/reset-stuck` route at `:384-409`; if removed, also remove from the OpenAPI spec) +- Modify: `packages/cli/src/commands/admin/etl.ts` (drop any subcommand wired to the removed endpoint) +- Test: `packages/api/test/admin-etl-routes.test.ts` (new or extend existing — assert the route returns 410 Gone or is absent) + +**Approach:** +- Two options, both acceptable: + - **Remove the route entirely.** Anyone calling it gets a 404. Cleanest. Recommended if no automation depends on it. + - **Replace the route body with a 410 Gone response** that links to the runbook (added in U15) and the new sweep design from U5. Use if there's any concern about external automation calling it. +- Existing endpoint logic at `:384-409` does `UPDATE etl_jobs SET status='failed' WHERE status='running' AND started_at < now() - interval '30 minutes'`. This is the SQL that wrongly failed the 7 jobs on 2026-05-14. +- This unit ships before U5 lands the replacement, so for a short window there is no automated sweep at all. Acceptable because stuck-job recovery in that window is operational (U15 runbook documents the manual SQL). + +**Patterns to follow:** +- Existing admin route removal pattern (none in repo as of this writing); fall back to standard Elysia route definition omission. + +**Test scenarios:** +- Happy path: `POST /admin/etl/reset-stuck` returns 410 (or 404 if removed) — test asserts on the chosen behavior. +- Edge case: Admin CLI subcommand for the old endpoint no longer exists (or returns a clear "removed, see runbook" message). + +**Verification:** +- `bun test:api` passes with the new assertion. +- Manual `curl` against dev Worker returns the chosen status code. + +--- + +### U5. P1 #2 fix: progress-based stuck-job sweep + +**Goal:** Replace the wall-clock-based sweep with one that uses `last_progress_at` so healthy long jobs (e.g., 50,100-row `evo` file) are not falsely failed. + +**Requirements:** R3 + +**Dependencies:** U1 (for `last_progress_at`), U2 (for the `last_progress_at` write-on-progress), U4 (so the old sweep is gone first) + +**Files:** +- Create: `packages/api/src/services/etl/sweepStuckJobs.ts` (the sweep function — pure DB logic, no HTTP) +- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (new `POST /admin/etl/sweep-stuck` endpoint that calls `sweepStuckJobs` and returns the affected rows; for manual triggering) +- Modify: `packages/api/wrangler.jsonc` (declare a CF Cron Trigger for the sweep, e.g., `*/5 * * * *`) +- Modify: `packages/api/src/index.ts` (add `scheduled()` handler that invokes `sweepStuckJobs` on the cron event; if a `scheduled` handler doesn't yet exist, add one) +- Test: `packages/api/test/etl-stuck-job-sweep.test.ts` (new) + +**Approach:** +- Sweep runs inside a single `db.transaction()`: + 1. `UPDATE etl_jobs SET status='failed', completed_at = now() WHERE status='running' AND COALESCE(last_progress_at, started_at) < now() - interval '15 minutes' RETURNING id, source, filename, started_at, last_progress_at, chunks_total, chunks_completed`. (The `COALESCE` defends against any legacy row that somehow escaped the U1 backfill.) + 2. For each returned row, `INSERT INTO etl_dlq_events (job_id, error_message, source) VALUES ($1, 'sweep:no_progress', 'sweep')` so the forensic table is the single source of truth for *every* failed transition — whether triggered by the consumer DLQ or by the sweep. `chunk_index = NULL` in sweep-sourced events. +- Returned rows also feed a Sentry warning event per affected job (`level: warning`, tags `{ jobId, source: 'sweep' }`, extra includes `chunks_completed/chunks_total` so the operator immediately sees how far the job got). +- 15-minute interval matches the CF Queue consumer wall-clock cap. Any chunk making real progress writes `last_progress_at = now()` (via U2's modification to `updateEtlJobProgress`), so this only catches truly stalled jobs. +- CF Cron Trigger every 5 minutes (configurable via env if needed). The cron handler is idempotent — the partial index from U1 keeps the query cheap even at thousands of jobs. Wrangler config shape: `"triggers": { "crons": ["*/5 * * * *"] }` — top-level `triggers` object wrapping a `crons` array, not a bare top-level `crons` key. +- Manual admin endpoint exists for on-demand sweep — useful during incident response. + +**Patterns to follow:** +- Admin route structure at `packages/api/src/routes/admin/analytics/catalog.ts` for the new endpoint. +- CF Cron Triggers config in `wrangler.jsonc` (the repo has none today — this is the first; reference ). + +**Test scenarios:** +- Happy path: Insert a job with `status='running'`, `last_progress_at = now() - 30min` → sweep flips it to `failed`. +- Edge case: Insert a job with `status='running'`, `last_progress_at = now() - 5min` → sweep leaves it alone (within budget). +- Edge case: Insert a job with `last_progress_at = NULL` (somehow — legacy row that escaped backfill) → COALESCE the column with `started_at` in the WHERE clause so it still gets evaluated. +- Edge case: 50,100-row job in progress — chunks write `last_progress_at = now()` every 100 rows → sweep never fires on it. +- Integration: Cron-event simulation calls the same code path as the admin endpoint; both return identical results for the same DB state. +- Error path: Sweep query fails (DB down) → caller observes the error; Sentry captures; cron does not silently mask. + +**Verification:** +- After running the sweep against a DB with the seeded test cases, exactly the long-stalled rows are affected. +- `bun test:api` includes the new test and passes. +- Dev cron schedule fires (`wrangler dev --test-scheduled`) and exercises the handler. + +--- + +### U6. P1 #1 fix: shared chunking helper + retry endpoint + repair-from-scratch endpoint + +**Goal:** Both retry and repair use the same producer chunking logic. The repair endpoint creates a brand-new `etl_jobs` row linked to the broken historical one — directly enabling the operational recovery of the 7 wrongly-`failed` jobs from 2026-05-14. + +**Requirements:** R5, R8 + +**Dependencies:** U1 (for `superseded_by_job_id`), U2 (for `chunks_total` write semantics) + +**Files:** +- Create: `packages/api/src/services/etl/chunkCsvForR2.ts` (extracted shared helper: takes `objectKey`, returns an array of `{ chunkIndex, chunksTotal, byteStart, byteEnd }` with newline-aligned boundaries — newline alignment itself ships in U7) +- Modify: `packages/api/src/routes/catalog/index.ts` (replace inline chunking at `:253-271` with a call to `chunkCsvForR2`) +- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (rewrite `POST /admin/etl/:jobId/retry` at `:413-470` to use `chunkCsvForR2`; add new `POST /admin/etl/:jobId/repair-from-scratch`) +- Modify: `packages/api/src/services/etl/queue.ts` (extend `queueCatalogETL` to accept pre-computed chunks rather than constructing them — or accept either, with the chunk-construction path migrating to the shared helper) +- Modify: `packages/cli/src/commands/admin/etl.ts` (add `retry ` subcommand if not present, plus new `repair-from-scratch ` subcommand) +- Test: `packages/api/test/etl-retry-repair.test.ts` (new) + +**Approach:** +- `chunkCsvForR2(objectKey, r2, options?)`: signature returns `Promise`. Calls `r2.head(objectKey)`, splits into 20 MB windows. Newline-alignment lives in U7 but the shape lands here so U7 is a fill-in. +- Retry endpoint (`POST /admin/etl/:jobId/retry`): looks up `(source, filename, scraperRevision)` from the existing job, generates a fresh `jobId`, INSERTs a new `etl_jobs` row with `chunks_total = chunkCsvForR2(...).length`, sets `superseded_by_job_id = ` on the new row only if the original is `failed`, sends batch. +- Repair-from-scratch (`POST /admin/etl/:jobId/repair-from-scratch`): same behavior as retry but always sets `superseded_by_job_id` and `superseded_at = now()` on the new row, and always re-reads the full file (even if the original was `completed`). Use case: an operator suspects a `completed` job is undercount; repair recreates from scratch. +- **R2 ETag verification (failure-closed)**: before creating the new job row, both endpoints call `r2.head(objectKey)` and compare the returned `etag` (and `lastModified`) against the original job's recorded values. If the original job has no `etag` stored (legacy rows), require an explicit `?force=true` query flag. If the `etag` differs (source was overwritten by a later scrape), return 409 Conflict with a clear message naming both etags — never silently re-ingest a different file under the same path. (This implies adding `source_etag text` and `source_last_modified timestamp` to `etl_jobs` — fold into U1's column list if not already, or capture as a follow-up here.) +- Both endpoints accept an optional `?dryRun=true` query that returns the planned chunk spec without enqueuing anything — operator preview. +- The 7 historical jobs from 2026-05-14 will be recovered by calling repair-from-scratch on each of them once Phase 1+2 ships. U15 runbook documents the operator procedure including the ETag verification step. + +**Patterns to follow:** +- Admin route structure at `packages/api/src/routes/admin/analytics/catalog.ts:178-235` for response shape. +- Existing retry endpoint at `:413-470` for the basic flow (just don't replicate the broken single-chunk behavior). + +**Test scenarios:** +- Happy path: Retry a failed job with a 50 MB source file → 3 chunks created via `chunkCsvForR2`, 3 messages sent, new `etl_jobs` row has `chunks_total = 3`, `superseded_by_job_id` matches original. +- Happy path: Repair-from-scratch a `completed` job with apparent undercount → new job created with `superseded_by_job_id` set; original row untouched. +- Edge case: Retry a single-chunk legacy job (file size < 20 MB) → 1 chunk, `chunks_total = 1`, behaves identically to the producer endpoint. +- Edge case: Retry on a job whose `filename` no longer exists in R2 → endpoint returns 404 with a clear message; no new `etl_jobs` row. +- Edge case: `?dryRun=true` returns the planned chunk spec; no DB writes, no queue sends. +- Integration: Repair-from-scratch on a 50,100-row file (the `evo` case) produces the expected ~3 chunks, all enqueued, and after the full pipeline completes the new job's `total_processed` matches the file's actual row count. +- Covers AE: the 7 jobs from 2026-05-14 can each be repaired by calling repair-from-scratch — verified manually post-deploy. + +**Verification:** +- Both endpoints documented in the OpenAPI spec emitted by `@elysiajs/openapi`. +- CLI subcommands invoke the endpoints with proper auth. +- `bun test:api` passes the new integration test. + +--- + +### U7. P1 #3 + P1 #4 + P1 #5 fix: row-boundary-aligned chunks + robust header injection + +**Goal:** No row is silently dropped, invalidated, or split across chunks. Wide-CSV headers (>4 KB) fail loudly instead of silently misaligning columns. + +**Requirements:** R4, R6 + +**Dependencies:** U6 (for `chunkCsvForR2`) + +**Files:** +- Modify: `packages/api/src/services/etl/chunkCsvForR2.ts` (implement newline alignment — for each 20 MB window, read the next 64 KB tail, find the last `\n`, snap `byteEnd` to the byte before that newline) +- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (remove `skipPartialRow` at `:95-108`; rewrite header injection at `:50-58` with a bounded expand loop 4K→16K→64K; throw a typed error if no newline in 64 KB) +- Test: `packages/api/test/etl-chunk-boundaries.test.ts` (new) + +**Approach:** +- Newline alignment in producer: + - For each chunk window `[start, start + 20MB)`: + - Read `[start + 20MB - 64KB, start + 20MB)`. + - Find the index of the last `\n` in that slice. + - If found: `byteEnd = (start + 20MB - 64KB) + lastNewlineIndex`. The next chunk's `byteStart = byteEnd + 1`. + - If not found in 64 KB (extremely unlikely with normal CSV row sizes): throw `ChunkBoundaryError` immediately, surfacing to Sentry and aborting the job creation. Caller is told the file has a row larger than 64 KB. + - Last chunk: `byteEnd = file.size - 1`. +- Header re-fetch in consumer (for `chunkIndex > 0`): + ```text + let headerSlice = await r2.get(key, { range: { offset: 0, length: 4096 }}).then(b => b.text()); + let nlIdx = headerSlice.indexOf('\n'); + if (nlIdx === -1) { + headerSlice = await r2.get(key, { range: { offset: 0, length: 16384 }}).then(b => b.text()); + nlIdx = headerSlice.indexOf('\n'); + } + if (nlIdx === -1) { + headerSlice = await r2.get(key, { range: { offset: 0, length: 65536 }}).then(b => b.text()); + nlIdx = headerSlice.indexOf('\n'); + } + if (nlIdx === -1) throw new EtlHeaderError(`No newline in first 64 KB of ${key} — malformed header`); + const headerRow = headerSlice.slice(0, nlIdx); + ``` +- Since chunks are now newline-aligned, `skipPartialRow` is no longer needed — the consumer can stream the chunk body directly into the parser after prepending the header. +- BOM handling: if the first byte of the header slice is `0xEF 0xBB 0xBF`, strip it before extracting the header row. Same treatment for the first chunk. + +**Patterns to follow:** +- R2 byte-range read pattern at `packages/api/src/services/etl/processCatalogEtl.ts:54, 71`. +- Typed-error pattern: extend whatever the repo uses for domain errors (typically `Error` subclasses in `packages/api/src/utils/errors.ts`). + +**Test scenarios:** +- Happy path: 5 MB file, 1 chunk → no boundary logic exercised; row count matches actual. +- Happy path: 60 MB file, 3 chunks; rows of varying width; all `byteEnd` values land immediately before a `\n`; total row count across chunks = file row count. +- Edge case: Chunk boundary lands exactly on a newline character (`source[byteEnd] === '\n'`) → still aligned; next chunk starts on next row; no dropped row. +- Edge case: Header row of 4500 bytes (just over 4 KB) → re-fetch expands to 16 KB, succeeds; columns mapped correctly. +- Edge case: Header row of 50 KB (one absurdly wide CSV) → re-fetch expands to 64 KB, succeeds. +- Edge case: BOM at start of file → stripped from header extraction in both chunk-0 and re-fetch paths. +- Error path: File with no newline in first 64 KB → throws `EtlHeaderError`; job marked `failed` via DLQ (U3). +- Error path: Row larger than 64 KB encountered at a chunk boundary → producer throws `ChunkBoundaryError`; no job created. +- Integration: A real CSV from prod (anonymized fixture in `packages/api/test/fixtures/`) splits into multiple chunks; sum of consumer-reported `totalProcessed` across all chunks equals `wc -l fixture.csv - 1` (subtract header). +- Covers AE: A 50,100-row file (the `evo` shape) ingested via the new chunking logic shows `total_processed = 50100`, `total_valid + total_invalid = 50100`, no missing rows. + +**Verification:** +- Manual run on a real prod fixture file with `wc -l` cross-check matches the job's `total_processed`. +- `bun test:api` passes the new fixture-driven test. +- Sentry catches the malformed-header case during the next dev exercise. + +--- + +### U8. Sentry wiring via `@sentry/cloudflare` + +**Goal:** Every uncaught exception in the API Worker — including queue-consumer paths — emits a Sentry event with structured tags. Operators can debug a stuck job without paging through raw Worker logs. + +**Requirements:** R9 + +**Dependencies:** None (independent; can start in parallel with Phase 1 but lands in Phase 3) + +**Files:** +- Modify: `packages/api/package.json` (add `@sentry/cloudflare` dependency; pin to a specific version) +- Modify: `packages/api/src/index.ts` (wrap the Worker default export with `Sentry.withSentry({...}, { fetch, queue })`; pass the Sentry options factory that reads `env.SENTRY_DSN`) +- Modify: `packages/api/src/utils/env-validation.ts` (no schema change — `SENTRY_DSN` is already declared at `:9, 94`; verify it's required vs optional and adjust accordingly so dev doesn't break without a DSN) +- Modify: `packages/api/wrangler.jsonc` (add `upload_source_maps: true` at the top level) +- Modify: `packages/api/src/services/etl/queue.ts` (fill in the `Sentry.captureException(...)` call site that U3 stubbed) +- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (Sentry breadcrumbs at chunk-start, batch-flush, and chunk-end; `Sentry.startSpan` around the chunk lifecycle) +- Create: `packages/api/src/utils/logger.ts` (the thin structured logger — accepts `LogContext`, emits JSON-prefixed `console.log` lines, also calls `Sentry.addBreadcrumb` when Sentry is initialized) +- Modify: All `packages/api/src/services/etl/*.ts` console calls migrated to `logger.{info,warn,error}` (mechanical change — sweeps across the ETL files) +- Test: `packages/api/test/sentry-instrumentation.test.ts` (new — mocks `@sentry/cloudflare` and asserts captureException/breadcrumb call shape) + +**Approach:** +- `withSentry({ fetch, queue })` wraps the existing default export at `packages/api/src/index.ts`. The Sentry options factory reads `env.SENTRY_DSN`, `env.ENVIRONMENT`, sets `tracesSampleRate: 0.1`. +- Queue consumer instrumentation per : + - `Sentry.startSpan({ op: 'queue.process', name: 'etl-chunk', attributes: { 'messaging.message.id': msg.id, 'messaging.message.retry.count': msg.attempts, 'jobId': msg.body.id, 'chunkIndex': msg.body.data.chunkIndex } }, async () => { ... })`. + - `Sentry.captureException(err, { tags: { jobId, chunkIndex, r2Key }, contexts: { queue: { messageId, attempts } } })` inside the catch. +- DLQ consumer (from U3) gets the same treatment. +- `logger.ts`: ~30 lines. Functions: `info(event, ctx)`, `warn(event, ctx)`, `error(event, ctx, err?)`. Emits a JSON line; if Sentry is initialized, also calls `Sentry.addBreadcrumb({ category: event, data: ctx, level })`. +- Source maps: `upload_source_maps: true` works with Wrangler 4.x and `compatibility_date: 2025-06-01`. + +**Patterns to follow:** +- No existing Sentry initialization in `packages/api` — this is the first. +- Reference Sentry-in-CF guidance: . + +**Test scenarios:** +- Happy path: Successful chunk → one `startSpan` invocation, breadcrumbs at chunk-start/flush/end, no `captureException`. +- Error path: Chunk throws → `captureException` called once with expected tags; span marks status `internal_error`. +- Edge case: `SENTRY_DSN` empty (dev without secret) → no Sentry calls fire; logger still emits lines; no crash. +- Edge case: Logger called before Sentry initialized (cold-start race) → graceful no-op on breadcrumb path; logger.info still emits the line. +- Integration: A real Sentry test project receives events from `bun api` dev-server when forced failures are triggered. + +**Verification:** +- Dev `bun api` cold start logs the Sentry init line. +- A forced chunk failure produces a Sentry event visible in the project. +- All `packages/api/src/services/etl/*.ts` files have zero `console.*` references (`grep -rn 'console\.' packages/api/src/services/etl/` returns nothing). + +--- + +### U9. P2 #2 + P2 #3 + P2 #4 fix: error propagation + embedding-failure observability + IIFE error handling + +**Goal:** Three related but smaller correctness issues that all share the theme "errors should not vanish silently." + +**Requirements:** R2, R10 + +**Dependencies:** U1 (for `total_embedding_failures`), U8 (so the new error sites can `Sentry.captureException`) + +**Files:** +- Modify: `packages/api/src/services/etl/processLogsBatch.ts` (rethrow on DB failure at `:25-27`; remove the swallow) +- Modify: `packages/api/src/services/etl/processValidItemsBatch.ts` (in the embedding-fallback path at `:52-63`, atomically increment `etl_jobs.total_embedding_failures` before upserting; surface a Sentry warning event with `jobId` and the affected SKU count; do not throw) +- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (wrap the writer IIFE at `:89-117` in an explicit promise: `const writerPromise = (async () => { ... })().catch(err => parser.destroy(err)); ...; await writerPromise.catch(err => { throw err })` so unhandled rejections become outer-flow throws) +- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (extend the admin job-list response to include `totalEmbeddingFailures` so dashboards can surface degradation) +- Test: `packages/api/test/etl-error-propagation.test.ts` (new) + +**Approach:** +- `processLogsBatch`: catch block currently logs and returns. Replace with `throw err`. The outer `processCatalogETL` catch already exists and the chunk will retry/DLQ correctly via U3. +- Embedding fallback: at `processValidItemsBatch.ts:52-63`, on `generateManyEmbeddings` throw: + ```text + await db.update(etlJobs).set({ totalEmbeddingFailures: sql`COALESCE(${etlJobs.totalEmbeddingFailures}, 0) + ${items.length}` }).where(eq(etlJobs.id, jobId)); + logger.warn('etl.embedding.fallback', { jobId, skuCount: items.length }); + Sentry.captureMessage('etl.embedding.fallback', { level: 'warning', tags: { jobId }, extra: { skuCount: items.length } }); + // continue with upsert; embedding stays NULL + ``` +- IIFE wrap pattern: + ```text + const writerPromise = (async () => { ... })() + .catch(err => { parser.destroy(err); throw err; }); + // ... for await loop ... + await writerPromise; + ``` + Any rejection in the writer now propagates to the outer try/catch in `processCatalogETL` and triggers retry/DLQ via U3. +- Admin response extension: extend the existing `GET /admin/analytics/catalog/etl` route's select shape to include `totalEmbeddingFailures` and update the response Zod schema if one is declared. + +**Patterns to follow:** +- Atomic update idiom at `packages/api/src/services/etl/updateEtlJobProgress.ts:16-23`. +- Admin route response shape at `packages/api/src/routes/admin/analytics/catalog.ts:178-235`. + +**Test scenarios:** +- Happy path (embedding fallback): Embedding service throws → SKUs upserted with `embedding=NULL`; `total_embedding_failures` increments by exactly `items.length`; Sentry warning event fires once per batch (not per SKU). +- Happy path (logs rethrow): `processLogsBatch` DB INSERT fails → exception propagates to outer catch → chunk retried by CF Queue. +- Happy path (IIFE wrap): Writer throws inside the async IIFE → parser destroyed; outer `for await` loop terminates; outer catch fires; chunk retried. +- Edge case: Multiple consecutive embedding batches in one chunk all fall back → counter increments cumulatively; Sentry warnings fire once per batch, not once per chunk. +- Edge case: Mixed batch — some SKUs embed, then fallback kicks in for the rest → counter increments only for the failed batch's SKU count. +- Integration: Admin endpoint response includes `totalEmbeddingFailures` field for every job; the prod-shape dashboard query still parses cleanly. + +**Verification:** +- New test passes with the rethrow / wrap / counter increments in place. +- `bun test:api` overall green. +- Dev admin endpoint `GET /admin/analytics/catalog/etl?limit=5` returns the new field. + +--- + +### U10. Reconciliation: admin endpoint + automatic post-job verification (via dedicated queue) + CLI subcommand + +**Goal:** Every ETL completion writes a verification row count; operators can also trigger reconciliation on any job on demand. Surfaces the user's "missing or falsely labeling" concern as a first-class observable signal. Auto-reconciliation runs on its own queue, not via `ctx.waitUntil`, so multi-GB files do not exceed the queue invocation's 15-min wall-clock. + +**Requirements:** R7 + +**Dependencies:** U1 (for `verified_at`, `verified_row_count`, `verified_row_count_partial`), U2 (for the completion transition that enqueues the reconcile message), U8 (for Sentry warnings on delta) + +**Files:** +- Create: `packages/api/src/services/etl/reconcileJob.ts` (pure function: given a `jobId` and optional `resumeFromByte`, stream the R2 source in 100 MB byte-range windows, count newlines, checkpoint progress, finalize verification on EOF, return delta) +- Create: `packages/api/src/services/etl/processReconcileBatch.ts` (queue consumer for `packrat-etl-reconcile-queue`; calls `reconcileJob`; handles retry/resume) +- Modify: `packages/api/src/services/etl/queue.ts` (extend producer to enqueue reconcile messages; type `ReconcileMessage { jobId: string; resumeFromByte?: number }`) +- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (on the final-chunk completion transition from U2, enqueue a `ReconcileMessage` to `packrat-etl-reconcile-queue` *inside the same transaction* as the status flip so a row can never transition to `completed` without an enqueued reconcile) +- Modify: `packages/api/src/index.ts` (extend the `queue()` switch with an arm for `packrat-etl-reconcile-queue` and `packrat-etl-reconcile-queue-dev`) +- Modify: `packages/api/wrangler.jsonc` (declare `packrat-etl-reconcile-queue` and `packrat-etl-reconcile-queue-dev` as producer + consumer with its own `dead_letter_queue: 'packrat-etl-dlq'` and `max_retries: 3`) +- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (add `POST /admin/etl/:jobId/reconcile` — calls `reconcileJob` synchronously; for small/medium files returns inline; for large files returns 202 Accepted and enqueues to the reconcile queue with the existing job id) +- Modify: `packages/cli/src/commands/admin/etl.ts` (add `reconcile ` subcommand) +- Modify: admin list endpoint response shape (include `verifiedAt`, `verifiedRowCount`, and `verifiedRowCountPartial` so the dashboard surfaces it) +- Test: `packages/api/test/etl-reconciliation.test.ts` (new) + +**Approach:** +- `reconcileJob(jobId, resumeFromByte = 0)`: + 1. Read `(filename, total_processed, verified_at, verified_row_count_partial)` from `etl_jobs`. If `verified_at IS NOT NULL`, return early — idempotent no-op for redelivered messages. + 2. `r2.head(key)` → `fileSize`. + 3. From `resumeFromByte` (or `verified_row_count_partial`'s checkpoint byte position if set), read 100 MB byte-range windows. For each window: + - Count `\n` bytes in the window. + - Add to running `lineCount`. + - On the last window, subtract 1 for the header row. + - Every 5 windows (500 MB processed) or when elapsed time > 10 min: `UPDATE etl_jobs SET verified_row_count_partial = $lineCount` (checkpoint), then throw a typed `ReconcileResumeError` carrying the current byte offset so the queue retry re-enqueues with `resumeFromByte` advanced. Wall-clock budget reset. + 4. On EOF: `UPDATE etl_jobs SET verified_at = now(), verified_row_count = $lineCount, verified_row_count_partial = NULL WHERE id = $1 AND verified_at IS NULL` (idempotency gate). + 5. Compute `delta = lineCount - total_processed`. If `abs(delta) > max(10, ceil(0.01 * lineCount))`: `Sentry.captureMessage('etl.reconciliation.delta', { level: 'warning', tags: { jobId }, extra: { delta, expected: lineCount, actual: total_processed } })`. + 6. Return `{ jobId, expectedRowCount: lineCount, actualRowCount: total_processed, delta, withinThreshold }`. +- `processReconcileBatch` (queue consumer): + - For each message: try `reconcileJob(msg.jobId, msg.resumeFromByte)` → on success `ack()`. On `ReconcileResumeError`: enqueue a new message with the advanced offset and `ack()` the current one. On any other error: `retry({ delaySeconds: 60 })`. +- Auto-trigger: in U2's completion transaction, after the status flip to `completed`, enqueue `{ jobId, resumeFromByte: 0 }` to `packrat-etl-reconcile-queue`. Because both writes are in the same transaction, a row can never be `completed` without an enqueued reconcile. +- Manual endpoint (`POST /admin/etl/:jobId/reconcile`): + - For files where `fileSize < 200 MB`: call `reconcileJob` synchronously and return the result inline. + - For files ≥ 200 MB: enqueue to `packrat-etl-reconcile-queue` and return 202 with a "poll the job for `verified_at`" message. + - Optional `?force=true` query: clear `verified_at` first and re-enqueue (operator override for a re-verify). +- CLI subcommand: `packrat-admin etl reconcile ` → wraps the endpoint, polls until `verifiedAt` is set or timeout. +- The 7 historical jobs from 2026-05-14 can each be reconciled retroactively via this endpoint *before* deciding to repair (U6). Confirms the suspicion that they processed partial data before being swept. + +**Patterns to follow:** +- Queue consumer pattern from U3 (per-message ack/retry, DLQ wired). +- Streaming-count pattern: `for await (const chunk of body)` and accumulate `chunk.filter(byte => byte === 0x0A).length`. + +**Test scenarios:** +- Happy path: Job with `total_processed = 100`, R2 file has 101 lines (100 rows + header) → delta = 0; `verified_at` set; no Sentry warning. +- Happy path: Job with `total_processed = 1000`, R2 file has 1006 lines (1005 rows + header) → delta = 5; within threshold; no warning. +- Edge case: Job with `total_processed = 50000`, R2 file has 50100 lines + header → delta = 100; threshold = `max(10, 500)` = 500; within threshold; no warning. (The 50,100 case stays informational.) +- Edge case (the real case): Job with `total_processed = 400`, R2 file has 50101 lines (50100 rows + header) — what the `campmor`-shape failures looked like → delta = 49700; way over threshold; Sentry warning fires. +- Edge case (resume): A 1.5 GB file forces three resume-error checkpoints; each resume picks up at the right byte offset; final `verified_row_count` matches the true row count. +- Edge case (idempotency): A redelivered reconcile message with `resumeFromByte = 0` against a job that already has `verified_at` set — `reconcileJob` returns early without re-reading the file. +- Error path: R2 object missing → `reconcileJob` throws a typed error; queue consumer retries with backoff; after exhausting `max_retries: 3`, the DLQ captures it. +- Edge case: Job with `total_processed = NULL` (legacy stuck-job-sweep casualty) → reconcileJob computes delta as `expected - 0 = expected`; the warning carries useful context for diagnosing the historical job. +- Integration: Auto-verify fires exactly once per job, enqueued atomically with the completion transition; it does not fire for intermediate chunk completions; it does not fire twice on a redelivered final chunk (idempotency comes from the `etl_job_chunks` gate in U2). + +**Verification:** +- New test passes. +- Calling the endpoint on a real dev-seeded job returns the documented shape (inline for small files, 202 + queued for large). +- The chunk-completion transaction either commits both the status flip and the reconcile enqueue, or neither (verify with a forced enqueue failure mid-transaction). + +--- + +### U11. Quality-of-life: scheduler.wait, BATCH_SIZE rename, mergeBySku log aggregation + +**Goal:** Three tiny correctness/cleanliness wins that share a maintenance flavor and ship together. + +**Requirements:** R9 (log volume), and audit P2 #5, P2 #6, P3 #1 + +**Dependencies:** U8 (for the logger surface used by the aggregated merge summary) + +**Files:** +- Modify: `packages/api/src/services/etl/processCatalogEtl.ts:120` (replace `setTimeout(resolve, 1)` with `await scheduler.wait(0)`) +- Create: `packages/api/src/services/etl/constants.ts` (new — exports `ITEM_FLUSH_BATCH_SIZE = 100` and `CF_QUEUE_BATCH_SIZE = 100`) +- Modify: `packages/api/src/services/etl/processCatalogEtl.ts:13` and `packages/api/src/services/etl/queue.ts:17` (import from the new constants module instead of declaring inline) +- Modify: `packages/api/src/services/etl/mergeItemsBySku.ts:34-48` (replace per-SKU `console.log` with a per-batch summary `logger.info('etl.merge.summary', { jobId, mergedSkuCount, totalChangedFields })`) +- Test: `packages/api/test/etl-yield-and-constants.test.ts` (new — minimal; mostly behavior-preservation) + +**Approach:** +- `await scheduler.wait(0)` is the documented Workers Scheduler API. `scheduler.yield()` does not exist (corrected from audit P2 #5). +- The constants module is dead-simple — two exports — but the rename surfaces intent at the call site and ends the ambiguity the audit flagged at P2 #6. +- The mergeBySku aggregation accumulates change counts across one batch (already a natural unit) and logs once at the end. No per-SKU lines. + +**Patterns to follow:** +- Module organization mirrors `packages/api/src/services/etl/types.ts` for a constants file. + +**Test scenarios:** +- Behavior preservation: A 10,000-row chunk completes at least as fast as before with `scheduler.wait(0)` (regression check, not a strict assertion). +- Happy path (merge log): A batch with 50 SKU merges → exactly one log line emitted, summarizing the batch. +- Edge case: A batch with 0 merges → no log line. + +**Verification:** +- `grep -rn "setTimeout\(.*1.*\)" packages/api/src/services/etl/` returns nothing. +- `grep -rn "BATCH_SIZE\s*=" packages/api/src/services/etl/` returns only the new constants. +- A real ETL run on dev with 1k duplicate SKUs produces 1 merge summary line, not 1000. + +--- + +### U12. Validator hardening: URL scheme + length caps + SKU charset + +**Goal:** Eliminate the audit P3 #2 attack surface — `javascript:` URLs and oversize fields cannot enter the catalog. + +**Requirements:** R11 + +**Dependencies:** None (independent; can land any time after Phase 1) + +**Files:** +- Modify: `packages/api/src/services/etl/CatalogItemValidator.ts` (rewrite `isValidUrl` at `:60-67`; add length caps and SKU regex) +- Test: `packages/api/test/etl-validator.test.ts` (new or extend existing) + +**Approach:** +- `isValidUrl`: parse with `new URL()`; reject any scheme other than `http:` and `https:`. Reject URLs longer than 2048 chars. +- Length caps (rejects, not truncates): `name ≤ 500`, `description ≤ 50000`, `brand ≤ 200`, `category ≤ 200`. +- SKU regex: `/^[A-Za-z0-9_.\-\/]+$/`; max length 200. +- Each rejection produces a structured invalid-item log entry with the specific reason — surfaces in the existing `/admin/etl/:jobId/failures` endpoint. + +**Patterns to follow:** +- Existing validator structure at `packages/api/src/services/etl/CatalogItemValidator.ts`. +- Invalid-log shape at `packages/api/src/services/etl/processLogsBatch.ts`. + +**Test scenarios:** +- Happy path: Valid `https://example.com/product/123` URL accepted. +- Error path: `javascript:alert(1)` URL rejected with reason `INVALID_URL_SCHEME`. +- Error path: `mailto:foo@bar` rejected with `INVALID_URL_SCHEME`. +- Error path: URL of 3000 chars rejected with `URL_TOO_LONG`. +- Edge case: Name of exactly 500 chars accepted; 501 chars rejected. +- Edge case: SKU `ABC-123_/test.sku` accepted; SKU `