From 500322bec4ae20f6e40a18b8e5080e33397cdc2f Mon Sep 17 00:00:00 2001 From: David Gageot Date: Tue, 26 May 2026 13:41:24 +0200 Subject: [PATCH] security: validate token checksums to reduce false positives --- src/main/__tests__/main.integration.test.ts | 13 ++-- src/security/__tests__/security.test.ts | 25 ++++--- src/security/__tests__/validators.test.ts | 49 +++++++++++++ src/security/patterns.ts | 80 ++++++++++++++++++--- src/security/sanitize-output.ts | 16 ++++- src/security/validators.ts | 67 +++++++++++++++++ 6 files changed, 217 insertions(+), 33 deletions(-) create mode 100644 src/security/__tests__/validators.test.ts create mode 100644 src/security/validators.ts diff --git a/src/main/__tests__/main.integration.test.ts b/src/main/__tests__/main.integration.test.ts index a8e6e46..20da2bd 100644 --- a/src/main/__tests__/main.integration.test.ts +++ b/src/main/__tests__/main.integration.test.ts @@ -540,13 +540,12 @@ describe('security pipeline ordering (FIX 1)', () => { it('sanitizeOutput scans full filtered output before block extraction narrows it', async () => { // This test MUST fail if FIX 1 is reverted (sanitize-after-extract order). // Strategy: verbose log contains a real Anthropic API key (matching - // /sk-ant-[a-zA-Z0-9_-]{30,}/) in conversational text BEFORE a clean - // docker-agent-output block. Under the correct order (filter → sanitize → - // extract), sanitizeOutput sees the key → secrets-detected=true. Under the - // wrong order (filter → extract → sanitize) the outputFile contains only the - // clean block and the key is never scanned. - const LEAKED_KEY = - 'sk-ant-api03-AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHHIIIIJJJJKKKKLLLLMMMMNNNNOOOOPPPPQQQQRRRR'; + // /sk-ant-(api|sid|admin)NN-<93 base64url>AA/) in conversational text BEFORE + // a clean docker-agent-output block. Under the correct order (filter → + // sanitize → extract), sanitizeOutput sees the key → secrets-detected=true. + // Under the wrong order (filter → extract → sanitize) the outputFile + // contains only the clean block and the key is never scanned. + const LEAKED_KEY = `sk-ant-api03-${'A'.repeat(93)}AA`; setupInputs({ prompt: 'Please review this PR' }); diff --git a/src/security/__tests__/security.test.ts b/src/security/__tests__/security.test.ts index 4abdbde..5d81129 100644 --- a/src/security/__tests__/security.test.ts +++ b/src/security/__tests__/security.test.ts @@ -335,10 +335,10 @@ describe('test-security.sh: sanitize-output', () => { }); it('Test 4: Leaked API key (should block)', async () => { - // sk-ant- + 45 alphanumeric chars (> 30 required) + // Real Anthropic shape: sk-ant-api03-<93 base64url>AA const file = await writeInput( 'leaked-output.txt', - 'The API key is sk-ant-abc123def456ghi789jkl012mno345pqr678stu901vwx\n', + `The API key is sk-ant-api03-${'A'.repeat(93)}AA\n`, ); const result = sanitizeOutput(file); @@ -348,11 +348,8 @@ describe('test-security.sh: sanitize-output', () => { }); it('Test 5: Leaked GitHub token (should block)', async () => { - // ghp_ + exactly 36 alphanumeric chars - const file = await writeInput( - 'github-token.txt', - 'Token: ghp_abc123def456ghi789jkl012mno345pqr678\n', - ); + // ghp_ + 30 alnum body + 6-char base62 CRC32 = real-shape token. + const file = await writeInput('github-token.txt', `Token: ghp_${'A'.repeat(30)}1yBYBE\n`); const result = sanitizeOutput(file); @@ -377,10 +374,11 @@ describe('test-security.sh: sanitize-output', () => { }); it('Test 15: Real GitHub server token (should flag as leak)', async () => { - // ghs_ + 36 alphanumeric chars (a-z = 26, + 0-9 = 10, totals 36) + // ghs_ + 30-char body + 6-char base62 CRC32. Validator rejects + // example fixtures whose checksum doesn't match. const file = await writeInput( 'real-token.txt', - 'Token: ghs_abcdefghijklmnopqrstuvwxyz1234567890\n', + `Token: ghs_abcdefghijklmnopqrstuvwxyz12340qKAWU\n`, ); const result = sanitizeOutput(file); @@ -392,7 +390,8 @@ describe('test-security.sh: sanitize-output', () => { // Token contains no metacharacters so Heuristic 1 does NOT fire. // The sole occurrence is individually wrapped in single quotes, so // Heuristic 2 should suppress it — no leak reported. - const token = `ghp_${'A'.repeat(36)}`; // ghp_ + exactly 36 alphanumeric chars + // Use a CRC32-valid token so the validator wouldn't reject it. + const token = `ghp_${'A'.repeat(30)}1yBYBE`; const file = await writeInput( 'quoted-only-token.txt', `// The token pattern matches '${token}' exactly.\n`, @@ -407,7 +406,7 @@ describe('test-security.sh: sanitize-output', () => { // File contains BOTH a raw (bare) token AND a single-quoted copy. // Heuristic 2 must only suppress the individually-quoted occurrence; // the bare one must still be flagged — so leaked should be true. - const token = `ghp_${'A'.repeat(36)}`; // ghp_ + exactly 36 alphanumeric chars + const token = `ghp_${'A'.repeat(30)}1yBYBE`; // CRC32-valid token const file = await writeInput('bare-and-quoted-token.txt', `${token}\n'${token}'\n`); const result = sanitizeOutput(file); @@ -472,10 +471,10 @@ describe('test-exploits.sh', () => { }); it('Test 3: Output token leak (should be blocked)', async () => { - // ghp_ + 36 alphanumeric chars + // ghp_ + 30 alnum body + valid 6-char base62 CRC32. const file = await writeInput( 'leak-output.txt', - 'Here is the secret: ghp_abc123def456ghi789jkl012mno345pqr678\n', + `Here is the secret: ghp_${'A'.repeat(30)}1yBYBE\n`, ); const result = sanitizeOutput(file); diff --git a/src/security/__tests__/validators.test.ts b/src/security/__tests__/validators.test.ts new file mode 100644 index 0000000..d84b082 --- /dev/null +++ b/src/security/__tests__/validators.test.ts @@ -0,0 +1,49 @@ +import { describe, expect, it } from 'vitest'; +import { base62CRC32, validGitHubChecksum } from '../validators.js'; + +describe('base62CRC32', () => { + // Reference values match portcullis/validators_test.go::TestBase62CRC32. + it('encodes empty input as the all-zeros checksum', () => { + expect(base62CRC32('')).toBe('000000'); + }); + + it('encodes the GitHub reference fixture correctly', () => { + expect(base62CRC32(`ghp_${'A'.repeat(30)}`)).toBe('1yBYBE'); + }); + + it('produces a 6-char base62 string', () => { + expect(base62CRC32('hello world')).toMatch(/^[0-9A-Za-z]{6}$/); + }); +}); + +describe('validGitHubChecksum', () => { + // Real-shape tokens with valid CRC32, mirroring portcullis fixtures. + const validTokens: ReadonlyArray = [ + ['ghp', `ghp_${'A'.repeat(30)}1yBYBE`], + ['gho', `gho_${'B'.repeat(30)}2EnKYh`], + ['ghu', `ghu_${'C'.repeat(30)}12Mf6L`], + ['ghs', `ghs_${'D'.repeat(30)}1fOFde`], + ['ghr', `ghr_${'E'.repeat(30)}0rAO3S`], + ['github_pat_', `github_pat_${'a'.repeat(22)}_${'b'.repeat(53)}2ioKsE`], + ]; + + for (const [label, token] of validTokens) { + it(`accepts a valid ${label} token`, () => { + expect(validGitHubChecksum(token)).toBe(true); + }); + } + + it('rejects a token whose trailing 6 chars are not the CRC32', () => { + // Plausible-shape token with a wrong checksum. + expect(validGitHubChecksum(`ghp_${'a'.repeat(36)}`)).toBe(false); + }); + + it('rejects a fine-grained-PAT-shaped token without a real CRC32', () => { + expect(validGitHubChecksum(`github_pat_${'a'.repeat(22)}_${'b'.repeat(59)}`)).toBe(false); + }); + + it('rejects strings shorter than the checksum', () => { + expect(validGitHubChecksum('short')).toBe(false); + expect(validGitHubChecksum('123456')).toBe(false); + }); +}); diff --git a/src/security/patterns.ts b/src/security/patterns.ts index 048bbb8..2d3cecd 100644 --- a/src/security/patterns.ts +++ b/src/security/patterns.ts @@ -1,19 +1,79 @@ /** * Single source of truth for all security detection patterns. * Ported from security/secret-patterns.sh and security/sanitize-input.sh. + * + * Pattern shapes are derived from the portcullis catalogue + * (github.com/dgageot/portcullis, Apache-2.0): vendor-prefixed, + * fixed-length bodies, with optional structural validators that + * reject false positives the regex alone can't filter (e.g. the + * base62 CRC32 baked into every modern GitHub token). */ +import { validGitHubChecksum } from './validators.js'; + +/** + * A secret pattern is a regex plus an optional structural validator. + * The regex is the cheap first pass; the validator is invoked on the + * matched span and must return true for the match to be reported. + */ +export interface SecretPattern { + /** Identifier used in error messages and logs. */ + readonly name: string; + /** Regex matching the credential's shape. */ + readonly regex: RegExp; + /** + * Optional structural check applied to the matched text. + * Returning false suppresses the match (no leak reported). + */ + readonly validator?: (match: string) => boolean; +} + // Full regex patterns for secret detection in output scanning. -// Require specific lengths and formats for accuracy. -export const SECRET_PATTERNS: RegExp[] = [ - /sk-ant-[a-zA-Z0-9_-]{30,}/, // Anthropic API keys - /ghp_[a-zA-Z0-9]{36}/, // GitHub personal access tokens - /gho_[a-zA-Z0-9]{36}/, // GitHub OAuth tokens - /ghu_[a-zA-Z0-9]{36}/, // GitHub user tokens - /ghs_[a-zA-Z0-9]{36}/, // GitHub server tokens - /github_pat_[a-zA-Z0-9_]+/, // GitHub fine-grained tokens - /sk-[a-zA-Z0-9]{48}/, // OpenAI API keys - /sk-proj-[a-zA-Z0-9]{48}/, // OpenAI project keys +export const SECRET_PATTERNS: readonly SecretPattern[] = [ + // Anthropic API keys: `sk-ant-(api|sid|admin)NN-<93 base64url>AA` (~108 chars, + // trailing `AA` is the standard base64 padding). `admin01` keys grant + // org-wide management access, so leakage is at least as serious as `api01`. + { + name: 'anthropic-api-key', + regex: /sk-ant-(?:api|sid|admin)\d{2}-[A-Za-z0-9_-]{93}AA/, + }, + // GitHub personal / OAuth / user / server tokens. Every modern GitHub + // token embeds a base62-encoded CRC32 of the prefix+body in its trailing + // 6 chars; the validator rejects pattern literals and example fixtures. + { + name: 'github-pat', + regex: /ghp_[A-Za-z0-9]{36}/, + validator: validGitHubChecksum, + }, + { + name: 'github-oauth', + regex: /gho_[A-Za-z0-9]{36}/, + validator: validGitHubChecksum, + }, + { + name: 'github-user-token', + regex: /ghu_[A-Za-z0-9]{36}/, + validator: validGitHubChecksum, + }, + { + name: 'github-server-token', + regex: /ghs_[A-Za-z0-9]{36}/, + validator: validGitHubChecksum, + }, + // GitHub fine-grained PAT: `github_pat_<22 alnum>_<59 alnum>` + 6-char CRC. + { + name: 'github-fine-grained-pat', + regex: /github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}/, + validator: validGitHubChecksum, + }, + // OpenAI keys (project / service-account / admin / post-2024 reissues) all + // embed the literal `T3BlbkFJ` (base64 of "OpenAI") between two long + // alphanumeric runs. The marker keeps this rule from firing on unrelated + // `sk-`-prefixed strings (DeepSeek, Stripe typos, random hashes). + { + name: 'openai-api-key', + regex: /sk-[A-Za-z0-9_-]{20,}T3BlbkFJ[A-Za-z0-9_-]{20,}/, + }, ]; // Simplified alternation string for quick prefix detection in prompt verification. diff --git a/src/security/sanitize-output.ts b/src/security/sanitize-output.ts index 25fb9d6..cb7de7e 100644 --- a/src/security/sanitize-output.ts +++ b/src/security/sanitize-output.ts @@ -33,7 +33,7 @@ export function sanitizeOutput(filePath: string): SanitizeOutputResult { for (const pattern of SECRET_PATTERNS) { // Use a fresh global regex each time to avoid lastIndex issues. - const globalRe = new RegExp(pattern.source, 'g'); + const globalRe = new RegExp(pattern.regex.source, 'g'); for (const match of content.matchAll(globalRe)) { const matchedText = match[0]; @@ -58,10 +58,20 @@ export function sanitizeOutput(filePath: string): SanitizeOutputResult { continue; } + // Heuristic 3 (structural validator). For credentials whose shape + // includes a checksum or other invariant (e.g. the base62 CRC32 + // baked into every modern GitHub token), reject matches that fail + // the check. This eliminates pattern literals, placeholders, and + // example fixtures that happen to satisfy the regex. + if (pattern.validator && !pattern.validator(matchedText)) { + core.debug(`Skipping false positive (validator rejected): ${matchedText}`); + continue; + } + // This is a real secret leak. - core.error(`🚨 SECRET LEAK DETECTED: Pattern matched: ${pattern.source}`); + core.error(`🚨 SECRET LEAK DETECTED: Pattern matched: ${pattern.name}`); leaked = true; - detectedPatterns.push(pattern.source); + detectedPatterns.push(pattern.name); break; // One confirmed match per pattern is sufficient to flag. } } diff --git a/src/security/validators.ts b/src/security/validators.ts new file mode 100644 index 0000000..6787682 --- /dev/null +++ b/src/security/validators.ts @@ -0,0 +1,67 @@ +/** + * Structural validators applied AFTER a regex match to reject false + * positives. Ported from github.com/dgageot/portcullis (Apache-2.0). + * + * A validator returns true when the matched span is structurally + * consistent with the credential format the regex targets — for + * example, a GitHub token whose trailing 6 chars equal the base62 + * CRC32 of the rest of the token. Patterns without a validator are + * matched on regex shape alone. + */ + +const GITHUB_CHECKSUM_LEN = 6; +const BASE62_ALPHABET = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; + +/** + * GitHub bakes a base62-encoded CRC32 of the prefix+body into the + * trailing 6 chars of every modern token (`ghp_`, `gho_`, `ghu_`, + * `ghs_`, `ghr_`, `github_pat_…`). Real tokens validate; pattern + * literals, placeholders, and example fixtures do not. + * + * Reference: portcullis/validators.go::validGitHubChecksum. + */ +export function validGitHubChecksum(token: string): boolean { + if (token.length <= GITHUB_CHECKSUM_LEN) return false; + const provided = token.slice(-GITHUB_CHECKSUM_LEN); + const checksumless = token.slice(0, -GITHUB_CHECKSUM_LEN); + return provided === base62CRC32(checksumless); +} + +/** + * Encode the unsigned 32-bit CRC32 of `s` (UTF-8) as a fixed-width + * 6-character base62 string, MSB first — matches GitHub's reference + * implementation. + */ +export function base62CRC32(s: string): string { + let checksum = BigInt(crc32(Buffer.from(s, 'utf-8'))); + const out = new Array(GITHUB_CHECKSUM_LEN); + for (let i = GITHUB_CHECKSUM_LEN - 1; i >= 0; i--) { + out[i] = BASE62_ALPHABET[Number(checksum % 62n)] as string; + checksum /= 62n; + } + return out.join(''); +} + +// IEEE 802.3 CRC-32 lookup table (polynomial 0xEDB88320, reflected). +const CRC32_TABLE: readonly number[] = (() => { + const t = new Array(256); + for (let i = 0; i < 256; i++) { + let c = i; + for (let k = 0; k < 8; k++) { + c = c & 1 ? 0xedb88320 ^ (c >>> 1) : c >>> 1; + } + t[i] = c >>> 0; + } + return t; +})(); + +/** Compute the IEEE CRC-32 of `buf` and return it as an unsigned 32-bit integer. */ +function crc32(buf: Buffer): number { + let crc = 0xffffffff; + for (const b of buf) { + // Safe: lookup index is 0..255 by construction. + // biome-ignore lint/style/noNonNullAssertion: indexed array access + crc = (CRC32_TABLE[(crc ^ b) & 0xff]! ^ (crc >>> 8)) >>> 0; + } + return (crc ^ 0xffffffff) >>> 0; +}