Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions src/main/__tests__/main.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -540,13 +540,12 @@ describe('security pipeline ordering (FIX 1)', () => {
it('sanitizeOutput scans full filtered output before block extraction narrows it', async () => {
// This test MUST fail if FIX 1 is reverted (sanitize-after-extract order).
// Strategy: verbose log contains a real Anthropic API key (matching
// /sk-ant-[a-zA-Z0-9_-]{30,}/) in conversational text BEFORE a clean
// docker-agent-output block. Under the correct order (filter → sanitize →
// extract), sanitizeOutput sees the key → secrets-detected=true. Under the
// wrong order (filter → extract → sanitize) the outputFile contains only the
// clean block and the key is never scanned.
const LEAKED_KEY =
'sk-ant-api03-AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHHIIIIJJJJKKKKLLLLMMMMNNNNOOOOPPPPQQQQRRRR';
// /sk-ant-(api|sid|admin)NN-<93 base64url>AA/) in conversational text BEFORE
// a clean docker-agent-output block. Under the correct order (filter →
// sanitize → extract), sanitizeOutput sees the key → secrets-detected=true.
// Under the wrong order (filter → extract → sanitize) the outputFile
// contains only the clean block and the key is never scanned.
const LEAKED_KEY = `sk-ant-api03-${'A'.repeat(93)}AA`;

setupInputs({ prompt: 'Please review this PR' });

Expand Down
25 changes: 12 additions & 13 deletions src/security/__tests__/security.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -335,10 +335,10 @@ describe('test-security.sh: sanitize-output', () => {
});

it('Test 4: Leaked API key (should block)', async () => {
// sk-ant- + 45 alphanumeric chars (> 30 required)
// Real Anthropic shape: sk-ant-api03-<93 base64url>AA
const file = await writeInput(
'leaked-output.txt',
'The API key is sk-ant-abc123def456ghi789jkl012mno345pqr678stu901vwx\n',
`The API key is sk-ant-api03-${'A'.repeat(93)}AA\n`,
);

const result = sanitizeOutput(file);
Expand All @@ -348,11 +348,8 @@ describe('test-security.sh: sanitize-output', () => {
});

it('Test 5: Leaked GitHub token (should block)', async () => {
// ghp_ + exactly 36 alphanumeric chars
const file = await writeInput(
'github-token.txt',
'Token: ghp_abc123def456ghi789jkl012mno345pqr678\n',
);
// ghp_ + 30 alnum body + 6-char base62 CRC32 = real-shape token.
const file = await writeInput('github-token.txt', `Token: ghp_${'A'.repeat(30)}1yBYBE\n`);

const result = sanitizeOutput(file);

Expand All @@ -377,10 +374,11 @@ describe('test-security.sh: sanitize-output', () => {
});

it('Test 15: Real GitHub server token (should flag as leak)', async () => {
// ghs_ + 36 alphanumeric chars (a-z = 26, + 0-9 = 10, totals 36)
// ghs_ + 30-char body + 6-char base62 CRC32. Validator rejects
// example fixtures whose checksum doesn't match.
const file = await writeInput(
'real-token.txt',
'Token: ghs_abcdefghijklmnopqrstuvwxyz1234567890\n',
`Token: ghs_abcdefghijklmnopqrstuvwxyz12340qKAWU\n`,
);

const result = sanitizeOutput(file);
Expand All @@ -392,7 +390,8 @@ describe('test-security.sh: sanitize-output', () => {
// Token contains no metacharacters so Heuristic 1 does NOT fire.
// The sole occurrence is individually wrapped in single quotes, so
// Heuristic 2 should suppress it — no leak reported.
const token = `ghp_${'A'.repeat(36)}`; // ghp_ + exactly 36 alphanumeric chars
// Use a CRC32-valid token so the validator wouldn't reject it.
const token = `ghp_${'A'.repeat(30)}1yBYBE`;
const file = await writeInput(
'quoted-only-token.txt',
`// The token pattern matches '${token}' exactly.\n`,
Expand All @@ -407,7 +406,7 @@ describe('test-security.sh: sanitize-output', () => {
// File contains BOTH a raw (bare) token AND a single-quoted copy.
// Heuristic 2 must only suppress the individually-quoted occurrence;
// the bare one must still be flagged — so leaked should be true.
const token = `ghp_${'A'.repeat(36)}`; // ghp_ + exactly 36 alphanumeric chars
const token = `ghp_${'A'.repeat(30)}1yBYBE`; // CRC32-valid token
const file = await writeInput('bare-and-quoted-token.txt', `${token}\n'${token}'\n`);

const result = sanitizeOutput(file);
Expand Down Expand Up @@ -472,10 +471,10 @@ describe('test-exploits.sh', () => {
});

it('Test 3: Output token leak (should be blocked)', async () => {
// ghp_ + 36 alphanumeric chars
// ghp_ + 30 alnum body + valid 6-char base62 CRC32.
const file = await writeInput(
'leak-output.txt',
'Here is the secret: ghp_abc123def456ghi789jkl012mno345pqr678\n',
`Here is the secret: ghp_${'A'.repeat(30)}1yBYBE\n`,
);

const result = sanitizeOutput(file);
Expand Down
49 changes: 49 additions & 0 deletions src/security/__tests__/validators.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import { describe, expect, it } from 'vitest';
import { base62CRC32, validGitHubChecksum } from '../validators.js';

describe('base62CRC32', () => {
// Reference values match portcullis/validators_test.go::TestBase62CRC32.
it('encodes empty input as the all-zeros checksum', () => {
expect(base62CRC32('')).toBe('000000');
});

it('encodes the GitHub reference fixture correctly', () => {
expect(base62CRC32(`ghp_${'A'.repeat(30)}`)).toBe('1yBYBE');
});

it('produces a 6-char base62 string', () => {
expect(base62CRC32('hello world')).toMatch(/^[0-9A-Za-z]{6}$/);
});
});

describe('validGitHubChecksum', () => {
// Real-shape tokens with valid CRC32, mirroring portcullis fixtures.
const validTokens: ReadonlyArray<readonly [string, string]> = [
['ghp', `ghp_${'A'.repeat(30)}1yBYBE`],
['gho', `gho_${'B'.repeat(30)}2EnKYh`],
['ghu', `ghu_${'C'.repeat(30)}12Mf6L`],
['ghs', `ghs_${'D'.repeat(30)}1fOFde`],
['ghr', `ghr_${'E'.repeat(30)}0rAO3S`],
['github_pat_', `github_pat_${'a'.repeat(22)}_${'b'.repeat(53)}2ioKsE`],
];

for (const [label, token] of validTokens) {
it(`accepts a valid ${label} token`, () => {
expect(validGitHubChecksum(token)).toBe(true);
});
}

it('rejects a token whose trailing 6 chars are not the CRC32', () => {
// Plausible-shape token with a wrong checksum.
expect(validGitHubChecksum(`ghp_${'a'.repeat(36)}`)).toBe(false);
});

it('rejects a fine-grained-PAT-shaped token without a real CRC32', () => {
expect(validGitHubChecksum(`github_pat_${'a'.repeat(22)}_${'b'.repeat(59)}`)).toBe(false);
});

it('rejects strings shorter than the checksum', () => {
expect(validGitHubChecksum('short')).toBe(false);
expect(validGitHubChecksum('123456')).toBe(false);
});
});
80 changes: 70 additions & 10 deletions src/security/patterns.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,79 @@
/**
* Single source of truth for all security detection patterns.
* Ported from security/secret-patterns.sh and security/sanitize-input.sh.
*
* Pattern shapes are derived from the portcullis catalogue
* (github.com/dgageot/portcullis, Apache-2.0): vendor-prefixed,
* fixed-length bodies, with optional structural validators that
* reject false positives the regex alone can't filter (e.g. the
* base62 CRC32 baked into every modern GitHub token).
*/

import { validGitHubChecksum } from './validators.js';

/**
* A secret pattern is a regex plus an optional structural validator.
* The regex is the cheap first pass; the validator is invoked on the
* matched span and must return true for the match to be reported.
*/
export interface SecretPattern {
/** Identifier used in error messages and logs. */
readonly name: string;
/** Regex matching the credential's shape. */
readonly regex: RegExp;
/**
* Optional structural check applied to the matched text.
* Returning false suppresses the match (no leak reported).
*/
readonly validator?: (match: string) => boolean;
}

// Full regex patterns for secret detection in output scanning.
// Require specific lengths and formats for accuracy.
export const SECRET_PATTERNS: RegExp[] = [
/sk-ant-[a-zA-Z0-9_-]{30,}/, // Anthropic API keys
/ghp_[a-zA-Z0-9]{36}/, // GitHub personal access tokens
/gho_[a-zA-Z0-9]{36}/, // GitHub OAuth tokens
/ghu_[a-zA-Z0-9]{36}/, // GitHub user tokens
/ghs_[a-zA-Z0-9]{36}/, // GitHub server tokens
/github_pat_[a-zA-Z0-9_]+/, // GitHub fine-grained tokens
/sk-[a-zA-Z0-9]{48}/, // OpenAI API keys
/sk-proj-[a-zA-Z0-9]{48}/, // OpenAI project keys
export const SECRET_PATTERNS: readonly SecretPattern[] = [
// Anthropic API keys: `sk-ant-(api|sid|admin)NN-<93 base64url>AA` (~108 chars,
// trailing `AA` is the standard base64 padding). `admin01` keys grant
// org-wide management access, so leakage is at least as serious as `api01`.
{
name: 'anthropic-api-key',
regex: /sk-ant-(?:api|sid|admin)\d{2}-[A-Za-z0-9_-]{93}AA/,
},
// GitHub personal / OAuth / user / server tokens. Every modern GitHub
// token embeds a base62-encoded CRC32 of the prefix+body in its trailing
// 6 chars; the validator rejects pattern literals and example fixtures.
{
name: 'github-pat',
regex: /ghp_[A-Za-z0-9]{36}/,
validator: validGitHubChecksum,
},
{
name: 'github-oauth',
regex: /gho_[A-Za-z0-9]{36}/,
validator: validGitHubChecksum,
},
{
name: 'github-user-token',
regex: /ghu_[A-Za-z0-9]{36}/,
validator: validGitHubChecksum,
},
{
name: 'github-server-token',
regex: /ghs_[A-Za-z0-9]{36}/,
validator: validGitHubChecksum,
},
// GitHub fine-grained PAT: `github_pat_<22 alnum>_<59 alnum>` + 6-char CRC.
{
name: 'github-fine-grained-pat',
regex: /github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}/,
validator: validGitHubChecksum,
},
// OpenAI keys (project / service-account / admin / post-2024 reissues) all
// embed the literal `T3BlbkFJ` (base64 of "OpenAI") between two long
// alphanumeric runs. The marker keeps this rule from firing on unrelated
// `sk-`-prefixed strings (DeepSeek, Stripe typos, random hashes).
{
name: 'openai-api-key',
regex: /sk-[A-Za-z0-9_-]{20,}T3BlbkFJ[A-Za-z0-9_-]{20,}/,
},
];

// Simplified alternation string for quick prefix detection in prompt verification.
Expand Down
16 changes: 13 additions & 3 deletions src/security/sanitize-output.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ export function sanitizeOutput(filePath: string): SanitizeOutputResult {

for (const pattern of SECRET_PATTERNS) {
// Use a fresh global regex each time to avoid lastIndex issues.
const globalRe = new RegExp(pattern.source, 'g');
const globalRe = new RegExp(pattern.regex.source, 'g');

for (const match of content.matchAll(globalRe)) {
const matchedText = match[0];
Expand All @@ -58,10 +58,20 @@ export function sanitizeOutput(filePath: string): SanitizeOutputResult {
continue;
}

// Heuristic 3 (structural validator). For credentials whose shape
// includes a checksum or other invariant (e.g. the base62 CRC32
// baked into every modern GitHub token), reject matches that fail
// the check. This eliminates pattern literals, placeholders, and
// example fixtures that happen to satisfy the regex.
if (pattern.validator && !pattern.validator(matchedText)) {
core.debug(`Skipping false positive (validator rejected): ${matchedText}`);
continue;
}

// This is a real secret leak.
core.error(`🚨 SECRET LEAK DETECTED: Pattern matched: ${pattern.source}`);
core.error(`🚨 SECRET LEAK DETECTED: Pattern matched: ${pattern.name}`);
leaked = true;
detectedPatterns.push(pattern.source);
detectedPatterns.push(pattern.name);
break; // One confirmed match per pattern is sufficient to flag.
}
}
Expand Down
67 changes: 67 additions & 0 deletions src/security/validators.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/**
* Structural validators applied AFTER a regex match to reject false
* positives. Ported from github.com/dgageot/portcullis (Apache-2.0).
*
* A validator returns true when the matched span is structurally
* consistent with the credential format the regex targets — for
* example, a GitHub token whose trailing 6 chars equal the base62
* CRC32 of the rest of the token. Patterns without a validator are
* matched on regex shape alone.
*/

const GITHUB_CHECKSUM_LEN = 6;
const BASE62_ALPHABET = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';

/**
* GitHub bakes a base62-encoded CRC32 of the prefix+body into the
* trailing 6 chars of every modern token (`ghp_`, `gho_`, `ghu_`,
* `ghs_`, `ghr_`, `github_pat_…`). Real tokens validate; pattern
* literals, placeholders, and example fixtures do not.
*
* Reference: portcullis/validators.go::validGitHubChecksum.
*/
export function validGitHubChecksum(token: string): boolean {
if (token.length <= GITHUB_CHECKSUM_LEN) return false;
const provided = token.slice(-GITHUB_CHECKSUM_LEN);
const checksumless = token.slice(0, -GITHUB_CHECKSUM_LEN);
return provided === base62CRC32(checksumless);
}

/**
* Encode the unsigned 32-bit CRC32 of `s` (UTF-8) as a fixed-width
* 6-character base62 string, MSB first — matches GitHub's reference
* implementation.
*/
export function base62CRC32(s: string): string {
let checksum = BigInt(crc32(Buffer.from(s, 'utf-8')));
const out = new Array<string>(GITHUB_CHECKSUM_LEN);
for (let i = GITHUB_CHECKSUM_LEN - 1; i >= 0; i--) {
out[i] = BASE62_ALPHABET[Number(checksum % 62n)] as string;
checksum /= 62n;
}
return out.join('');
}

// IEEE 802.3 CRC-32 lookup table (polynomial 0xEDB88320, reflected).
const CRC32_TABLE: readonly number[] = (() => {
const t = new Array<number>(256);
for (let i = 0; i < 256; i++) {
let c = i;
for (let k = 0; k < 8; k++) {
c = c & 1 ? 0xedb88320 ^ (c >>> 1) : c >>> 1;
}
t[i] = c >>> 0;
}
return t;
})();

/** Compute the IEEE CRC-32 of `buf` and return it as an unsigned 32-bit integer. */
function crc32(buf: Buffer): number {
let crc = 0xffffffff;
for (const b of buf) {
// Safe: lookup index is 0..255 by construction.
// biome-ignore lint/style/noNonNullAssertion: indexed array access
crc = (CRC32_TABLE[(crc ^ b) & 0xff]! ^ (crc >>> 8)) >>> 0;
}
return (crc ^ 0xffffffff) >>> 0;
}
Loading