From 035d95678701fc09835156c5cdb94a1444a0d5a0 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 10:02:09 +0100 Subject: [PATCH 01/17] feat(07-01): add scanNameRegex and scanValuePatterns to APIKeyScanner - Add nameRegexPatterns (41 patterns: 34 provider keywords + 7 generic credential terms) - Add valuePatterns (10 entries covering OpenAI, HuggingFace, GitHub token formats) - Implement scanNameRegex(): flags env vars with provider/credential names not in HighRiskEnvKeys - Implement scanValuePatterns(): flags env vars whose values match known provider key formats - Wire both methods into Scan() after scanEnvKeys() - Values are read only for emptiness check (scanNameRegex) or prefix+length match (scanValuePatterns); never stored in findings --- internal/scan/apikeys.go | 169 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index 0724f74..dd0a9fb 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -4,13 +4,85 @@ import ( "fmt" "os" "path/filepath" + "regexp" "sort" + "strings" "github.com/Pringled/agentcheck/internal/config" "github.com/Pringled/agentcheck/internal/fsutil" "github.com/Pringled/agentcheck/internal/models" ) +// nameRegexPatterns is compiled once at package init. It matches env var names that suggest +// they hold credentials for known providers or generic secret terms. +// Case-insensitive match on the full variable name. 
+var nameRegexPatterns = []*regexp.Regexp{ + // Provider keywords + regexp.MustCompile(`(?i)OPENAI`), + regexp.MustCompile(`(?i)ANTHROPIC`), + regexp.MustCompile(`(?i)COHERE`), + regexp.MustCompile(`(?i)MISTRAL`), + regexp.MustCompile(`(?i)REPLICATE`), + regexp.MustCompile(`(?i)HUGGINGFACE`), + regexp.MustCompile(`(?i)HF_`), + regexp.MustCompile(`(?i)TOGETHER`), + regexp.MustCompile(`(?i)GROQ`), + regexp.MustCompile(`(?i)VOYAGE`), + regexp.MustCompile(`(?i)ELEVEN_LABS`), + regexp.MustCompile(`(?i)PINECONE`), + regexp.MustCompile(`(?i)STRIPE`), + regexp.MustCompile(`(?i)BRAINTREE`), + regexp.MustCompile(`(?i)PAYPAL`), + regexp.MustCompile(`(?i)SQUARE`), + regexp.MustCompile(`(?i)TWILIO`), + regexp.MustCompile(`(?i)SENDGRID`), + regexp.MustCompile(`(?i)MAILGUN`), + regexp.MustCompile(`(?i)SLACK`), + regexp.MustCompile(`(?i)DISCORD`), + regexp.MustCompile(`(?i)OKTA`), + regexp.MustCompile(`(?i)AUTH0`), + regexp.MustCompile(`(?i)DATADOG`), + regexp.MustCompile(`(?i)SENTRY`), + regexp.MustCompile(`(?i)VERCEL`), + regexp.MustCompile(`(?i)NETLIFY`), + regexp.MustCompile(`(?i)CLOUDFLARE`), + regexp.MustCompile(`(?i)HEROKU`), + regexp.MustCompile(`(?i)RAILWAY`), + regexp.MustCompile(`(?i)FLY`), + regexp.MustCompile(`(?i)GITHUB`), + regexp.MustCompile(`(?i)GITLAB`), + regexp.MustCompile(`(?i)BITBUCKET`), + // Generic credential terms + regexp.MustCompile(`(?i)API_KEY`), + regexp.MustCompile(`(?i)API_TOKEN`), + regexp.MustCompile(`(?i)SECRET_KEY`), + regexp.MustCompile(`(?i)AUTH_TOKEN`), + regexp.MustCompile(`(?i)ACCESS_TOKEN`), + regexp.MustCompile(`(?i)PRIVATE_KEY`), + regexp.MustCompile(`(?i)SERVICE_KEY`), +} + +// valuePattern describes a provider key format recognisable by prefix and exact total length. +type valuePattern struct { + prefix string + totalLen int + providerTag string // used to build description, e.g. "OpenAI project" +} + +// valuePatterns is compiled (constructed) once at package init. 
+var valuePatterns = []valuePattern{ + {prefix: "sk-", totalLen: 51, providerTag: "OpenAI legacy"}, + {prefix: "sk-proj-", totalLen: 56, providerTag: "OpenAI project"}, + {prefix: "sk-admin-", totalLen: 57, providerTag: "OpenAI admin"}, + {prefix: "hf_", totalLen: 37, providerTag: "HuggingFace"}, + {prefix: "ghp_", totalLen: 40, providerTag: "GitHub classic PAT"}, + {prefix: "github_pat_", totalLen: 93, providerTag: "GitHub fine-grained PAT"}, + {prefix: "gho_", totalLen: 40, providerTag: "GitHub OAuth token"}, + {prefix: "ghu_", totalLen: 40, providerTag: "GitHub user token"}, + {prefix: "ghs_", totalLen: 40, providerTag: "GitHub app installation token"}, + {prefix: "ghr_", totalLen: 40, providerTag: "GitHub refresh token"}, +} + // credentialFiles is the list of credential files/dirs to check. var credentialFiles = []config.CredentialFile{ {Path: "~/.config/gcloud/", Label: "GCP application default credentials"}, @@ -62,6 +134,8 @@ func (s *APIKeyScanner) Name() string { return "api_keys" } func (s *APIKeyScanner) Scan() models.ScanResult { var findings []models.Finding findings = append(findings, s.scanEnvKeys()...) + findings = append(findings, s.scanNameRegex()...) + findings = append(findings, s.scanValuePatterns()...) findings = append(findings, s.scanCredentialFiles()...) return models.ScanResult{ ScannerName: "api_keys", @@ -110,6 +184,101 @@ func (s *APIKeyScanner) scanEnvKeys() []models.Finding { return findings } +// scanNameRegex checks env var names against known provider keywords and generic +// credential terms. It catches non-standard names like MY_OPENAI_KEY that are +// missed by the exact-match HighRiskEnvKeys pass. Key names only are reported; +// values are checked only for emptiness and then discarded. 
+func (s *APIKeyScanner) scanNameRegex() []models.Finding { + var findings []models.Finding + seen := make(map[string]bool) + + for _, entry := range os.Environ() { + idx := strings.IndexByte(entry, '=') + if idx < 0 { + continue + } + name := entry[:idx] + value := entry[idx+1:] + + // Skip if already covered by the exact-match HighRiskEnvKeys pass. + if HighRiskEnvKeys[name] { + continue + } + // Skip if value is empty — key exists but no credential is set. + if value == "" { + continue + } + // Guard against duplicate findings (a name appears at most once in os.Environ, + // but be defensive in case of unexpected duplicates). + if seen[name] { + continue + } + + for _, re := range nameRegexPatterns { + if re.MatchString(name) { + seen[name] = true + findings = append(findings, models.Finding{ + Scanner: "api_keys", + Resource: name, // key name only, never the value + Severity: models.SeverityHigh, + Description: "Can be used to make authenticated API calls.", + }) + break + } + } + } + + return findings +} + +// scanValuePatterns reads env var values to match against known provider prefixes. +// NOTE: unlike scanEnvKeys and scanNameRegex, this method reads the actual value. +// Values are used only for prefix+length pattern matching and then discarded immediately. +// No value is stored in findings, logs, or returned data structures. +// This is a deliberate, scoped relaxation of the "values never read" contract. +func (s *APIKeyScanner) scanValuePatterns() []models.Finding { + var findings []models.Finding + seen := make(map[string]bool) + + for _, entry := range os.Environ() { + idx := strings.IndexByte(entry, '=') + if idx < 0 { + continue + } + name := entry[:idx] + value := entry[idx+1:] + + // Skip if already covered by the exact-match HighRiskEnvKeys pass. + if HighRiskEnvKeys[name] { + continue + } + // Skip empty values. + if value == "" { + continue + } + // Dedup by name: emit at most one finding per variable name. 
+ if seen[name] { + continue + } + + for _, p := range valuePatterns { + if strings.HasPrefix(value, p.prefix) && len(value) == p.totalLen { + seen[name] = true + findings = append(findings, models.Finding{ + Scanner: "api_keys", + Resource: name, // env var NAME, never the value + Severity: models.SeverityHigh, + Description: fmt.Sprintf("Value matches %s API key format.", p.providerTag), + }) + break // one finding per variable name + } + } + // value goes out of scope here; it is not stored anywhere + } + + return findings +} + // scanCredentialFiles checks built-in and extra credential file paths for existence. // File paths only are reported; file contents are never read or stored. func (s *APIKeyScanner) scanCredentialFiles() []models.Finding { From 6da8341d2f6cc7c659312368593eadd7dc747879 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 10:03:51 +0100 Subject: [PATCH 02/17] feat(07-01): add tests for scanNameRegex and scanValuePatterns Name-regex tests (TestAPIKeyScanner_NameRegex_*): - ProviderKeyword: MY_OPENAI_KEY flagged by provider keyword match - GenericTerm: INTERNAL_API_KEY flagged by generic credential term - NoDuplicateWithBuiltin: OPENAI_API_KEY produces exactly 1 finding - EmptyValueNotFlagged: empty-value vars not reported - ValueNotInFindings: secret value never appears in any finding field Value-pattern tests (TestAPIKeyScanner_ValuePattern_*): - OpenAIProject: sk-proj- prefix + 48 chars detected with provider tag - HuggingFace: hf_ prefix + 34 chars detected with HuggingFace tag - GitHub_ClassicPAT: ghp_ prefix + 36 chars detected with GitHub tag - NoMatchWrongLength: correct prefix but wrong length not flagged - BuiltinSkipped: HighRiskEnvKeys key produces exactly 1 finding --- internal/scan/apikeys_test.go | 193 ++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index 90cc396..761cc3b 100644 --- a/internal/scan/apikeys_test.go +++ 
b/internal/scan/apikeys_test.go @@ -3,6 +3,7 @@ package scan_test import ( "os" "path/filepath" + "strings" "testing" "github.com/Pringled/agentcheck/internal/config" @@ -308,3 +309,195 @@ func TestAPIKeyScanner_ExtraCredentialFiles_TildeExpanded(t *testing.T) { assertResource(t, result.Findings, tokenFile) } + +// ── Name-regex tests ────────────────────────────────────────────────────────── + +// TestAPIKeyScanner_NameRegex_ProviderKeyword verifies that an env var with a +// provider keyword in its name (MY_OPENAI_KEY) is flagged even though it is not +// in HighRiskEnvKeys. +func TestAPIKeyScanner_NameRegex_ProviderKeyword(t *testing.T) { + t.Setenv("MY_OPENAI_KEY", "sk-something") + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "MY_OPENAI_KEY") +} + +// TestAPIKeyScanner_NameRegex_GenericTerm verifies that an env var containing a +// generic credential term (INTERNAL_API_KEY) is flagged. +func TestAPIKeyScanner_NameRegex_GenericTerm(t *testing.T) { + t.Setenv("INTERNAL_API_KEY", "secret") + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "INTERNAL_API_KEY") +} + +// TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin verifies that a key already in +// HighRiskEnvKeys (OPENAI_API_KEY) produces exactly ONE finding — scanEnvKeys() gets it +// and scanNameRegex() skips it. +func TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin(t *testing.T) { + t.Setenv("OPENAI_API_KEY", "sk-test") + // Clear all built-in keys except OPENAI_API_KEY. 
+ for k := range scan.HighRiskEnvKeys { + if k != "OPENAI_API_KEY" { + t.Setenv(k, "") + } + } + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + count := 0 + for _, f := range result.Findings { + if f.Resource == "OPENAI_API_KEY" { + count++ + } + } + if count != 1 { + t.Errorf("expected exactly 1 finding for OPENAI_API_KEY, got %d", count) + } +} + +// TestAPIKeyScanner_NameRegex_EmptyValueNotFlagged verifies that an env var whose +// name matches but whose value is empty produces NO finding. +func TestAPIKeyScanner_NameRegex_EmptyValueNotFlagged(t *testing.T) { + t.Setenv("MY_ANTHROPIC_TOKEN", "") + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + for _, f := range result.Findings { + if f.Resource == "MY_ANTHROPIC_TOKEN" { + t.Error("got unexpected finding for MY_ANTHROPIC_TOKEN with empty value") + } + } +} + +// TestAPIKeyScanner_NameRegex_ValueNotInFindings verifies that the secret value set on +// a name-matched env var does not appear in any field of any finding. +func TestAPIKeyScanner_NameRegex_ValueNotInFindings(t *testing.T) { + const secret = "supersecretvalue" + t.Setenv("MY_SECRET_KEY", secret) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertNoSecretValue(t, result.Findings, secret) +} + +// ── Value-pattern tests ─────────────────────────────────────────────────────── + +// TestAPIKeyScanner_ValuePattern_OpenAIProject verifies that a value matching the +// OpenAI project key format (sk-proj- + 48 chars = 56 total) produces a finding +// with the correct resource name and provider tag in the description. 
+func TestAPIKeyScanner_ValuePattern_OpenAIProject(t *testing.T) { + value := "sk-proj-" + strings.Repeat("a", 48) // total 56 chars + t.Setenv("SOME_AI_CRED", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "SOME_AI_CRED") + for _, f := range result.Findings { + if f.Resource == "SOME_AI_CRED" { + if !strings.Contains(f.Description, "OpenAI project") { + t.Errorf("expected description to contain %q, got %q", "OpenAI project", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_HuggingFace verifies that a value matching the +// HuggingFace token format (hf_ + 34 chars = 37 total) produces a correct finding. +func TestAPIKeyScanner_ValuePattern_HuggingFace(t *testing.T) { + value := "hf_" + strings.Repeat("b", 34) // total 37 chars + // Use a variable name that does NOT match any nameRegex pattern so the finding + // comes from scanValuePatterns (and the HuggingFace provider tag is in the description). + t.Setenv("ML_MODEL_CRED", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "ML_MODEL_CRED") + for _, f := range result.Findings { + if f.Resource == "ML_MODEL_CRED" { + if !strings.Contains(f.Description, "HuggingFace") { + t.Errorf("expected description to contain %q, got %q", "HuggingFace", f.Description) + } + } + } +} + +// TestAPIKeyScanner_ValuePattern_GitHub_ClassicPAT verifies that a value matching the +// GitHub classic PAT format (ghp_ + 36 chars = 40 total) produces a correct finding. 
+func TestAPIKeyScanner_ValuePattern_GitHub_ClassicPAT(t *testing.T) { + value := "ghp_" + strings.Repeat("c", 36) // total 40 chars + t.Setenv("WORK_GH_TOKEN", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "WORK_GH_TOKEN") + for _, f := range result.Findings { + if f.Resource == "WORK_GH_TOKEN" { + if !strings.Contains(f.Description, "GitHub") { + t.Errorf("expected description to contain %q, got %q", "GitHub", f.Description) + } + } + } +} + +// TestAPIKeyScanner_ValuePattern_NoMatchWrongLength verifies that a value with the +// right prefix but wrong length does NOT produce a finding. +func TestAPIKeyScanner_ValuePattern_NoMatchWrongLength(t *testing.T) { + value := "sk-proj-" + strings.Repeat("x", 10) // total 18 chars, wrong length for any pattern + t.Setenv("SOME_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + for _, f := range result.Findings { + if f.Resource == "SOME_KEY" { + t.Errorf("got unexpected finding for SOME_KEY with wrong-length value") + } + } +} + +// TestAPIKeyScanner_ValuePattern_BuiltinSkipped verifies that a key in HighRiskEnvKeys +// whose value also matches a value pattern produces exactly ONE finding (from scanEnvKeys, +// not from scanValuePatterns which skips it). +func TestAPIKeyScanner_ValuePattern_BuiltinSkipped(t *testing.T) { + value := "sk-proj-" + strings.Repeat("z", 48) // total 56 chars — matches OpenAI project pattern + t.Setenv("OPENAI_API_KEY", value) + // Clear all other built-in keys. 
+ for k := range scan.HighRiskEnvKeys { + if k != "OPENAI_API_KEY" { + t.Setenv(k, "") + } + } + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + count := 0 + for _, f := range result.Findings { + if f.Resource == "OPENAI_API_KEY" { + count++ + } + } + if count != 1 { + t.Errorf("expected exactly 1 finding for OPENAI_API_KEY, got %d", count) + } +} From db20f03973ce78b241fc5e05a23857075a88b6f9 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 10:16:42 +0100 Subject: [PATCH 03/17] feat(07-01): expand value patterns, name regex, and fix FLY false positive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add 9 new value patterns: Anthropic (sk-ant-), Stripe live/test secrets (sk_live_/sk_test_), Stripe restricted (rk_live_/rk_test_), GitLab PAT (glpat-), npm granular token (npm_), Groq (gsk_), SendGrid (SG.) - Demote generic sk- (51 chars) to SeverityUncertain — shared by many tools - Add per-pattern severity field to valuePattern struct; scanValuePatterns uses it instead of always emitting SeverityHigh - Reorder valuePatterns so sk-proj- and sk-admin- match before sk- - Expand nameRegexPatterns with 15 new entries: GEMINI, VERTEX, PALM, BEDROCK, AZURE_OPENAI, AZURE_COGNITIVE, RESEND, POSTMARK, SPARKPOST, LINEAR, NOTION, AIRTABLE, SUPABASE, NEON, PLANETSCALE - Fix overbroad FLY pattern → \bFLY_ (word boundary) to avoid false positives on BUTTERFLY_KEY and similar variable names - Fix misleading comment on valuePatterns var (was 'compiled at package init'; it is a plain struct slice, nothing is compiled) - Add 10 new tests covering all new patterns and the FLY boundary fix --- internal/scan/apikeys.go | 77 ++++++++--- internal/scan/apikeys_test.go | 239 ++++++++++++++++++++++++++++++++-- 2 files changed, 288 insertions(+), 28 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index dd0a9fb..d85538a 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ 
-17,7 +17,7 @@ import ( // they hold credentials for known providers or generic secret terms. // Case-insensitive match on the full variable name. var nameRegexPatterns = []*regexp.Regexp{ - // Provider keywords + // AI / ML providers regexp.MustCompile(`(?i)OPENAI`), regexp.MustCompile(`(?i)ANTHROPIC`), regexp.MustCompile(`(?i)COHERE`), @@ -30,28 +30,54 @@ var nameRegexPatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)VOYAGE`), regexp.MustCompile(`(?i)ELEVEN_LABS`), regexp.MustCompile(`(?i)PINECONE`), + // Google AI (Gemini, Vertex AI, PaLM) + regexp.MustCompile(`(?i)GEMINI`), + regexp.MustCompile(`(?i)VERTEX`), + regexp.MustCompile(`(?i)PALM`), + // AWS AI + regexp.MustCompile(`(?i)BEDROCK`), + // Azure AI + regexp.MustCompile(`(?i)AZURE_OPENAI`), + regexp.MustCompile(`(?i)AZURE_COGNITIVE`), + // Payment providers regexp.MustCompile(`(?i)STRIPE`), regexp.MustCompile(`(?i)BRAINTREE`), regexp.MustCompile(`(?i)PAYPAL`), regexp.MustCompile(`(?i)SQUARE`), + // Communication / messaging regexp.MustCompile(`(?i)TWILIO`), regexp.MustCompile(`(?i)SENDGRID`), regexp.MustCompile(`(?i)MAILGUN`), + regexp.MustCompile(`(?i)RESEND`), + regexp.MustCompile(`(?i)POSTMARK`), + regexp.MustCompile(`(?i)SPARKPOST`), regexp.MustCompile(`(?i)SLACK`), regexp.MustCompile(`(?i)DISCORD`), + // Auth / identity regexp.MustCompile(`(?i)OKTA`), regexp.MustCompile(`(?i)AUTH0`), + // Observability regexp.MustCompile(`(?i)DATADOG`), regexp.MustCompile(`(?i)SENTRY`), + // Cloud / hosting platforms regexp.MustCompile(`(?i)VERCEL`), regexp.MustCompile(`(?i)NETLIFY`), regexp.MustCompile(`(?i)CLOUDFLARE`), regexp.MustCompile(`(?i)HEROKU`), regexp.MustCompile(`(?i)RAILWAY`), - regexp.MustCompile(`(?i)FLY`), + regexp.MustCompile(`(?i)\bFLY_`), // word boundary prevents false positives (BUTTERFLY_KEY) + // Source control regexp.MustCompile(`(?i)GITHUB`), regexp.MustCompile(`(?i)GITLAB`), regexp.MustCompile(`(?i)BITBUCKET`), + // Productivity / project tools (common in agent contexts) + 
regexp.MustCompile(`(?i)LINEAR`), + regexp.MustCompile(`(?i)NOTION`), + regexp.MustCompile(`(?i)AIRTABLE`), + // Database-as-a-service (API keys / connection tokens) + regexp.MustCompile(`(?i)SUPABASE`), + regexp.MustCompile(`(?i)NEON`), + regexp.MustCompile(`(?i)PLANETSCALE`), // Generic credential terms regexp.MustCompile(`(?i)API_KEY`), regexp.MustCompile(`(?i)API_TOKEN`), @@ -66,21 +92,42 @@ var nameRegexPatterns = []*regexp.Regexp{ type valuePattern struct { prefix string totalLen int - providerTag string // used to build description, e.g. "OpenAI project" + severity models.Severity // HIGH for provider-specific prefixes; UNCERTAIN for ambiguous ones + providerTag string // used to build description, e.g. "OpenAI project" } -// valuePatterns is compiled (constructed) once at package init. +// valuePatterns lists known API key formats identified by a distinctive prefix and exact total length. var valuePatterns = []valuePattern{ - {prefix: "sk-", totalLen: 51, providerTag: "OpenAI legacy"}, - {prefix: "sk-proj-", totalLen: 56, providerTag: "OpenAI project"}, - {prefix: "sk-admin-", totalLen: 57, providerTag: "OpenAI admin"}, - {prefix: "hf_", totalLen: 37, providerTag: "HuggingFace"}, - {prefix: "ghp_", totalLen: 40, providerTag: "GitHub classic PAT"}, - {prefix: "github_pat_", totalLen: 93, providerTag: "GitHub fine-grained PAT"}, - {prefix: "gho_", totalLen: 40, providerTag: "GitHub OAuth token"}, - {prefix: "ghu_", totalLen: 40, providerTag: "GitHub user token"}, - {prefix: "ghs_", totalLen: 40, providerTag: "GitHub app installation token"}, - {prefix: "ghr_", totalLen: 40, providerTag: "GitHub refresh token"}, + // OpenAI — more-specific prefixes listed first so they match before the generic sk- entry. 
+ {prefix: "sk-proj-", totalLen: 56, severity: models.SeverityHigh, providerTag: "OpenAI project"}, + {prefix: "sk-admin-", totalLen: 57, severity: models.SeverityHigh, providerTag: "OpenAI admin"}, + // sk- is shared by many tools (OpenAI legacy, LangChain proxies, self-hosted LLMs, …). + // Flag as UNCERTAIN so the user can confirm the actual provider via the variable name. + {prefix: "sk-", totalLen: 51, severity: models.SeverityUncertain, providerTag: "possible OpenAI legacy or other sk- key"}, + // Anthropic — prefix is distinctive enough for HIGH confidence. + {prefix: "sk-ant-", totalLen: 108, severity: models.SeverityHigh, providerTag: "Anthropic"}, + // Stripe — underscore separator makes these provider-specific. + {prefix: "sk_live_", totalLen: 55, severity: models.SeverityHigh, providerTag: "Stripe live secret"}, + {prefix: "sk_test_", totalLen: 55, severity: models.SeverityHigh, providerTag: "Stripe test secret"}, + {prefix: "rk_live_", totalLen: 55, severity: models.SeverityHigh, providerTag: "Stripe live restricted"}, + {prefix: "rk_test_", totalLen: 55, severity: models.SeverityHigh, providerTag: "Stripe test restricted"}, + // GitLab — glpat- + 20 random chars = 26 total. + {prefix: "glpat-", totalLen: 26, severity: models.SeverityHigh, providerTag: "GitLab personal access token"}, + // npm granular access token — npm_ + 36 hex chars = 40 total. + {prefix: "npm_", totalLen: 40, severity: models.SeverityHigh, providerTag: "npm access token"}, + // Groq — gsk_ prefix confirmed in Groq docs. + {prefix: "gsk_", totalLen: 56, severity: models.SeverityHigh, providerTag: "Groq"}, + // SendGrid — SG. + 22 + . + 43 = 69 total (with the dots). + {prefix: "SG.", totalLen: 69, severity: models.SeverityHigh, providerTag: "SendGrid"}, + // HuggingFace + {prefix: "hf_", totalLen: 37, severity: models.SeverityHigh, providerTag: "HuggingFace"}, + // GitHub tokens — all provider-specific prefixes. 
+ {prefix: "ghp_", totalLen: 40, severity: models.SeverityHigh, providerTag: "GitHub classic PAT"}, + {prefix: "github_pat_", totalLen: 93, severity: models.SeverityHigh, providerTag: "GitHub fine-grained PAT"}, + {prefix: "gho_", totalLen: 40, severity: models.SeverityHigh, providerTag: "GitHub OAuth token"}, + {prefix: "ghu_", totalLen: 40, severity: models.SeverityHigh, providerTag: "GitHub user token"}, + {prefix: "ghs_", totalLen: 40, severity: models.SeverityHigh, providerTag: "GitHub app installation token"}, + {prefix: "ghr_", totalLen: 40, severity: models.SeverityHigh, providerTag: "GitHub refresh token"}, } // credentialFiles is the list of credential files/dirs to check. @@ -267,7 +314,7 @@ func (s *APIKeyScanner) scanValuePatterns() []models.Finding { findings = append(findings, models.Finding{ Scanner: "api_keys", Resource: name, // env var NAME, never the value - Severity: models.SeverityHigh, + Severity: p.severity, Description: fmt.Sprintf("Value matches %s API key format.", p.providerTag), }) break // one finding per variable name diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index 761cc3b..54b580f 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -363,36 +363,249 @@ func TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin(t *testing.T) { } } -// TestAPIKeyScanner_NameRegex_EmptyValueNotFlagged verifies that an env var whose -// name matches but whose value is empty produces NO finding. -func TestAPIKeyScanner_NameRegex_EmptyValueNotFlagged(t *testing.T) { - t.Setenv("MY_ANTHROPIC_TOKEN", "") +// ── New value-pattern tests ─────────────────────────────────────────────────── + +// TestAPIKeyScanner_ValuePattern_AmbiguousSK verifies that a value matching the +// generic sk- format (51 chars) produces an UNCERTAIN finding, not HIGH, because +// sk- is used by many tools beyond OpenAI legacy. 
+func TestAPIKeyScanner_ValuePattern_AmbiguousSK(t *testing.T) { + value := "sk-" + strings.Repeat("x", 48) // total 51 chars + t.Setenv("SOME_CRED", value) clearHighRiskEnv(t) s := newScannerWithHome(t.TempDir()) result := s.Scan() + assertResource(t, result.Findings, "SOME_CRED") for _, f := range result.Findings { - if f.Resource == "MY_ANTHROPIC_TOKEN" { - t.Error("got unexpected finding for MY_ANTHROPIC_TOKEN with empty value") + if f.Resource == "SOME_CRED" { + if f.Severity != "UNCERTAIN" { + t.Errorf("expected UNCERTAIN severity for ambiguous sk- key, got %q", f.Severity) + } } } + assertNoSecretValue(t, result.Findings, value) } -// TestAPIKeyScanner_NameRegex_ValueNotInFindings verifies that the secret value set on -// a name-matched env var does not appear in any field of any finding. -func TestAPIKeyScanner_NameRegex_ValueNotInFindings(t *testing.T) { - const secret = "supersecretvalue" - t.Setenv("MY_SECRET_KEY", secret) +// TestAPIKeyScanner_ValuePattern_StripeLiveSecret verifies that a Stripe live secret key +// (sk_live_ + 47 chars = 55 total) produces a HIGH finding. +func TestAPIKeyScanner_ValuePattern_StripeLiveSecret(t *testing.T) { + value := "sk_live_" + strings.Repeat("s", 47) // total 55 chars + t.Setenv("PAYMENT_KEY", value) clearHighRiskEnv(t) s := newScannerWithHome(t.TempDir()) result := s.Scan() - assertNoSecretValue(t, result.Findings, secret) + assertResource(t, result.Findings, "PAYMENT_KEY") + for _, f := range result.Findings { + if f.Resource == "PAYMENT_KEY" { + if f.Severity != "HIGH" { + t.Errorf("expected HIGH severity for Stripe live key, got %q", f.Severity) + } + if !strings.Contains(f.Description, "Stripe") { + t.Errorf("expected description to contain %q, got %q", "Stripe", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_StripeTestSecret verifies that a Stripe test secret key +// (sk_test_ + 47 chars = 55 total) produces a HIGH finding. 
+func TestAPIKeyScanner_ValuePattern_StripeTestSecret(t *testing.T) { + value := "sk_test_" + strings.Repeat("t", 47) // total 55 chars + t.Setenv("TEST_PAYMENT_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "TEST_PAYMENT_KEY") + for _, f := range result.Findings { + if f.Resource == "TEST_PAYMENT_KEY" { + if !strings.Contains(f.Description, "Stripe") { + t.Errorf("expected description to contain %q, got %q", "Stripe", f.Description) + } + } + } } -// ── Value-pattern tests ─────────────────────────────────────────────────────── +// TestAPIKeyScanner_ValuePattern_GitLabPAT verifies that a GitLab personal access token +// (glpat- + 20 chars = 26 total) produces a HIGH finding. +func TestAPIKeyScanner_ValuePattern_GitLabPAT(t *testing.T) { + value := "glpat-" + strings.Repeat("g", 20) // total 26 chars + t.Setenv("REPO_TOKEN", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "REPO_TOKEN") + for _, f := range result.Findings { + if f.Resource == "REPO_TOKEN" { + if f.Severity != "HIGH" { + t.Errorf("expected HIGH severity for GitLab PAT, got %q", f.Severity) + } + if !strings.Contains(f.Description, "GitLab") { + t.Errorf("expected description to contain %q, got %q", "GitLab", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_NpmToken verifies that an npm granular access token +// (npm_ + 36 chars = 40 total) produces a HIGH finding. 
+func TestAPIKeyScanner_ValuePattern_NpmToken(t *testing.T) { + value := "npm_" + strings.Repeat("n", 36) // total 40 chars + t.Setenv("REGISTRY_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "REGISTRY_KEY") + for _, f := range result.Findings { + if f.Resource == "REGISTRY_KEY" { + if !strings.Contains(f.Description, "npm") { + t.Errorf("expected description to contain %q, got %q", "npm", f.Description) + } + } + } +} + +// TestAPIKeyScanner_ValuePattern_Groq verifies that a Groq key (gsk_ + 52 chars = 56 total) +// produces a HIGH finding. +func TestAPIKeyScanner_ValuePattern_Groq(t *testing.T) { + value := "gsk_" + strings.Repeat("q", 52) // total 56 chars + t.Setenv("INFERENCE_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "INFERENCE_KEY") + for _, f := range result.Findings { + if f.Resource == "INFERENCE_KEY" { + if f.Severity != "HIGH" { + t.Errorf("expected HIGH severity for Groq key, got %q", f.Severity) + } + if !strings.Contains(f.Description, "Groq") { + t.Errorf("expected description to contain %q, got %q", "Groq", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_SendGrid verifies that a SendGrid key +// (SG. + 22 chars + . + 43 chars = 69 total) produces a HIGH finding. +func TestAPIKeyScanner_ValuePattern_SendGrid(t *testing.T) { + // SG. (3) + 22 chars + . (1) + 43 chars = 69 total + value := "SG." + strings.Repeat("a", 22) + "." 
+ strings.Repeat("b", 43) + t.Setenv("MAIL_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "MAIL_KEY") + for _, f := range result.Findings { + if f.Resource == "MAIL_KEY" { + if !strings.Contains(f.Description, "SendGrid") { + t.Errorf("expected description to contain %q, got %q", "SendGrid", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_Anthropic verifies that an Anthropic key +// (sk-ant- prefix, 108 total chars) produces a HIGH finding. +func TestAPIKeyScanner_ValuePattern_Anthropic(t *testing.T) { + value := "sk-ant-" + strings.Repeat("a", 101) // total 108 chars + t.Setenv("LLM_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "LLM_KEY") + for _, f := range result.Findings { + if f.Resource == "LLM_KEY" { + if f.Severity != "HIGH" { + t.Errorf("expected HIGH severity for Anthropic key, got %q", f.Severity) + } + if !strings.Contains(f.Description, "Anthropic") { + t.Errorf("expected description to contain %q, got %q", "Anthropic", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// ── New nameRegex tests ─────────────────────────────────────────────────────── + +// TestAPIKeyScanner_NameRegex_FLY_Anchored verifies that FLY_ matches FLY_API_TOKEN +// but does NOT match BUTTERFLY_KEY (which contains the substring FLY_ but should not +// be treated as a Fly.io credential due to the word-boundary anchor in the pattern). +func TestAPIKeyScanner_NameRegex_FLY_Anchored(t *testing.T) { + clearHighRiskEnv(t) + t.Setenv("FLY_API_TOKEN", "real-token") + t.Setenv("BUTTERFLY_KEY", "not-a-fly-token") + t.Setenv("FLYWEIGHT_INDEX", "not-a-token") + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + // FLY_API_TOKEN must be flagged. 
+ assertResource(t, result.Findings, "FLY_API_TOKEN") + + // BUTTERFLY_KEY and FLYWEIGHT_INDEX must NOT be flagged. + for _, f := range result.Findings { + if f.Resource == "BUTTERFLY_KEY" { + t.Error("BUTTERFLY_KEY should not be flagged by FLY_ pattern") + } + if f.Resource == "FLYWEIGHT_INDEX" { + t.Error("FLYWEIGHT_INDEX should not be flagged by FLY_ pattern") + } + } +} + +// TestAPIKeyScanner_NameRegex_NewProviders verifies that new provider keywords +// added in this session are recognised. +func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) { + clearHighRiskEnv(t) + cases := []struct { + envVar string + value string + }{ + {"MY_GEMINI_KEY", "gemini-key-value"}, + {"VERTEX_API_KEY", "vertex-key-value"}, + {"BEDROCK_ACCESS_KEY", "bedrock-key-value"}, + {"AZURE_OPENAI_KEY", "azure-openai-key"}, + {"RESEND_API_KEY", "resend-key-value"}, + {"POSTMARK_TOKEN", "postmark-key-value"}, + {"MY_LINEAR_TOKEN", "linear-key-value"}, + {"NOTION_API_KEY", "notion-key-value"}, + {"AIRTABLE_KEY", "airtable-key-value"}, + {"SUPABASE_KEY", "supabase-key-value"}, + {"NEON_API_KEY", "neon-key-value"}, + {"PLANETSCALE_TOKEN", "ps-key-value"}, + } + + for _, tc := range cases { + t.Setenv(tc.envVar, tc.value) + } + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + for _, tc := range cases { + assertResource(t, result.Findings, tc.envVar) + } +} // TestAPIKeyScanner_ValuePattern_OpenAIProject verifies that a value matching the // OpenAI project key format (sk-proj- + 48 chars = 56 total) produces a finding From 124de66536699b470016a63fd417cb4b0f14d3c2 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 10:23:49 +0100 Subject: [PATCH 04/17] =?UTF-8?q?fix(07-01):=20address=20PR=20review=20?= =?UTF-8?q?=E2=80=94=20cross-pass=20dedup,=20tighter=20regexes,=20new=20pr?= =?UTF-8?q?oviders?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-pass duplicate findings (critical): - Introduce shared seenEnvNames map 
passed into scanNameRegex and scanValuePatterns; a variable matching both passes now produces exactly one finding (name-regex wins, value-pattern is skipped) - Add regression test: CUSTOM_STRIPE_KEY=sk_live_... yields 1 finding Tighten overbroad name regexes (false-positive fix): - NEON → \bNEON_ (avoids ANEMONE_CONFIG, NEONLIGHTS_COLOR) - LINEAR → \bLINEAR_ (avoids BILINEAR_FILTER) - PALM → \bPALM_ (avoids NAPALM_MODE, PALM_BEACH_PROPERTY) - Add false-positive regression tests for all three Add missing AI provider name patterns: - OPENROUTER, FIREWORKS, DEEPSEEK, PERPLEXITY, CEREBRAS, DOPPLER - Covered by TestAPIKeyScanner_NameRegex_NewAIProviders Add Twilio SK value pattern: - SK + 32 hex chars = 34 total → SeverityHigh - Covered by TestAPIKeyScanner_ValuePattern_TwilioSID Fix top-level APIKeyScanner struct comment: - Clarifies that values are read transiently in scanValuePatterns for pattern matching only, never emitted or stored --- internal/scan/apikeys.go | 50 ++++-- internal/scan/apikeys_test.go | 317 ++++++++-------------------------- 2 files changed, 103 insertions(+), 264 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index d85538a..f67647a 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -30,10 +30,17 @@ var nameRegexPatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)VOYAGE`), regexp.MustCompile(`(?i)ELEVEN_LABS`), regexp.MustCompile(`(?i)PINECONE`), + regexp.MustCompile(`(?i)OPENROUTER`), + regexp.MustCompile(`(?i)FIREWORKS`), + regexp.MustCompile(`(?i)DEEPSEEK`), + regexp.MustCompile(`(?i)PERPLEXITY`), + regexp.MustCompile(`(?i)CEREBRAS`), + // Secrets managers + regexp.MustCompile(`(?i)DOPPLER`), // Google AI (Gemini, Vertex AI, PaLM) regexp.MustCompile(`(?i)GEMINI`), regexp.MustCompile(`(?i)VERTEX`), - regexp.MustCompile(`(?i)PALM`), + regexp.MustCompile(`(?i)\bPALM_`), // word boundary + underscore avoids NAPALM_MODE, PALM_BEACH_PROPERTY // AWS AI regexp.MustCompile(`(?i)BEDROCK`), // Azure AI 
@@ -71,12 +78,12 @@ var nameRegexPatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)GITLAB`), regexp.MustCompile(`(?i)BITBUCKET`), // Productivity / project tools (common in agent contexts) - regexp.MustCompile(`(?i)LINEAR`), + regexp.MustCompile(`(?i)\bLINEAR_`), // word boundary + underscore avoids BILINEAR_FILTER regexp.MustCompile(`(?i)NOTION`), regexp.MustCompile(`(?i)AIRTABLE`), // Database-as-a-service (API keys / connection tokens) regexp.MustCompile(`(?i)SUPABASE`), - regexp.MustCompile(`(?i)NEON`), + regexp.MustCompile(`(?i)\bNEON_`), // word boundary + underscore avoids ANEMONE_CONFIG, NEON_LIGHTS_COLOR regexp.MustCompile(`(?i)PLANETSCALE`), // Generic credential terms regexp.MustCompile(`(?i)API_KEY`), @@ -117,6 +124,8 @@ var valuePatterns = []valuePattern{ {prefix: "npm_", totalLen: 40, severity: models.SeverityHigh, providerTag: "npm access token"}, // Groq — gsk_ prefix confirmed in Groq docs. {prefix: "gsk_", totalLen: 56, severity: models.SeverityHigh, providerTag: "Groq"}, + // Twilio API key SID — SK + 32 hex chars = 34 total. + {prefix: "SK", totalLen: 34, severity: models.SeverityHigh, providerTag: "Twilio API key SID"}, // SendGrid — SG. + 22 + . + 43 = 69 total (with the dots). {prefix: "SG.", totalLen: 69, severity: models.SeverityHigh, providerTag: "SendGrid"}, // HuggingFace @@ -144,7 +153,10 @@ var credentialFiles = []config.CredentialFile{ } // APIKeyScanner scans for high-risk API keys in environment variables and credential config files. -// It reports key names and file paths only; never values or file contents. +// Key names and file paths only are reported in findings; values and file contents are never emitted. +// Exception: scanValuePatterns transiently reads env var values solely for prefix+length pattern +// matching; values are discarded immediately and never stored in findings, logs, or any +// data structure. See scanValuePatterns for the full security contract. // It never returns skipped=true. 
type APIKeyScanner struct { Base @@ -180,9 +192,13 @@ func (s *APIKeyScanner) Name() string { return "api_keys" } // Implements Scanner. Never returns skipped=true. func (s *APIKeyScanner) Scan() models.ScanResult { var findings []models.Finding + // seenEnvNames is shared across the name-regex and value-pattern passes so that a + // variable matching both (e.g. CUSTOM_STRIPE_KEY=sk_live_...) produces exactly one + // finding — the name-regex pass runs first and claims it. + seenEnvNames := make(map[string]bool) findings = append(findings, s.scanEnvKeys()...) - findings = append(findings, s.scanNameRegex()...) - findings = append(findings, s.scanValuePatterns()...) + findings = append(findings, s.scanNameRegex(seenEnvNames)...) + findings = append(findings, s.scanValuePatterns(seenEnvNames)...) findings = append(findings, s.scanCredentialFiles()...) return models.ScanResult{ ScannerName: "api_keys", @@ -235,9 +251,9 @@ func (s *APIKeyScanner) scanEnvKeys() []models.Finding { // credential terms. It catches non-standard names like MY_OPENAI_KEY that are // missed by the exact-match HighRiskEnvKeys pass. Key names only are reported; // values are checked only for emptiness and then discarded. -func (s *APIKeyScanner) scanNameRegex() []models.Finding { +// seenEnvNames is the shared cross-pass dedup set; matched names are added to it. +func (s *APIKeyScanner) scanNameRegex(seenEnvNames map[string]bool) []models.Finding { var findings []models.Finding - seen := make(map[string]bool) for _, entry := range os.Environ() { idx := strings.IndexByte(entry, '=') @@ -255,15 +271,14 @@ func (s *APIKeyScanner) scanNameRegex() []models.Finding { if value == "" { continue } - // Guard against duplicate findings (a name appears at most once in os.Environ, - // but be defensive in case of unexpected duplicates). - if seen[name] { + // Skip if already claimed by a prior pass or earlier in this pass. 
+ if seenEnvNames[name] { continue } for _, re := range nameRegexPatterns { if re.MatchString(name) { - seen[name] = true + seenEnvNames[name] = true findings = append(findings, models.Finding{ Scanner: "api_keys", Resource: name, // key name only, never the value @@ -283,9 +298,10 @@ func (s *APIKeyScanner) scanNameRegex() []models.Finding { // Values are used only for prefix+length pattern matching and then discarded immediately. // No value is stored in findings, logs, or returned data structures. // This is a deliberate, scoped relaxation of the "values never read" contract. -func (s *APIKeyScanner) scanValuePatterns() []models.Finding { +// seenEnvNames is the shared cross-pass dedup set; names already claimed by scanNameRegex +// are skipped, and newly matched names are added. +func (s *APIKeyScanner) scanValuePatterns(seenEnvNames map[string]bool) []models.Finding { var findings []models.Finding - seen := make(map[string]bool) for _, entry := range os.Environ() { idx := strings.IndexByte(entry, '=') @@ -303,14 +319,14 @@ func (s *APIKeyScanner) scanValuePatterns() []models.Finding { if value == "" { continue } - // Dedup by name: emit at most one finding per variable name. - if seen[name] { + // Skip if already claimed by scanNameRegex or an earlier iteration of this pass. 
+ if seenEnvNames[name] { continue } for _, p := range valuePatterns { if strings.HasPrefix(value, p.prefix) && len(value) == p.totalLen { - seen[name] = true + seenEnvNames[name] = true findings = append(findings, models.Finding{ Scanner: "api_keys", Resource: name, // env var NAME, never the value diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index 54b580f..6668600 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -363,238 +363,142 @@ func TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin(t *testing.T) { } } -// ── New value-pattern tests ─────────────────────────────────────────────────── - -// TestAPIKeyScanner_ValuePattern_AmbiguousSK verifies that a value matching the -// generic sk- format (51 chars) produces an UNCERTAIN finding, not HIGH, because -// sk- is used by many tools beyond OpenAI legacy. -func TestAPIKeyScanner_ValuePattern_AmbiguousSK(t *testing.T) { - value := "sk-" + strings.Repeat("x", 48) // total 51 chars - t.Setenv("SOME_CRED", value) +// TestAPIKeyScanner_ValuePattern_TwilioSID verifies that a Twilio API key SID +// (SK + 32 hex chars = 34 total) produces a HIGH finding. +// The variable name is intentionally neutral (no provider keyword) so the finding +// comes from the value-pattern pass, confirming the pattern itself works. 
+func TestAPIKeyScanner_ValuePattern_TwilioSID(t *testing.T) { + value := "SK" + strings.Repeat("f", 32) // total 34 chars + t.Setenv("CRED_SID", value) clearHighRiskEnv(t) s := newScannerWithHome(t.TempDir()) result := s.Scan() - assertResource(t, result.Findings, "SOME_CRED") + assertResource(t, result.Findings, "CRED_SID") for _, f := range result.Findings { - if f.Resource == "SOME_CRED" { - if f.Severity != "UNCERTAIN" { - t.Errorf("expected UNCERTAIN severity for ambiguous sk- key, got %q", f.Severity) - } - } - } - assertNoSecretValue(t, result.Findings, value) -} - -// TestAPIKeyScanner_ValuePattern_StripeLiveSecret verifies that a Stripe live secret key -// (sk_live_ + 47 chars = 55 total) produces a HIGH finding. -func TestAPIKeyScanner_ValuePattern_StripeLiveSecret(t *testing.T) { - value := "sk_live_" + strings.Repeat("s", 47) // total 55 chars - t.Setenv("PAYMENT_KEY", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "PAYMENT_KEY") - for _, f := range result.Findings { - if f.Resource == "PAYMENT_KEY" { + if f.Resource == "CRED_SID" { if f.Severity != "HIGH" { - t.Errorf("expected HIGH severity for Stripe live key, got %q", f.Severity) + t.Errorf("expected HIGH severity for Twilio SID, got %q", f.Severity) } - if !strings.Contains(f.Description, "Stripe") { - t.Errorf("expected description to contain %q, got %q", "Stripe", f.Description) + if !strings.Contains(f.Description, "Twilio") { + t.Errorf("expected description to contain %q, got %q", "Twilio", f.Description) } } } assertNoSecretValue(t, result.Findings, value) } -// TestAPIKeyScanner_ValuePattern_StripeTestSecret verifies that a Stripe test secret key -// (sk_test_ + 47 chars = 55 total) produces a HIGH finding. 
-func TestAPIKeyScanner_ValuePattern_StripeTestSecret(t *testing.T) { - value := "sk_test_" + strings.Repeat("t", 47) // total 55 chars - t.Setenv("TEST_PAYMENT_KEY", value) +// TestAPIKeyScanner_CrossPassDedup_NameRegexWins verifies that a variable whose name +// matches a nameRegex pattern AND whose value matches a value pattern produces exactly +// ONE finding — from the name-regex pass — not two. +func TestAPIKeyScanner_CrossPassDedup_NameRegexWins(t *testing.T) { + // CUSTOM_STRIPE_KEY matches the STRIPE name-regex. + // sk_live_ + 47 chars matches the Stripe live secret value pattern. + // Without cross-pass dedup both passes would emit a finding. + value := "sk_live_" + strings.Repeat("s", 47) // total 55 chars + t.Setenv("CUSTOM_STRIPE_KEY", value) clearHighRiskEnv(t) s := newScannerWithHome(t.TempDir()) result := s.Scan() - assertResource(t, result.Findings, "TEST_PAYMENT_KEY") + count := 0 for _, f := range result.Findings { - if f.Resource == "TEST_PAYMENT_KEY" { - if !strings.Contains(f.Description, "Stripe") { - t.Errorf("expected description to contain %q, got %q", "Stripe", f.Description) - } + if f.Resource == "CUSTOM_STRIPE_KEY" { + count++ } } -} - -// TestAPIKeyScanner_ValuePattern_GitLabPAT verifies that a GitLab personal access token -// (glpat- + 20 chars = 26 total) produces a HIGH finding. 
-func TestAPIKeyScanner_ValuePattern_GitLabPAT(t *testing.T) { - value := "glpat-" + strings.Repeat("g", 20) // total 26 chars - t.Setenv("REPO_TOKEN", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "REPO_TOKEN") - for _, f := range result.Findings { - if f.Resource == "REPO_TOKEN" { - if f.Severity != "HIGH" { - t.Errorf("expected HIGH severity for GitLab PAT, got %q", f.Severity) - } - if !strings.Contains(f.Description, "GitLab") { - t.Errorf("expected description to contain %q, got %q", "GitLab", f.Description) - } - } + if count != 1 { + t.Errorf("expected exactly 1 finding for CUSTOM_STRIPE_KEY (cross-pass dedup), got %d", count) } - assertNoSecretValue(t, result.Findings, value) -} - -// TestAPIKeyScanner_ValuePattern_NpmToken verifies that an npm granular access token -// (npm_ + 36 chars = 40 total) produces a HIGH finding. -func TestAPIKeyScanner_ValuePattern_NpmToken(t *testing.T) { - value := "npm_" + strings.Repeat("n", 36) // total 40 chars - t.Setenv("REGISTRY_KEY", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "REGISTRY_KEY") + // The finding must be the name-regex one (no "Value matches" in description). for _, f := range result.Findings { - if f.Resource == "REGISTRY_KEY" { - if !strings.Contains(f.Description, "npm") { - t.Errorf("expected description to contain %q, got %q", "npm", f.Description) + if f.Resource == "CUSTOM_STRIPE_KEY" { + if strings.Contains(f.Description, "Value matches") { + t.Errorf("expected name-regex finding (not value-pattern), got description: %q", f.Description) } } } } -// TestAPIKeyScanner_ValuePattern_Groq verifies that a Groq key (gsk_ + 52 chars = 56 total) -// produces a HIGH finding. 
-func TestAPIKeyScanner_ValuePattern_Groq(t *testing.T) { - value := "gsk_" + strings.Repeat("q", 52) // total 56 chars - t.Setenv("INFERENCE_KEY", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() +// ── Tightened-regex false-positive tests ───────────────────────────────────── - assertResource(t, result.Findings, "INFERENCE_KEY") - for _, f := range result.Findings { - if f.Resource == "INFERENCE_KEY" { - if f.Severity != "HIGH" { - t.Errorf("expected HIGH severity for Groq key, got %q", f.Severity) - } - if !strings.Contains(f.Description, "Groq") { - t.Errorf("expected description to contain %q, got %q", "Groq", f.Description) - } - } - } - assertNoSecretValue(t, result.Findings, value) -} - -// TestAPIKeyScanner_ValuePattern_SendGrid verifies that a SendGrid key -// (SG. + 22 chars + . + 43 chars = 69 total) produces a HIGH finding. -func TestAPIKeyScanner_ValuePattern_SendGrid(t *testing.T) { - // SG. (3) + 22 chars + . (1) + 43 chars = 69 total - value := "SG." + strings.Repeat("a", 22) + "." + strings.Repeat("b", 43) - t.Setenv("MAIL_KEY", value) +// TestAPIKeyScanner_NameRegex_NEON_NarrowedPattern verifies that the tightened \bNEON_ +// pattern does not fire on variable names that contain "neon" as part of a longer word. +func TestAPIKeyScanner_NameRegex_NEON_NarrowedPattern(t *testing.T) { clearHighRiskEnv(t) + // These should NOT be flagged. + t.Setenv("ANEMONE_CONFIG", "some-value") + t.Setenv("NEONLIGHTS_COLOR", "blue") + // This SHOULD be flagged. 
+ t.Setenv("NEON_API_KEY", "real-neon-key") s := newScannerWithHome(t.TempDir()) result := s.Scan() - assertResource(t, result.Findings, "MAIL_KEY") + assertResource(t, result.Findings, "NEON_API_KEY") for _, f := range result.Findings { - if f.Resource == "MAIL_KEY" { - if !strings.Contains(f.Description, "SendGrid") { - t.Errorf("expected description to contain %q, got %q", "SendGrid", f.Description) - } + if f.Resource == "ANEMONE_CONFIG" { + t.Error("ANEMONE_CONFIG should not be flagged by NEON_ pattern") + } + if f.Resource == "NEONLIGHTS_COLOR" { + t.Error("NEONLIGHTS_COLOR should not be flagged by NEON_ pattern") } } - assertNoSecretValue(t, result.Findings, value) } -// TestAPIKeyScanner_ValuePattern_Anthropic verifies that an Anthropic key -// (sk-ant- prefix, 108 total chars) produces a HIGH finding. -func TestAPIKeyScanner_ValuePattern_Anthropic(t *testing.T) { - value := "sk-ant-" + strings.Repeat("a", 101) // total 108 chars - t.Setenv("LLM_KEY", value) +// TestAPIKeyScanner_NameRegex_LINEAR_NarrowedPattern verifies that the tightened \bLINEAR_ +// pattern does not fire on names containing "linear" as a substring. 
+func TestAPIKeyScanner_NameRegex_LINEAR_NarrowedPattern(t *testing.T) { clearHighRiskEnv(t) + t.Setenv("BILINEAR_FILTER", "some-value") + t.Setenv("LINEAR_API_KEY", "real-linear-key") s := newScannerWithHome(t.TempDir()) result := s.Scan() - assertResource(t, result.Findings, "LLM_KEY") + assertResource(t, result.Findings, "LINEAR_API_KEY") for _, f := range result.Findings { - if f.Resource == "LLM_KEY" { - if f.Severity != "HIGH" { - t.Errorf("expected HIGH severity for Anthropic key, got %q", f.Severity) - } - if !strings.Contains(f.Description, "Anthropic") { - t.Errorf("expected description to contain %q, got %q", "Anthropic", f.Description) - } + if f.Resource == "BILINEAR_FILTER" { + t.Error("BILINEAR_FILTER should not be flagged by LINEAR_ pattern") } } - assertNoSecretValue(t, result.Findings, value) } -// ── New nameRegex tests ─────────────────────────────────────────────────────── - -// TestAPIKeyScanner_NameRegex_FLY_Anchored verifies that FLY_ matches FLY_API_TOKEN -// but does NOT match BUTTERFLY_KEY (which contains the substring FLY_ but should not -// be treated as a Fly.io credential due to the word-boundary anchor in the pattern). -func TestAPIKeyScanner_NameRegex_FLY_Anchored(t *testing.T) { +// TestAPIKeyScanner_NameRegex_PALM_NarrowedPattern verifies that the tightened \bPALM_ +// pattern does not fire on names like NAPALM_MODE. +func TestAPIKeyScanner_NameRegex_PALM_NarrowedPattern(t *testing.T) { clearHighRiskEnv(t) - t.Setenv("FLY_API_TOKEN", "real-token") - t.Setenv("BUTTERFLY_KEY", "not-a-fly-token") - t.Setenv("FLYWEIGHT_INDEX", "not-a-token") + t.Setenv("NAPALM_MODE", "some-value") + t.Setenv("PALM_API_KEY", "real-palm-key") s := newScannerWithHome(t.TempDir()) result := s.Scan() - // FLY_API_TOKEN must be flagged. - assertResource(t, result.Findings, "FLY_API_TOKEN") - - // BUTTERFLY_KEY and FLYWEIGHT_INDEX must NOT be flagged. 
+ assertResource(t, result.Findings, "PALM_API_KEY") for _, f := range result.Findings { - if f.Resource == "BUTTERFLY_KEY" { - t.Error("BUTTERFLY_KEY should not be flagged by FLY_ pattern") - } - if f.Resource == "FLYWEIGHT_INDEX" { - t.Error("FLYWEIGHT_INDEX should not be flagged by FLY_ pattern") + if f.Resource == "NAPALM_MODE" { + t.Error("NAPALM_MODE should not be flagged by PALM_ pattern") } } } -// TestAPIKeyScanner_NameRegex_NewProviders verifies that new provider keywords -// added in this session are recognised. -func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) { +// TestAPIKeyScanner_NameRegex_NewAIProviders verifies that newly added AI provider +// name patterns are recognised. +func TestAPIKeyScanner_NameRegex_NewAIProviders(t *testing.T) { clearHighRiskEnv(t) cases := []struct { envVar string value string }{ - {"MY_GEMINI_KEY", "gemini-key-value"}, - {"VERTEX_API_KEY", "vertex-key-value"}, - {"BEDROCK_ACCESS_KEY", "bedrock-key-value"}, - {"AZURE_OPENAI_KEY", "azure-openai-key"}, - {"RESEND_API_KEY", "resend-key-value"}, - {"POSTMARK_TOKEN", "postmark-key-value"}, - {"MY_LINEAR_TOKEN", "linear-key-value"}, - {"NOTION_API_KEY", "notion-key-value"}, - {"AIRTABLE_KEY", "airtable-key-value"}, - {"SUPABASE_KEY", "supabase-key-value"}, - {"NEON_API_KEY", "neon-key-value"}, - {"PLANETSCALE_TOKEN", "ps-key-value"}, + {"OPENROUTER_API_KEY", "or-key-value"}, + {"FIREWORKS_API_KEY", "fw-key-value"}, + {"DEEPSEEK_API_KEY", "ds-key-value"}, + {"PERPLEXITY_API_KEY", "pplx-key-value"}, + {"CEREBRAS_API_KEY", "cb-key-value"}, + {"DOPPLER_TOKEN", "dp-token-value"}, } - for _, tc := range cases { t.Setenv(tc.envVar, tc.value) } @@ -607,87 +511,6 @@ func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) { } } -// TestAPIKeyScanner_ValuePattern_OpenAIProject verifies that a value matching the -// OpenAI project key format (sk-proj- + 48 chars = 56 total) produces a finding -// with the correct resource name and provider tag in the description. 
-func TestAPIKeyScanner_ValuePattern_OpenAIProject(t *testing.T) { - value := "sk-proj-" + strings.Repeat("a", 48) // total 56 chars - t.Setenv("SOME_AI_CRED", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "SOME_AI_CRED") - for _, f := range result.Findings { - if f.Resource == "SOME_AI_CRED" { - if !strings.Contains(f.Description, "OpenAI project") { - t.Errorf("expected description to contain %q, got %q", "OpenAI project", f.Description) - } - } - } - assertNoSecretValue(t, result.Findings, value) -} - -// TestAPIKeyScanner_ValuePattern_HuggingFace verifies that a value matching the -// HuggingFace token format (hf_ + 34 chars = 37 total) produces a correct finding. -func TestAPIKeyScanner_ValuePattern_HuggingFace(t *testing.T) { - value := "hf_" + strings.Repeat("b", 34) // total 37 chars - // Use a variable name that does NOT match any nameRegex pattern so the finding - // comes from scanValuePatterns (and the HuggingFace provider tag is in the description). - t.Setenv("ML_MODEL_CRED", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "ML_MODEL_CRED") - for _, f := range result.Findings { - if f.Resource == "ML_MODEL_CRED" { - if !strings.Contains(f.Description, "HuggingFace") { - t.Errorf("expected description to contain %q, got %q", "HuggingFace", f.Description) - } - } - } -} - -// TestAPIKeyScanner_ValuePattern_GitHub_ClassicPAT verifies that a value matching the -// GitHub classic PAT format (ghp_ + 36 chars = 40 total) produces a correct finding. 
-func TestAPIKeyScanner_ValuePattern_GitHub_ClassicPAT(t *testing.T) { - value := "ghp_" + strings.Repeat("c", 36) // total 40 chars - t.Setenv("WORK_GH_TOKEN", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "WORK_GH_TOKEN") - for _, f := range result.Findings { - if f.Resource == "WORK_GH_TOKEN" { - if !strings.Contains(f.Description, "GitHub") { - t.Errorf("expected description to contain %q, got %q", "GitHub", f.Description) - } - } - } -} - -// TestAPIKeyScanner_ValuePattern_NoMatchWrongLength verifies that a value with the -// right prefix but wrong length does NOT produce a finding. -func TestAPIKeyScanner_ValuePattern_NoMatchWrongLength(t *testing.T) { - value := "sk-proj-" + strings.Repeat("x", 10) // total 18 chars, wrong length for any pattern - t.Setenv("SOME_KEY", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - for _, f := range result.Findings { - if f.Resource == "SOME_KEY" { - t.Errorf("got unexpected finding for SOME_KEY with wrong-length value") - } - } -} - // TestAPIKeyScanner_ValuePattern_BuiltinSkipped verifies that a key in HighRiskEnvKeys // whose value also matches a value pattern produces exactly ONE finding (from scanEnvKeys, // not from scanValuePatterns which skips it). 
From ce41c8d1441de8500d76b7b2d37d147abe1056ab Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 10:32:06 +0100 Subject: [PATCH 05/17] fix: restore deleted tests, downgrade Twilio SK to UNCERTAIN, fix LINEAR_ regex - Restore 14 value-pattern and name-regex tests accidentally deleted in 124de66 (recovered from db20f03 and merged with tests added in HEAD) - Downgrade Twilio SK prefix from SeverityHigh to SeverityUncertain: the bare 'SK' prefix is too broad (no hex charset validation), so false positives are likely; test updated to assert UNCERTAIN - Fix LINEAR_ name-regex: replace \bLINEAR_ with (^|_)LINEAR_ so that MY_LINEAR_TOKEN matches (underscore is a word char in RE2, so \b fails there) while BILINEAR_FILTER still does not match --- internal/scan/apikeys.go | 6 +- internal/scan/apikeys_test.go | 333 +++++++++++++++++++++++++++++++++- 2 files changed, 332 insertions(+), 7 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index f67647a..8c40253 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -78,7 +78,7 @@ var nameRegexPatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)GITLAB`), regexp.MustCompile(`(?i)BITBUCKET`), // Productivity / project tools (common in agent contexts) - regexp.MustCompile(`(?i)\bLINEAR_`), // word boundary + underscore avoids BILINEAR_FILTER + regexp.MustCompile(`(?i)(^|_)LINEAR_`), // (^|_) avoids BILINEAR_FILTER while still matching MY_LINEAR_TOKEN regexp.MustCompile(`(?i)NOTION`), regexp.MustCompile(`(?i)AIRTABLE`), // Database-as-a-service (API keys / connection tokens) @@ -125,7 +125,9 @@ var valuePatterns = []valuePattern{ // Groq — gsk_ prefix confirmed in Groq docs. {prefix: "gsk_", totalLen: 56, severity: models.SeverityHigh, providerTag: "Groq"}, // Twilio API key SID — SK + 32 hex chars = 34 total. 
- {prefix: "SK", totalLen: 34, severity: models.SeverityHigh, providerTag: "Twilio API key SID"}, + // SeverityUncertain: the SK prefix is too broad (any 34-char string starting with SK + // would match); we don't validate the hex charset, so false positives are likely. + {prefix: "SK", totalLen: 34, severity: models.SeverityUncertain, providerTag: "Twilio API key SID"}, // SendGrid — SG. + 22 + . + 43 = 69 total (with the dots). {prefix: "SG.", totalLen: 69, severity: models.SeverityHigh, providerTag: "SendGrid"}, // HuggingFace diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index 6668600..d9badb5 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -363,10 +363,333 @@ func TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin(t *testing.T) { } } +// ── Value-pattern tests ─────────────────────────────────────────────────────── + +// TestAPIKeyScanner_ValuePattern_AmbiguousSK verifies that a value matching the +// generic sk- format (51 chars) produces an UNCERTAIN finding, not HIGH, because +// sk- is used by many tools beyond OpenAI legacy. +func TestAPIKeyScanner_ValuePattern_AmbiguousSK(t *testing.T) { + value := "sk-" + strings.Repeat("x", 48) // total 51 chars + t.Setenv("SOME_CRED", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "SOME_CRED") + for _, f := range result.Findings { + if f.Resource == "SOME_CRED" { + if f.Severity != "UNCERTAIN" { + t.Errorf("expected UNCERTAIN severity for ambiguous sk- key, got %q", f.Severity) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_StripeLiveSecret verifies that a Stripe live secret key +// (sk_live_ + 47 chars = 55 total) produces a HIGH finding. 
+func TestAPIKeyScanner_ValuePattern_StripeLiveSecret(t *testing.T) { + value := "sk_live_" + strings.Repeat("s", 47) // total 55 chars + t.Setenv("PAYMENT_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "PAYMENT_KEY") + for _, f := range result.Findings { + if f.Resource == "PAYMENT_KEY" { + if f.Severity != "HIGH" { + t.Errorf("expected HIGH severity for Stripe live key, got %q", f.Severity) + } + if !strings.Contains(f.Description, "Stripe") { + t.Errorf("expected description to contain %q, got %q", "Stripe", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_StripeTestSecret verifies that a Stripe test secret key +// (sk_test_ + 47 chars = 55 total) produces a HIGH finding. +func TestAPIKeyScanner_ValuePattern_StripeTestSecret(t *testing.T) { + value := "sk_test_" + strings.Repeat("t", 47) // total 55 chars + t.Setenv("TEST_PAYMENT_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "TEST_PAYMENT_KEY") + for _, f := range result.Findings { + if f.Resource == "TEST_PAYMENT_KEY" { + if !strings.Contains(f.Description, "Stripe") { + t.Errorf("expected description to contain %q, got %q", "Stripe", f.Description) + } + } + } +} + +// TestAPIKeyScanner_ValuePattern_GitLabPAT verifies that a GitLab personal access token +// (glpat- + 20 chars = 26 total) produces a HIGH finding. 
+func TestAPIKeyScanner_ValuePattern_GitLabPAT(t *testing.T) { + value := "glpat-" + strings.Repeat("g", 20) // total 26 chars + t.Setenv("REPO_TOKEN", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "REPO_TOKEN") + for _, f := range result.Findings { + if f.Resource == "REPO_TOKEN" { + if f.Severity != "HIGH" { + t.Errorf("expected HIGH severity for GitLab PAT, got %q", f.Severity) + } + if !strings.Contains(f.Description, "GitLab") { + t.Errorf("expected description to contain %q, got %q", "GitLab", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_NpmToken verifies that an npm granular access token +// (npm_ + 36 chars = 40 total) produces a HIGH finding. +func TestAPIKeyScanner_ValuePattern_NpmToken(t *testing.T) { + value := "npm_" + strings.Repeat("n", 36) // total 40 chars + t.Setenv("REGISTRY_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "REGISTRY_KEY") + for _, f := range result.Findings { + if f.Resource == "REGISTRY_KEY" { + if !strings.Contains(f.Description, "npm") { + t.Errorf("expected description to contain %q, got %q", "npm", f.Description) + } + } + } +} + +// TestAPIKeyScanner_ValuePattern_Groq verifies that a Groq key (gsk_ + 52 chars = 56 total) +// produces a HIGH finding. 
+func TestAPIKeyScanner_ValuePattern_Groq(t *testing.T) { + value := "gsk_" + strings.Repeat("q", 52) // total 56 chars + t.Setenv("INFERENCE_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "INFERENCE_KEY") + for _, f := range result.Findings { + if f.Resource == "INFERENCE_KEY" { + if f.Severity != "HIGH" { + t.Errorf("expected HIGH severity for Groq key, got %q", f.Severity) + } + if !strings.Contains(f.Description, "Groq") { + t.Errorf("expected description to contain %q, got %q", "Groq", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_SendGrid verifies that a SendGrid key +// (SG. + 22 chars + . + 43 chars = 69 total) produces a HIGH finding. +func TestAPIKeyScanner_ValuePattern_SendGrid(t *testing.T) { + // SG. (3) + 22 chars + . (1) + 43 chars = 69 total + value := "SG." + strings.Repeat("a", 22) + "." + strings.Repeat("b", 43) + t.Setenv("MAIL_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "MAIL_KEY") + for _, f := range result.Findings { + if f.Resource == "MAIL_KEY" { + if !strings.Contains(f.Description, "SendGrid") { + t.Errorf("expected description to contain %q, got %q", "SendGrid", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_Anthropic verifies that an Anthropic key +// (sk-ant- prefix, 108 total chars) produces a HIGH finding. 
+func TestAPIKeyScanner_ValuePattern_Anthropic(t *testing.T) { + value := "sk-ant-" + strings.Repeat("a", 101) // total 108 chars + t.Setenv("LLM_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "LLM_KEY") + for _, f := range result.Findings { + if f.Resource == "LLM_KEY" { + if f.Severity != "HIGH" { + t.Errorf("expected HIGH severity for Anthropic key, got %q", f.Severity) + } + if !strings.Contains(f.Description, "Anthropic") { + t.Errorf("expected description to contain %q, got %q", "Anthropic", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_NameRegex_FLY_Anchored verifies that FLY_ matches FLY_API_TOKEN +// but does NOT match BUTTERFLY_KEY (which contains the substring FLY_ but should not +// be treated as a Fly.io credential due to the word-boundary anchor in the pattern). +func TestAPIKeyScanner_NameRegex_FLY_Anchored(t *testing.T) { + clearHighRiskEnv(t) + t.Setenv("FLY_API_TOKEN", "real-token") + t.Setenv("BUTTERFLY_KEY", "not-a-fly-token") + t.Setenv("FLYWEIGHT_INDEX", "not-a-token") + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + // FLY_API_TOKEN must be flagged. + assertResource(t, result.Findings, "FLY_API_TOKEN") + + // BUTTERFLY_KEY and FLYWEIGHT_INDEX must NOT be flagged. + for _, f := range result.Findings { + if f.Resource == "BUTTERFLY_KEY" { + t.Error("BUTTERFLY_KEY should not be flagged by FLY_ pattern") + } + if f.Resource == "FLYWEIGHT_INDEX" { + t.Error("FLYWEIGHT_INDEX should not be flagged by FLY_ pattern") + } + } +} + +// TestAPIKeyScanner_NameRegex_NewProviders verifies that new provider keywords +// added in this session are recognised. 
+func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) { + clearHighRiskEnv(t) + cases := []struct { + envVar string + value string + }{ + {"MY_GEMINI_KEY", "gemini-key-value"}, + {"VERTEX_API_KEY", "vertex-key-value"}, + {"BEDROCK_ACCESS_KEY", "bedrock-key-value"}, + {"AZURE_OPENAI_KEY", "azure-openai-key"}, + {"RESEND_API_KEY", "resend-key-value"}, + {"POSTMARK_TOKEN", "postmark-key-value"}, + {"MY_LINEAR_TOKEN", "linear-key-value"}, + {"NOTION_API_KEY", "notion-key-value"}, + {"AIRTABLE_KEY", "airtable-key-value"}, + {"SUPABASE_KEY", "supabase-key-value"}, + {"NEON_API_KEY", "neon-key-value"}, + {"PLANETSCALE_TOKEN", "ps-key-value"}, + } + + for _, tc := range cases { + t.Setenv(tc.envVar, tc.value) + } + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + for _, tc := range cases { + assertResource(t, result.Findings, tc.envVar) + } +} + +// TestAPIKeyScanner_ValuePattern_OpenAIProject verifies that a value matching the +// OpenAI project key format (sk-proj- + 48 chars = 56 total) produces a finding +// with the correct resource name and provider tag in the description. +func TestAPIKeyScanner_ValuePattern_OpenAIProject(t *testing.T) { + value := "sk-proj-" + strings.Repeat("a", 48) // total 56 chars + t.Setenv("SOME_AI_CRED", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "SOME_AI_CRED") + for _, f := range result.Findings { + if f.Resource == "SOME_AI_CRED" { + if !strings.Contains(f.Description, "OpenAI project") { + t.Errorf("expected description to contain %q, got %q", "OpenAI project", f.Description) + } + } + } + assertNoSecretValue(t, result.Findings, value) +} + +// TestAPIKeyScanner_ValuePattern_HuggingFace verifies that a value matching the +// HuggingFace token format (hf_ + 34 chars = 37 total) produces a correct finding. 
+func TestAPIKeyScanner_ValuePattern_HuggingFace(t *testing.T) { + value := "hf_" + strings.Repeat("b", 34) // total 37 chars + // Use a variable name that does NOT match any nameRegex pattern so the finding + // comes from scanValuePatterns (and the HuggingFace provider tag is in the description). + t.Setenv("ML_MODEL_CRED", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "ML_MODEL_CRED") + for _, f := range result.Findings { + if f.Resource == "ML_MODEL_CRED" { + if !strings.Contains(f.Description, "HuggingFace") { + t.Errorf("expected description to contain %q, got %q", "HuggingFace", f.Description) + } + } + } +} + +// TestAPIKeyScanner_ValuePattern_GitHub_ClassicPAT verifies that a value matching the +// GitHub classic PAT format (ghp_ + 36 chars = 40 total) produces a correct finding. +func TestAPIKeyScanner_ValuePattern_GitHub_ClassicPAT(t *testing.T) { + value := "ghp_" + strings.Repeat("c", 36) // total 40 chars + t.Setenv("WORK_GH_TOKEN", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "WORK_GH_TOKEN") + for _, f := range result.Findings { + if f.Resource == "WORK_GH_TOKEN" { + if !strings.Contains(f.Description, "GitHub") { + t.Errorf("expected description to contain %q, got %q", "GitHub", f.Description) + } + } + } +} + +// TestAPIKeyScanner_ValuePattern_NoMatchWrongLength verifies that a value with the +// right prefix but wrong length does NOT produce a finding. 
+func TestAPIKeyScanner_ValuePattern_NoMatchWrongLength(t *testing.T) { + value := "sk-proj-" + strings.Repeat("x", 10) // total 18 chars, wrong length for any pattern + t.Setenv("SOME_KEY", value) + clearHighRiskEnv(t) + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + for _, f := range result.Findings { + if f.Resource == "SOME_KEY" { + t.Errorf("got unexpected finding for SOME_KEY with wrong-length value") + } + } +} + // TestAPIKeyScanner_ValuePattern_TwilioSID verifies that a Twilio API key SID -// (SK + 32 hex chars = 34 total) produces a HIGH finding. -// The variable name is intentionally neutral (no provider keyword) so the finding -// comes from the value-pattern pass, confirming the pattern itself works. +// (SK + 32 hex chars = 34 total) produces an UNCERTAIN finding. +// The SK prefix is intentionally broad (any 34-char string starting with SK matches) +// so we use SeverityUncertain rather than SeverityHigh to avoid false positives. func TestAPIKeyScanner_ValuePattern_TwilioSID(t *testing.T) { value := "SK" + strings.Repeat("f", 32) // total 34 chars t.Setenv("CRED_SID", value) @@ -378,8 +701,8 @@ func TestAPIKeyScanner_ValuePattern_TwilioSID(t *testing.T) { assertResource(t, result.Findings, "CRED_SID") for _, f := range result.Findings { if f.Resource == "CRED_SID" { - if f.Severity != "HIGH" { - t.Errorf("expected HIGH severity for Twilio SID, got %q", f.Severity) + if f.Severity != "UNCERTAIN" { + t.Errorf("expected UNCERTAIN severity for Twilio SID (broad SK prefix), got %q", f.Severity) } if !strings.Contains(f.Description, "Twilio") { t.Errorf("expected description to contain %q, got %q", "Twilio", f.Description) From 994ba93be4869faf56cafc74f9f8c5ff35aeb809 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 10:35:42 +0100 Subject: [PATCH 06/17] fix: complete cross-pass dedup, fix (^|_) regexes, add XAI/ASSEMBLYAI/AI21/NVIDIA_NIM - Fix cross-pass dedup gap: scanEnvKeys now accepts and populates the shared seenEnvNames 
map, so extra_env_keys entries that also match a nameRegex pattern produce exactly one finding (scanEnvKeys wins as highest-priority pass). Regression test: TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex. - Fix FLY_, NEON_, PALM_ regexes: replace \bFLY_ / \bNEON_ / \bPALM_ with (^|_)FLY_ etc. In RE2, _ is a word character so \b does not fire between _ and a letter, meaning MY_FLY_TOKEN, MY_NEON_KEY, MY_PALM_KEY were silently missed. Tests updated to assert both the positive and negative cases. - Add name-regex patterns for XAI, ASSEMBLYAI, AI21, NVIDIA_NIM (reviewer suggestion). Tests added in TestAPIKeyScanner_NameRegex_NewAIProviders. --- internal/scan/apikeys.go | 32 ++++++++++++++++++--------- internal/scan/apikeys_test.go | 41 +++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 12 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index 8c40253..ab4c5d8 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -35,12 +35,16 @@ var nameRegexPatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)DEEPSEEK`), regexp.MustCompile(`(?i)PERPLEXITY`), regexp.MustCompile(`(?i)CEREBRAS`), + regexp.MustCompile(`(?i)XAI`), + regexp.MustCompile(`(?i)ASSEMBLYAI`), + regexp.MustCompile(`(?i)AI21`), + regexp.MustCompile(`(?i)NVIDIA_NIM`), // Secrets managers regexp.MustCompile(`(?i)DOPPLER`), // Google AI (Gemini, Vertex AI, PaLM) regexp.MustCompile(`(?i)GEMINI`), regexp.MustCompile(`(?i)VERTEX`), - regexp.MustCompile(`(?i)\bPALM_`), // word boundary + underscore avoids NAPALM_MODE, PALM_BEACH_PROPERTY + regexp.MustCompile(`(?i)(^|_)PALM_`), // (^|_) avoids NAPALM_MODE while matching MY_PALM_KEY // AWS AI regexp.MustCompile(`(?i)BEDROCK`), // Azure AI @@ -72,7 +76,7 @@ var nameRegexPatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)CLOUDFLARE`), regexp.MustCompile(`(?i)HEROKU`), regexp.MustCompile(`(?i)RAILWAY`), - regexp.MustCompile(`(?i)\bFLY_`), // word boundary prevents false positives (BUTTERFLY_KEY) + 
regexp.MustCompile(`(?i)(^|_)FLY_`), // (^|_) avoids BUTTERFLY_KEY, FLYWEIGHT_INDEX while matching MY_FLY_TOKEN // Source control regexp.MustCompile(`(?i)GITHUB`), regexp.MustCompile(`(?i)GITLAB`), @@ -83,7 +87,7 @@ var nameRegexPatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)AIRTABLE`), // Database-as-a-service (API keys / connection tokens) regexp.MustCompile(`(?i)SUPABASE`), - regexp.MustCompile(`(?i)\bNEON_`), // word boundary + underscore avoids ANEMONE_CONFIG, NEON_LIGHTS_COLOR + regexp.MustCompile(`(?i)(^|_)NEON_`), // (^|_) avoids ANEMONE_CONFIG, NEONLIGHTS_COLOR while matching MY_NEON_KEY regexp.MustCompile(`(?i)PLANETSCALE`), // Generic credential terms regexp.MustCompile(`(?i)API_KEY`), @@ -194,11 +198,15 @@ func (s *APIKeyScanner) Name() string { return "api_keys" } // Implements Scanner. Never returns skipped=true. func (s *APIKeyScanner) Scan() models.ScanResult { var findings []models.Finding - // seenEnvNames is shared across the name-regex and value-pattern passes so that a - // variable matching both (e.g. CUSTOM_STRIPE_KEY=sk_live_...) produces exactly one - // finding — the name-regex pass runs first and claims it. + // seenEnvNames is shared across all three env-scanning passes so that any variable + // claimed by an earlier pass is not re-reported by a later one. Order: + // 1. scanEnvKeys — exact-match built-in + user-configured extra keys + // 2. scanNameRegex — name-pattern heuristics (MY_OPENAI_KEY etc.) + // 3. scanValuePatterns — prefix+length value matching + // A variable in ExtraEnvKeys that also matches a nameRegex pattern therefore produces + // exactly one finding (from scanEnvKeys, the highest-priority pass). seenEnvNames := make(map[string]bool) - findings = append(findings, s.scanEnvKeys()...) + findings = append(findings, s.scanEnvKeys(seenEnvNames)...) findings = append(findings, s.scanNameRegex(seenEnvNames)...) findings = append(findings, s.scanValuePatterns(seenEnvNames)...) 
findings = append(findings, s.scanCredentialFiles()...) @@ -210,7 +218,9 @@ func (s *APIKeyScanner) Scan() models.ScanResult { // scanEnvKeys checks built-in and extra environment variable key names for presence. // Key names only are reported; values are never read or stored. -func (s *APIKeyScanner) scanEnvKeys() []models.Finding { +// seenEnvNames is the shared cross-pass dedup set; matched names are added to it so +// that scanNameRegex and scanValuePatterns will skip variables already claimed here. +func (s *APIKeyScanner) scanEnvKeys(seenEnvNames map[string]bool) []models.Finding { var findings []models.Finding // KEYS-01: Built-in high-risk env vars (sorted for deterministic output). @@ -222,6 +232,7 @@ func (s *APIKeyScanner) scanEnvKeys() []models.Finding { for _, key := range keys { if val := os.Getenv(key); val != "" { _ = val // value is intentionally discarded; presence only + seenEnvNames[key] = true findings = append(findings, envKeyFinding(key)) } } @@ -235,12 +246,13 @@ func (s *APIKeyScanner) scanEnvKeys() []models.Finding { copy(extraKeys, s.ExtraEnvKeys) sort.Strings(extraKeys) for _, key := range extraKeys { - if HighRiskEnvKeys[key] || seenExtra[key] { - continue // already covered by built-in check or earlier extra + if HighRiskEnvKeys[key] || seenExtra[key] || seenEnvNames[key] { + continue // already covered by built-in check, earlier extra, or another pass } seenExtra[key] = true if val := os.Getenv(key); val != "" { _ = val // value is intentionally discarded; presence only + seenEnvNames[key] = true findings = append(findings, envKeyFinding(key)) } } diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index d9badb5..99a0b52 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -551,14 +551,16 @@ func TestAPIKeyScanner_ValuePattern_Anthropic(t *testing.T) { func TestAPIKeyScanner_NameRegex_FLY_Anchored(t *testing.T) { clearHighRiskEnv(t) t.Setenv("FLY_API_TOKEN", "real-token") + 
t.Setenv("MY_FLY_TOKEN", "also-real-token") t.Setenv("BUTTERFLY_KEY", "not-a-fly-token") t.Setenv("FLYWEIGHT_INDEX", "not-a-token") s := newScannerWithHome(t.TempDir()) result := s.Scan() - // FLY_API_TOKEN must be flagged. + // FLY_API_TOKEN and MY_FLY_TOKEN must both be flagged. assertResource(t, result.Findings, "FLY_API_TOKEN") + assertResource(t, result.Findings, "MY_FLY_TOKEN") // BUTTERFLY_KEY and FLYWEIGHT_INDEX must NOT be flagged. for _, f := range result.Findings { @@ -754,13 +756,15 @@ func TestAPIKeyScanner_NameRegex_NEON_NarrowedPattern(t *testing.T) { // These should NOT be flagged. t.Setenv("ANEMONE_CONFIG", "some-value") t.Setenv("NEONLIGHTS_COLOR", "blue") - // This SHOULD be flagged. + // These SHOULD be flagged. t.Setenv("NEON_API_KEY", "real-neon-key") + t.Setenv("MY_NEON_KEY", "also-real-neon-key") s := newScannerWithHome(t.TempDir()) result := s.Scan() assertResource(t, result.Findings, "NEON_API_KEY") + assertResource(t, result.Findings, "MY_NEON_KEY") for _, f := range result.Findings { if f.Resource == "ANEMONE_CONFIG" { t.Error("ANEMONE_CONFIG should not be flagged by NEON_ pattern") @@ -794,12 +798,15 @@ func TestAPIKeyScanner_NameRegex_LINEAR_NarrowedPattern(t *testing.T) { func TestAPIKeyScanner_NameRegex_PALM_NarrowedPattern(t *testing.T) { clearHighRiskEnv(t) t.Setenv("NAPALM_MODE", "some-value") + // These SHOULD be flagged. 
t.Setenv("PALM_API_KEY", "real-palm-key") + t.Setenv("MY_PALM_KEY", "also-real-palm-key") s := newScannerWithHome(t.TempDir()) result := s.Scan() assertResource(t, result.Findings, "PALM_API_KEY") + assertResource(t, result.Findings, "MY_PALM_KEY") for _, f := range result.Findings { if f.Resource == "NAPALM_MODE" { t.Error("NAPALM_MODE should not be flagged by PALM_ pattern") @@ -821,6 +828,10 @@ func TestAPIKeyScanner_NameRegex_NewAIProviders(t *testing.T) { {"PERPLEXITY_API_KEY", "pplx-key-value"}, {"CEREBRAS_API_KEY", "cb-key-value"}, {"DOPPLER_TOKEN", "dp-token-value"}, + {"XAI_API_KEY", "xai-key-value"}, + {"ASSEMBLYAI_API_KEY", "aai-key-value"}, + {"AI21_API_KEY", "ai21-key-value"}, + {"NVIDIA_NIM_API_KEY", "nim-key-value"}, } for _, tc := range cases { t.Setenv(tc.envVar, tc.value) @@ -834,6 +845,32 @@ func TestAPIKeyScanner_NameRegex_NewAIProviders(t *testing.T) { } } +// TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex verifies that a key listed in +// ExtraEnvKeys whose name also matches a nameRegexPattern produces exactly ONE finding. +// Previously scanEnvKeys and scanNameRegex were not sharing the seenEnvNames dedup map, +// so MY_OPENAI_KEY in extra_env_keys would fire twice. 
+func TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex(t *testing.T) { + const key = "MY_OPENAI_KEY" // matches OPENAI nameRegexPattern AND is in ExtraEnvKeys + t.Setenv(key, "sk-test-value") + clearHighRiskEnv(t) + + s := &scan.APIKeyScanner{ + HomeDir: t.TempDir(), + ExtraEnvKeys: []string{key}, + } + result := s.Scan() + + count := 0 + for _, f := range result.Findings { + if f.Resource == key { + count++ + } + } + if count != 1 { + t.Errorf("expected exactly 1 finding for %q (ExtraEnvKeys + nameRegex cross-pass dedup), got %d", key, count) + } +} + // TestAPIKeyScanner_ValuePattern_BuiltinSkipped verifies that a key in HighRiskEnvKeys // whose value also matches a value pattern produces exactly ONE finding (from scanEnvKeys, // not from scanValuePatterns which skips it). From 2d7f91d226e1bf37e6e26381a5067f5c4202f480 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 10:40:29 +0100 Subject: [PATCH 07/17] Update --- internal/scan/apikeys.go | 19 ++++++++++++------- internal/scan/apikeys_test.go | 19 +++++++++++++++++++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index ab4c5d8..db9c98c 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -365,11 +365,18 @@ func (s *APIKeyScanner) scanCredentialFiles() []models.Finding { // If home directory cannot be resolved, skip all ~-based paths to avoid // scanning incorrect root-relative paths (e.g. /.aws/credentials). homeDir := s.resolveHomeDir() + // seenPath is shared across built-in and extra loops so that an extra path + // duplicating a built-in (e.g. ~/.netrc in both lists) produces only one finding. 
+ seenPath := make(map[string]bool) for _, cf := range credentialFiles { if homeDir == "" && len(cf.Path) > 0 && cf.Path[0] == '~' { continue } - expanded := expandHome(cf.Path, homeDir) + expanded := filepath.Clean(expandHome(cf.Path, homeDir)) + if seenPath[expanded] { + continue + } + seenPath[expanded] = true if fsutil.Exists(expanded) { findings = append(findings, models.Finding{ Scanner: "api_keys", @@ -381,17 +388,15 @@ func (s *APIKeyScanner) scanCredentialFiles() []models.Finding { } // Extra credential files from user config. - // Deduplicate by expanded path to avoid reporting the same file twice. - seenExtraPath := make(map[string]bool, len(s.ExtraCredentialFiles)) for _, cf := range s.ExtraCredentialFiles { if homeDir == "" && len(cf.Path) > 0 && cf.Path[0] == '~' { continue } - expanded := expandHome(cf.Path, homeDir) - if seenExtraPath[expanded] { - continue // duplicate path in extras list + expanded := filepath.Clean(expandHome(cf.Path, homeDir)) + if seenPath[expanded] { + continue // already reported by built-in or earlier extra } - seenExtraPath[expanded] = true + seenPath[expanded] = true if fsutil.Exists(expanded) { findings = append(findings, models.Finding{ Scanner: "api_keys", diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index 99a0b52..ff13f23 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -246,6 +246,25 @@ func TestAPIKeyScanner_NoDuplicateFindings(t *testing.T) { return tokenFile }, }, + { + name: "extra credential file duplicates built-in path", + makeScanner: func(home string) *scan.APIKeyScanner { + return &scan.APIKeyScanner{ + HomeDir: home, + ExtraCredentialFiles: []config.CredentialFile{ + {Path: "~/.netrc", Label: "netrc (duplicate of built-in)"}, + }, + } + }, + setup: func(t *testing.T, home string) string { + netrcFile := filepath.Join(home, ".netrc") + if err := os.WriteFile(netrcFile, []byte("machine example.com"), 0o600); err != nil { + t.Fatalf("create .netrc: 
%v", err) + } + clearHighRiskEnv(t) + return netrcFile + }, + }, } for _, tc := range cases { From f5f446d2fec6754d1435ee7c19015c85c63b7134 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 10:44:36 +0100 Subject: [PATCH 08/17] Simplify --- internal/scan/apikeys.go | 44 ++---- internal/scan/apikeys_test.go | 278 +++++----------------------------- 2 files changed, 52 insertions(+), 270 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index db9c98c..94052ad 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -230,28 +230,22 @@ func (s *APIKeyScanner) scanEnvKeys(seenEnvNames map[string]bool) []models.Findi } sort.Strings(keys) for _, key := range keys { - if val := os.Getenv(key); val != "" { - _ = val // value is intentionally discarded; presence only + if os.Getenv(key) != "" { seenEnvNames[key] = true findings = append(findings, envKeyFinding(key)) } } // Extra env keys from user config (sorted for deterministic output). - // Skip any that are already in the built-in set or seen earlier in the extras - // list to avoid duplicate findings. if len(s.ExtraEnvKeys) > 0 { - seenExtra := make(map[string]bool, len(s.ExtraEnvKeys)) extraKeys := make([]string, len(s.ExtraEnvKeys)) copy(extraKeys, s.ExtraEnvKeys) sort.Strings(extraKeys) for _, key := range extraKeys { - if HighRiskEnvKeys[key] || seenExtra[key] || seenEnvNames[key] { - continue // already covered by built-in check, earlier extra, or another pass + if HighRiskEnvKeys[key] || seenEnvNames[key] { + continue } - seenExtra[key] = true - if val := os.Getenv(key); val != "" { - _ = val // value is intentionally discarded; presence only + if os.Getenv(key) != "" { seenEnvNames[key] = true findings = append(findings, envKeyFinding(key)) } @@ -365,10 +359,12 @@ func (s *APIKeyScanner) scanCredentialFiles() []models.Finding { // If home directory cannot be resolved, skip all ~-based paths to avoid // scanning incorrect root-relative paths (e.g. 
/.aws/credentials). homeDir := s.resolveHomeDir() - // seenPath is shared across built-in and extra loops so that an extra path - // duplicating a built-in (e.g. ~/.netrc in both lists) produces only one finding. - seenPath := make(map[string]bool) - for _, cf := range credentialFiles { + // Combine built-in and extra credential files into a single pass. + // seenPath deduplicates so that an extra path duplicating a built-in + // (e.g. ~/.netrc in both lists) produces only one finding. + allCredFiles := append(credentialFiles, s.ExtraCredentialFiles...) + seenPath := make(map[string]bool, len(allCredFiles)) + for _, cf := range allCredFiles { if homeDir == "" && len(cf.Path) > 0 && cf.Path[0] == '~' { continue } @@ -387,26 +383,6 @@ func (s *APIKeyScanner) scanCredentialFiles() []models.Finding { } } - // Extra credential files from user config. - for _, cf := range s.ExtraCredentialFiles { - if homeDir == "" && len(cf.Path) > 0 && cf.Path[0] == '~' { - continue - } - expanded := filepath.Clean(expandHome(cf.Path, homeDir)) - if seenPath[expanded] { - continue // already reported by built-in or earlier extra - } - seenPath[expanded] = true - if fsutil.Exists(expanded) { - findings = append(findings, models.Finding{ - Scanner: "api_keys", - Resource: expanded, // path only, never file contents - Severity: models.SeverityModerate, - Description: fmt.Sprintf("Credential file readable at %s.", expanded), - }) - } - } - return findings } diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index ff13f23..bedc6f9 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -384,184 +384,54 @@ func TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin(t *testing.T) { // ── Value-pattern tests ─────────────────────────────────────────────────────── -// TestAPIKeyScanner_ValuePattern_AmbiguousSK verifies that a value matching the -// generic sk- format (51 chars) produces an UNCERTAIN finding, not HIGH, because -// sk- is used by many 
tools beyond OpenAI legacy. -func TestAPIKeyScanner_ValuePattern_AmbiguousSK(t *testing.T) { - value := "sk-" + strings.Repeat("x", 48) // total 51 chars - t.Setenv("SOME_CRED", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "SOME_CRED") - for _, f := range result.Findings { - if f.Resource == "SOME_CRED" { - if f.Severity != "UNCERTAIN" { - t.Errorf("expected UNCERTAIN severity for ambiguous sk- key, got %q", f.Severity) - } - } - } - assertNoSecretValue(t, result.Findings, value) -} - -// TestAPIKeyScanner_ValuePattern_StripeLiveSecret verifies that a Stripe live secret key -// (sk_live_ + 47 chars = 55 total) produces a HIGH finding. -func TestAPIKeyScanner_ValuePattern_StripeLiveSecret(t *testing.T) { - value := "sk_live_" + strings.Repeat("s", 47) // total 55 chars - t.Setenv("PAYMENT_KEY", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "PAYMENT_KEY") - for _, f := range result.Findings { - if f.Resource == "PAYMENT_KEY" { - if f.Severity != "HIGH" { - t.Errorf("expected HIGH severity for Stripe live key, got %q", f.Severity) - } - if !strings.Contains(f.Description, "Stripe") { - t.Errorf("expected description to contain %q, got %q", "Stripe", f.Description) - } - } - } - assertNoSecretValue(t, result.Findings, value) -} - -// TestAPIKeyScanner_ValuePattern_StripeTestSecret verifies that a Stripe test secret key -// (sk_test_ + 47 chars = 55 total) produces a HIGH finding. 
-func TestAPIKeyScanner_ValuePattern_StripeTestSecret(t *testing.T) { - value := "sk_test_" + strings.Repeat("t", 47) // total 55 chars - t.Setenv("TEST_PAYMENT_KEY", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "TEST_PAYMENT_KEY") - for _, f := range result.Findings { - if f.Resource == "TEST_PAYMENT_KEY" { - if !strings.Contains(f.Description, "Stripe") { - t.Errorf("expected description to contain %q, got %q", "Stripe", f.Description) - } - } - } -} - -// TestAPIKeyScanner_ValuePattern_GitLabPAT verifies that a GitLab personal access token -// (glpat- + 20 chars = 26 total) produces a HIGH finding. -func TestAPIKeyScanner_ValuePattern_GitLabPAT(t *testing.T) { - value := "glpat-" + strings.Repeat("g", 20) // total 26 chars - t.Setenv("REPO_TOKEN", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "REPO_TOKEN") - for _, f := range result.Findings { - if f.Resource == "REPO_TOKEN" { - if f.Severity != "HIGH" { - t.Errorf("expected HIGH severity for GitLab PAT, got %q", f.Severity) - } - if !strings.Contains(f.Description, "GitLab") { - t.Errorf("expected description to contain %q, got %q", "GitLab", f.Description) - } - } - } - assertNoSecretValue(t, result.Findings, value) -} - -// TestAPIKeyScanner_ValuePattern_NpmToken verifies that an npm granular access token -// (npm_ + 36 chars = 40 total) produces a HIGH finding. 
-func TestAPIKeyScanner_ValuePattern_NpmToken(t *testing.T) { - value := "npm_" + strings.Repeat("n", 36) // total 40 chars - t.Setenv("REGISTRY_KEY", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "REGISTRY_KEY") - for _, f := range result.Findings { - if f.Resource == "REGISTRY_KEY" { - if !strings.Contains(f.Description, "npm") { - t.Errorf("expected description to contain %q, got %q", "npm", f.Description) - } - } - } -} - -// TestAPIKeyScanner_ValuePattern_Groq verifies that a Groq key (gsk_ + 52 chars = 56 total) -// produces a HIGH finding. -func TestAPIKeyScanner_ValuePattern_Groq(t *testing.T) { - value := "gsk_" + strings.Repeat("q", 52) // total 56 chars - t.Setenv("INFERENCE_KEY", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "INFERENCE_KEY") - for _, f := range result.Findings { - if f.Resource == "INFERENCE_KEY" { - if f.Severity != "HIGH" { - t.Errorf("expected HIGH severity for Groq key, got %q", f.Severity) - } - if !strings.Contains(f.Description, "Groq") { - t.Errorf("expected description to contain %q, got %q", "Groq", f.Description) - } - } - } - assertNoSecretValue(t, result.Findings, value) -} - -// TestAPIKeyScanner_ValuePattern_SendGrid verifies that a SendGrid key -// (SG. + 22 chars + . + 43 chars = 69 total) produces a HIGH finding. -func TestAPIKeyScanner_ValuePattern_SendGrid(t *testing.T) { - // SG. (3) + 22 chars + . (1) + 43 chars = 69 total - value := "SG." + strings.Repeat("a", 22) + "." 
+ strings.Repeat("b", 43) - t.Setenv("MAIL_KEY", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "MAIL_KEY") - for _, f := range result.Findings { - if f.Resource == "MAIL_KEY" { - if !strings.Contains(f.Description, "SendGrid") { - t.Errorf("expected description to contain %q, got %q", "SendGrid", f.Description) - } - } +// TestAPIKeyScanner_ValuePatterns verifies that each known provider value pattern +// produces a finding with the correct severity and provider tag in the description. +// Variable names are intentionally neutral (no provider keyword) so the finding +// comes from scanValuePatterns, not scanNameRegex. +func TestAPIKeyScanner_ValuePatterns(t *testing.T) { + cases := []struct { + name string + envVar string + value string + wantSeverity string + wantDescSub string // substring expected in description + }{ + {"ambiguous sk-", "SOME_CRED", "sk-" + strings.Repeat("x", 48), "UNCERTAIN", "possible OpenAI legacy"}, + {"Stripe live secret", "PAYMENT_KEY", "sk_live_" + strings.Repeat("s", 47), "HIGH", "Stripe"}, + {"Stripe test secret", "TEST_PAYMENT_KEY", "sk_test_" + strings.Repeat("t", 47), "HIGH", "Stripe"}, + {"GitLab PAT", "REPO_TOKEN", "glpat-" + strings.Repeat("g", 20), "HIGH", "GitLab"}, + {"npm token", "REGISTRY_KEY", "npm_" + strings.Repeat("n", 36), "HIGH", "npm"}, + {"Groq", "INFERENCE_KEY", "gsk_" + strings.Repeat("q", 52), "HIGH", "Groq"}, + {"SendGrid", "MAIL_KEY", "SG." + strings.Repeat("a", 22) + "." 
+ strings.Repeat("b", 43), "HIGH", "SendGrid"}, + {"Anthropic", "LLM_KEY", "sk-ant-" + strings.Repeat("a", 101), "HIGH", "Anthropic"}, + {"OpenAI project", "SOME_AI_CRED", "sk-proj-" + strings.Repeat("a", 48), "HIGH", "OpenAI project"}, + {"HuggingFace", "ML_MODEL_CRED", "hf_" + strings.Repeat("b", 34), "HIGH", "HuggingFace"}, + {"GitHub classic PAT", "WORK_GH_TOKEN", "ghp_" + strings.Repeat("c", 36), "HIGH", "GitHub"}, + {"Twilio SID", "CRED_SID", "SK" + strings.Repeat("f", 32), "UNCERTAIN", "Twilio"}, } - assertNoSecretValue(t, result.Findings, value) -} -// TestAPIKeyScanner_ValuePattern_Anthropic verifies that an Anthropic key -// (sk-ant- prefix, 108 total chars) produces a HIGH finding. -func TestAPIKeyScanner_ValuePattern_Anthropic(t *testing.T) { - value := "sk-ant-" + strings.Repeat("a", 101) // total 108 chars - t.Setenv("LLM_KEY", value) - clearHighRiskEnv(t) + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Setenv(tc.envVar, tc.value) + clearHighRiskEnv(t) - s := newScannerWithHome(t.TempDir()) - result := s.Scan() + s := newScannerWithHome(t.TempDir()) + result := s.Scan() - assertResource(t, result.Findings, "LLM_KEY") - for _, f := range result.Findings { - if f.Resource == "LLM_KEY" { - if f.Severity != "HIGH" { - t.Errorf("expected HIGH severity for Anthropic key, got %q", f.Severity) - } - if !strings.Contains(f.Description, "Anthropic") { - t.Errorf("expected description to contain %q, got %q", "Anthropic", f.Description) + assertResource(t, result.Findings, tc.envVar) + for _, f := range result.Findings { + if f.Resource == tc.envVar { + if string(f.Severity) != tc.wantSeverity { + t.Errorf("severity: got %q, want %q", f.Severity, tc.wantSeverity) + } + if !strings.Contains(f.Description, tc.wantDescSub) { + t.Errorf("description %q missing %q", f.Description, tc.wantDescSub) + } + } } - } + assertNoSecretValue(t, result.Findings, tc.value) + }) } - assertNoSecretValue(t, result.Findings, value) } // 
TestAPIKeyScanner_NameRegex_FLY_Anchored verifies that FLY_ matches FLY_API_TOKEN @@ -626,70 +496,6 @@ func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) { } } -// TestAPIKeyScanner_ValuePattern_OpenAIProject verifies that a value matching the -// OpenAI project key format (sk-proj- + 48 chars = 56 total) produces a finding -// with the correct resource name and provider tag in the description. -func TestAPIKeyScanner_ValuePattern_OpenAIProject(t *testing.T) { - value := "sk-proj-" + strings.Repeat("a", 48) // total 56 chars - t.Setenv("SOME_AI_CRED", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "SOME_AI_CRED") - for _, f := range result.Findings { - if f.Resource == "SOME_AI_CRED" { - if !strings.Contains(f.Description, "OpenAI project") { - t.Errorf("expected description to contain %q, got %q", "OpenAI project", f.Description) - } - } - } - assertNoSecretValue(t, result.Findings, value) -} - -// TestAPIKeyScanner_ValuePattern_HuggingFace verifies that a value matching the -// HuggingFace token format (hf_ + 34 chars = 37 total) produces a correct finding. -func TestAPIKeyScanner_ValuePattern_HuggingFace(t *testing.T) { - value := "hf_" + strings.Repeat("b", 34) // total 37 chars - // Use a variable name that does NOT match any nameRegex pattern so the finding - // comes from scanValuePatterns (and the HuggingFace provider tag is in the description). 
- t.Setenv("ML_MODEL_CRED", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "ML_MODEL_CRED") - for _, f := range result.Findings { - if f.Resource == "ML_MODEL_CRED" { - if !strings.Contains(f.Description, "HuggingFace") { - t.Errorf("expected description to contain %q, got %q", "HuggingFace", f.Description) - } - } - } -} - -// TestAPIKeyScanner_ValuePattern_GitHub_ClassicPAT verifies that a value matching the -// GitHub classic PAT format (ghp_ + 36 chars = 40 total) produces a correct finding. -func TestAPIKeyScanner_ValuePattern_GitHub_ClassicPAT(t *testing.T) { - value := "ghp_" + strings.Repeat("c", 36) // total 40 chars - t.Setenv("WORK_GH_TOKEN", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "WORK_GH_TOKEN") - for _, f := range result.Findings { - if f.Resource == "WORK_GH_TOKEN" { - if !strings.Contains(f.Description, "GitHub") { - t.Errorf("expected description to contain %q, got %q", "GitHub", f.Description) - } - } - } -} - // TestAPIKeyScanner_ValuePattern_NoMatchWrongLength verifies that a value with the // right prefix but wrong length does NOT produce a finding. func TestAPIKeyScanner_ValuePattern_NoMatchWrongLength(t *testing.T) { From 602fbdd7812231c080a0be6f8ea726969049ae40 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 10:47:00 +0100 Subject: [PATCH 09/17] Fix tests --- internal/scan/apikeys_test.go | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index bedc6f9..6051c31 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -18,6 +18,19 @@ func clearHighRiskEnv(t *testing.T) { } } +// clearAllEnv sets every environment variable to empty for the duration of the test. +// Use this in tests that assert 0 findings, since nameRegex patterns (e.g. 
(?i)GITHUB) +// can match CI variables like GITHUB_WORKSPACE that aren't credentials. +// t.Setenv restores original values after the test. +func clearAllEnv(t *testing.T) { + t.Helper() + for _, entry := range os.Environ() { + if idx := strings.IndexByte(entry, '='); idx >= 0 { + t.Setenv(entry[:idx], "") + } + } +} + // newScannerWithHome creates an APIKeyScanner with HomeDir set to home and no extras. func newScannerWithHome(home string) *scan.APIKeyScanner { s := scan.NewAPIKeyScanner() @@ -66,7 +79,7 @@ func TestAPIKeyScanner_NeverStoresSecretValue(t *testing.T) { } func TestAPIKeyScanner_EmptyEnvNoFindings(t *testing.T) { - clearHighRiskEnv(t) + clearAllEnv(t) s := newScannerWithHome(t.TempDir()) result := s.Scan() @@ -131,7 +144,7 @@ func TestAPIKeyScanner_CredentialFileContentNotInFindings(t *testing.T) { } func TestAPIKeyScanner_NoCredentialFileNoFinding(t *testing.T) { - clearHighRiskEnv(t) + clearAllEnv(t) s := newScannerWithHome(t.TempDir()) result := s.Scan() From c7b9c0044b2f646a7787a221b450978203db50f2 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 10:58:35 +0100 Subject: [PATCH 10/17] Fixes --- internal/scan/apikeys.go | 89 +++++++++++++++++++---------------- internal/scan/apikeys_test.go | 72 ++++++++-------------------- 2 files changed, 68 insertions(+), 93 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index 94052ad..b805fe4 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -13,10 +13,14 @@ import ( "github.com/Pringled/agentcheck/internal/models" ) -// nameRegexPatterns is compiled once at package init. It matches env var names that suggest -// they hold credentials for known providers or generic secret terms. -// Case-insensitive match on the full variable name. -var nameRegexPatterns = []*regexp.Regexp{ +// credentialSuffixRe matches env var names that contain a credential-related term. 
+// Provider name patterns require this suffix to avoid false positives on non-credential +// vars like GITHUB_WORKSPACE or OPENAI_BASE_URL. +var credentialSuffixRe = regexp.MustCompile(`(?i)(KEY|TOKEN|SECRET|PASSWORD|CRED)`) + +// providerNamePatterns matches env var names containing a known provider keyword. +// These only produce a finding when the name also matches credentialSuffixRe. +var providerNamePatterns = []*regexp.Regexp{ // AI / ML providers regexp.MustCompile(`(?i)OPENAI`), regexp.MustCompile(`(?i)ANTHROPIC`), @@ -81,15 +85,19 @@ var nameRegexPatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)GITHUB`), regexp.MustCompile(`(?i)GITLAB`), regexp.MustCompile(`(?i)BITBUCKET`), - // Productivity / project tools (common in agent contexts) - regexp.MustCompile(`(?i)(^|_)LINEAR_`), // (^|_) avoids BILINEAR_FILTER while still matching MY_LINEAR_TOKEN + // Productivity / project tools + regexp.MustCompile(`(?i)(^|_)LINEAR_`), // (^|_) avoids BILINEAR_FILTER while matching MY_LINEAR_TOKEN regexp.MustCompile(`(?i)NOTION`), regexp.MustCompile(`(?i)AIRTABLE`), - // Database-as-a-service (API keys / connection tokens) + // Database-as-a-service regexp.MustCompile(`(?i)SUPABASE`), regexp.MustCompile(`(?i)(^|_)NEON_`), // (^|_) avoids ANEMONE_CONFIG, NEONLIGHTS_COLOR while matching MY_NEON_KEY regexp.MustCompile(`(?i)PLANETSCALE`), - // Generic credential terms +} + +// credentialSuffixPatterns matches generic credential terms in env var names. +// These match standalone without requiring a provider keyword. +var credentialSuffixPatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)API_KEY`), regexp.MustCompile(`(?i)API_TOKEN`), regexp.MustCompile(`(?i)SECRET_KEY`), @@ -109,15 +117,15 @@ type valuePattern struct { // valuePatterns lists known API key formats identified by a distinctive prefix and exact total length. var valuePatterns = []valuePattern{ - // OpenAI — more-specific prefixes listed first so they match before the generic sk- entry. 
+ // OpenAI - more-specific prefixes listed first so they match before the generic sk- entry. {prefix: "sk-proj-", totalLen: 56, severity: models.SeverityHigh, providerTag: "OpenAI project"}, {prefix: "sk-admin-", totalLen: 57, severity: models.SeverityHigh, providerTag: "OpenAI admin"}, - // sk- is shared by many tools (OpenAI legacy, LangChain proxies, self-hosted LLMs, …). + // sk- is shared by many tools (OpenAI legacy, LangChain proxies, self-hosted LLMs, etc.). // Flag as UNCERTAIN so the user can confirm the actual provider via the variable name. {prefix: "sk-", totalLen: 51, severity: models.SeverityUncertain, providerTag: "possible OpenAI legacy or other sk- key"}, - // Anthropic — prefix is distinctive enough for HIGH confidence. + // Anthropic - prefix is distinctive enough for HIGH confidence. {prefix: "sk-ant-", totalLen: 108, severity: models.SeverityHigh, providerTag: "Anthropic"}, - // Stripe — underscore separator makes these provider-specific. + // Stripe - underscore separator makes these provider-specific. {prefix: "sk_live_", totalLen: 55, severity: models.SeverityHigh, providerTag: "Stripe live secret"}, {prefix: "sk_test_", totalLen: 55, severity: models.SeverityHigh, providerTag: "Stripe test secret"}, {prefix: "rk_live_", totalLen: 55, severity: models.SeverityHigh, providerTag: "Stripe live restricted"}, @@ -128,9 +136,8 @@ var valuePatterns = []valuePattern{ {prefix: "npm_", totalLen: 40, severity: models.SeverityHigh, providerTag: "npm access token"}, // Groq — gsk_ prefix confirmed in Groq docs. {prefix: "gsk_", totalLen: 56, severity: models.SeverityHigh, providerTag: "Groq"}, - // Twilio API key SID — SK + 32 hex chars = 34 total. - // SeverityUncertain: the SK prefix is too broad (any 34-char string starting with SK - // would match); we don't validate the hex charset, so false positives are likely. + // Twilio API key SID - SK + 32 hex chars = 34 total. + // SeverityUncertain: SK prefix is broad, false positives are likely. 
{prefix: "SK", totalLen: 34, severity: models.SeverityUncertain, providerTag: "Twilio API key SID"}, // SendGrid — SG. + 22 + . + 43 = 69 total (with the dots). {prefix: "SG.", totalLen: 69, severity: models.SeverityHigh, providerTag: "SendGrid"}, @@ -160,9 +167,6 @@ var credentialFiles = []config.CredentialFile{ // APIKeyScanner scans for high-risk API keys in environment variables and credential config files. // Key names and file paths only are reported in findings; values and file contents are never emitted. -// Exception: scanValuePatterns transiently reads env var values solely for prefix+length pattern -// matching; values are discarded immediately and never stored in findings, logs, or any -// data structure. See scanValuePatterns for the full security contract. // It never returns skipped=true. type APIKeyScanner struct { Base @@ -218,8 +222,6 @@ func (s *APIKeyScanner) Scan() models.ScanResult { // scanEnvKeys checks built-in and extra environment variable key names for presence. // Key names only are reported; values are never read or stored. -// seenEnvNames is the shared cross-pass dedup set; matched names are added to it so -// that scanNameRegex and scanValuePatterns will skip variables already claimed here. func (s *APIKeyScanner) scanEnvKeys(seenEnvNames map[string]bool) []models.Finding { var findings []models.Finding @@ -284,30 +286,39 @@ func (s *APIKeyScanner) scanNameRegex(seenEnvNames map[string]bool) []models.Fin continue } - for _, re := range nameRegexPatterns { - if re.MatchString(name) { - seenEnvNames[name] = true - findings = append(findings, models.Finding{ - Scanner: "api_keys", - Resource: name, // key name only, never the value - Severity: models.SeverityHigh, - Description: "Can be used to make authenticated API calls.", - }) + matched := false + // Provider patterns require the name to also contain a credential suffix. 
+ for _, re := range providerNamePatterns { + if re.MatchString(name) && credentialSuffixRe.MatchString(name) { + matched = true break } } + // Credential suffix patterns match standalone. + if !matched { + for _, re := range credentialSuffixPatterns { + if re.MatchString(name) { + matched = true + break + } + } + } + if matched { + seenEnvNames[name] = true + findings = append(findings, models.Finding{ + Scanner: "api_keys", + Resource: name, + Severity: models.SeverityHigh, + Description: "Can be used to make authenticated API calls.", + }) + } } return findings } // scanValuePatterns reads env var values to match against known provider prefixes. -// NOTE: unlike scanEnvKeys and scanNameRegex, this method reads the actual value. -// Values are used only for prefix+length pattern matching and then discarded immediately. -// No value is stored in findings, logs, or returned data structures. -// This is a deliberate, scoped relaxation of the "values never read" contract. -// seenEnvNames is the shared cross-pass dedup set; names already claimed by scanNameRegex -// are skipped, and newly matched names are added. +// Values are used only for prefix+length matching and then discarded. func (s *APIKeyScanner) scanValuePatterns(seenEnvNames map[string]bool) []models.Finding { var findings []models.Finding @@ -344,7 +355,6 @@ func (s *APIKeyScanner) scanValuePatterns(seenEnvNames map[string]bool) []models break // one finding per variable name } } - // value goes out of scope here; it is not stored anywhere } return findings @@ -359,9 +369,7 @@ func (s *APIKeyScanner) scanCredentialFiles() []models.Finding { // If home directory cannot be resolved, skip all ~-based paths to avoid // scanning incorrect root-relative paths (e.g. /.aws/credentials). homeDir := s.resolveHomeDir() - // Combine built-in and extra credential files into a single pass. - // seenPath deduplicates so that an extra path duplicating a built-in - // (e.g. 
~/.netrc in both lists) produces only one finding. + // seenPath deduplicates built-in and extra paths. allCredFiles := append(credentialFiles, s.ExtraCredentialFiles...) seenPath := make(map[string]bool, len(allCredFiles)) for _, cf := range allCredFiles { @@ -386,11 +394,10 @@ func (s *APIKeyScanner) scanCredentialFiles() []models.Finding { return findings } -// envKeyFinding builds a HIGH severity finding for a detected environment variable key. func envKeyFinding(key string) models.Finding { return models.Finding{ Scanner: "api_keys", - Resource: key, // key name only, never the value + Resource: key, Severity: models.SeverityHigh, Description: "Can be used to make authenticated API calls.", } diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index 6051c31..dd2035e 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -19,9 +19,6 @@ func clearHighRiskEnv(t *testing.T) { } // clearAllEnv sets every environment variable to empty for the duration of the test. -// Use this in tests that assert 0 findings, since nameRegex patterns (e.g. (?i)GITHUB) -// can match CI variables like GITHUB_WORKSPACE that aren't credentials. -// t.Setenv restores original values after the test. func clearAllEnv(t *testing.T) { t.Helper() for _, entry := range os.Environ() { @@ -31,7 +28,6 @@ func clearAllEnv(t *testing.T) { } } -// newScannerWithHome creates an APIKeyScanner with HomeDir set to home and no extras. func newScannerWithHome(home string) *scan.APIKeyScanner { s := scan.NewAPIKeyScanner() s.HomeDir = home @@ -342,11 +338,6 @@ func TestAPIKeyScanner_ExtraCredentialFiles_TildeExpanded(t *testing.T) { assertResource(t, result.Findings, tokenFile) } -// ── Name-regex tests ────────────────────────────────────────────────────────── - -// TestAPIKeyScanner_NameRegex_ProviderKeyword verifies that an env var with a -// provider keyword in its name (MY_OPENAI_KEY) is flagged even though it is not -// in HighRiskEnvKeys. 
func TestAPIKeyScanner_NameRegex_ProviderKeyword(t *testing.T) { t.Setenv("MY_OPENAI_KEY", "sk-something") clearHighRiskEnv(t) @@ -357,8 +348,6 @@ func TestAPIKeyScanner_NameRegex_ProviderKeyword(t *testing.T) { assertResource(t, result.Findings, "MY_OPENAI_KEY") } -// TestAPIKeyScanner_NameRegex_GenericTerm verifies that an env var containing a -// generic credential term (INTERNAL_API_KEY) is flagged. func TestAPIKeyScanner_NameRegex_GenericTerm(t *testing.T) { t.Setenv("INTERNAL_API_KEY", "secret") clearHighRiskEnv(t) @@ -369,9 +358,6 @@ func TestAPIKeyScanner_NameRegex_GenericTerm(t *testing.T) { assertResource(t, result.Findings, "INTERNAL_API_KEY") } -// TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin verifies that a key already in -// HighRiskEnvKeys (OPENAI_API_KEY) produces exactly ONE finding — scanEnvKeys() gets it -// and scanNameRegex() skips it. func TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin(t *testing.T) { t.Setenv("OPENAI_API_KEY", "sk-test") // Clear all built-in keys except OPENAI_API_KEY. @@ -395,12 +381,6 @@ func TestAPIKeyScanner_NameRegex_NoDuplicateWithBuiltin(t *testing.T) { } } -// ── Value-pattern tests ─────────────────────────────────────────────────────── - -// TestAPIKeyScanner_ValuePatterns verifies that each known provider value pattern -// produces a finding with the correct severity and provider tag in the description. -// Variable names are intentionally neutral (no provider keyword) so the finding -// comes from scanValuePatterns, not scanNameRegex. func TestAPIKeyScanner_ValuePatterns(t *testing.T) { cases := []struct { name string @@ -447,9 +427,6 @@ func TestAPIKeyScanner_ValuePatterns(t *testing.T) { } } -// TestAPIKeyScanner_NameRegex_FLY_Anchored verifies that FLY_ matches FLY_API_TOKEN -// but does NOT match BUTTERFLY_KEY (which contains the substring FLY_ but should not -// be treated as a Fly.io credential due to the word-boundary anchor in the pattern). 
func TestAPIKeyScanner_NameRegex_FLY_Anchored(t *testing.T) { clearHighRiskEnv(t) t.Setenv("FLY_API_TOKEN", "real-token") @@ -475,8 +452,6 @@ func TestAPIKeyScanner_NameRegex_FLY_Anchored(t *testing.T) { } } -// TestAPIKeyScanner_NameRegex_NewProviders verifies that new provider keywords -// added in this session are recognised. func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) { clearHighRiskEnv(t) cases := []struct { @@ -509,8 +484,6 @@ func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) { } } -// TestAPIKeyScanner_ValuePattern_NoMatchWrongLength verifies that a value with the -// right prefix but wrong length does NOT produce a finding. func TestAPIKeyScanner_ValuePattern_NoMatchWrongLength(t *testing.T) { value := "sk-proj-" + strings.Repeat("x", 10) // total 18 chars, wrong length for any pattern t.Setenv("SOME_KEY", value) @@ -526,10 +499,6 @@ func TestAPIKeyScanner_ValuePattern_NoMatchWrongLength(t *testing.T) { } } -// TestAPIKeyScanner_ValuePattern_TwilioSID verifies that a Twilio API key SID -// (SK + 32 hex chars = 34 total) produces an UNCERTAIN finding. -// The SK prefix is intentionally broad (any 34-char string starting with SK matches) -// so we use SeverityUncertain rather than SeverityHigh to avoid false positives. func TestAPIKeyScanner_ValuePattern_TwilioSID(t *testing.T) { value := "SK" + strings.Repeat("f", 32) // total 34 chars t.Setenv("CRED_SID", value) @@ -552,9 +521,6 @@ func TestAPIKeyScanner_ValuePattern_TwilioSID(t *testing.T) { assertNoSecretValue(t, result.Findings, value) } -// TestAPIKeyScanner_CrossPassDedup_NameRegexWins verifies that a variable whose name -// matches a nameRegex pattern AND whose value matches a value pattern produces exactly -// ONE finding — from the name-regex pass — not two. func TestAPIKeyScanner_CrossPassDedup_NameRegexWins(t *testing.T) { // CUSTOM_STRIPE_KEY matches the STRIPE name-regex. // sk_live_ + 47 chars matches the Stripe live secret value pattern. 
@@ -585,10 +551,6 @@ func TestAPIKeyScanner_CrossPassDedup_NameRegexWins(t *testing.T) { } } -// ── Tightened-regex false-positive tests ───────────────────────────────────── - -// TestAPIKeyScanner_NameRegex_NEON_NarrowedPattern verifies that the tightened \bNEON_ -// pattern does not fire on variable names that contain "neon" as part of a longer word. func TestAPIKeyScanner_NameRegex_NEON_NarrowedPattern(t *testing.T) { clearHighRiskEnv(t) // These should NOT be flagged. @@ -613,8 +575,6 @@ func TestAPIKeyScanner_NameRegex_NEON_NarrowedPattern(t *testing.T) { } } -// TestAPIKeyScanner_NameRegex_LINEAR_NarrowedPattern verifies that the tightened \bLINEAR_ -// pattern does not fire on names containing "linear" as a substring. func TestAPIKeyScanner_NameRegex_LINEAR_NarrowedPattern(t *testing.T) { clearHighRiskEnv(t) t.Setenv("BILINEAR_FILTER", "some-value") @@ -631,8 +591,6 @@ func TestAPIKeyScanner_NameRegex_LINEAR_NarrowedPattern(t *testing.T) { } } -// TestAPIKeyScanner_NameRegex_PALM_NarrowedPattern verifies that the tightened \bPALM_ -// pattern does not fire on names like NAPALM_MODE. func TestAPIKeyScanner_NameRegex_PALM_NarrowedPattern(t *testing.T) { clearHighRiskEnv(t) t.Setenv("NAPALM_MODE", "some-value") @@ -652,8 +610,6 @@ func TestAPIKeyScanner_NameRegex_PALM_NarrowedPattern(t *testing.T) { } } -// TestAPIKeyScanner_NameRegex_NewAIProviders verifies that newly added AI provider -// name patterns are recognised. func TestAPIKeyScanner_NameRegex_NewAIProviders(t *testing.T) { clearHighRiskEnv(t) cases := []struct { @@ -683,10 +639,6 @@ func TestAPIKeyScanner_NameRegex_NewAIProviders(t *testing.T) { } } -// TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex verifies that a key listed in -// ExtraEnvKeys whose name also matches a nameRegexPattern produces exactly ONE finding. -// Previously scanEnvKeys and scanNameRegex were not sharing the seenEnvNames dedup map, -// so MY_OPENAI_KEY in extra_env_keys would fire twice. 
func TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex(t *testing.T) { const key = "MY_OPENAI_KEY" // matches OPENAI nameRegexPattern AND is in ExtraEnvKeys t.Setenv(key, "sk-test-value") @@ -709,11 +661,8 @@ func TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex(t *testing.T) { } } -// TestAPIKeyScanner_ValuePattern_BuiltinSkipped verifies that a key in HighRiskEnvKeys -// whose value also matches a value pattern produces exactly ONE finding (from scanEnvKeys, -// not from scanValuePatterns which skips it). func TestAPIKeyScanner_ValuePattern_BuiltinSkipped(t *testing.T) { - value := "sk-proj-" + strings.Repeat("z", 48) // total 56 chars — matches OpenAI project pattern + value := "sk-proj-" + strings.Repeat("z", 48) // total 56 chars - matches OpenAI project pattern t.Setenv("OPENAI_API_KEY", value) // Clear all other built-in keys. for k := range scan.HighRiskEnvKeys { @@ -735,3 +684,22 @@ func TestAPIKeyScanner_ValuePattern_BuiltinSkipped(t *testing.T) { t.Errorf("expected exactly 1 finding for OPENAI_API_KEY, got %d", count) } } + +func TestAPIKeyScanner_NameRegex_ProviderWithoutSuffix_NotFlagged(t *testing.T) { + clearAllEnv(t) + // Provider keyword present but no credential suffix - should NOT be flagged. 
+ t.Setenv("GITHUB_WORKSPACE", "/home/runner/work") + t.Setenv("GITHUB_ACTIONS", "true") + t.Setenv("OPENAI_BASE_URL", "https://api.openai.com") + t.Setenv("STRIPE_WEBHOOK_ENDPOINT", "https://example.com/webhook") + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + for _, f := range result.Findings { + switch f.Resource { + case "GITHUB_WORKSPACE", "GITHUB_ACTIONS", "OPENAI_BASE_URL", "STRIPE_WEBHOOK_ENDPOINT": + t.Errorf("%s should not be flagged (provider keyword without credential suffix)", f.Resource) + } + } +} From 095e3b964856e62ea52a5ab5c55a3ae2e64470eb Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 11:02:15 +0100 Subject: [PATCH 11/17] fix: narrow XAI pattern to (^|_)XAI_ to avoid false positives on PROXAI_, RELAXAI_ etc --- internal/scan/apikeys.go | 4 ++-- internal/scan/apikeys_test.go | 30 +++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index b805fe4..f0fe923 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -16,7 +16,7 @@ import ( // credentialSuffixRe matches env var names that contain a credential-related term. // Provider name patterns require this suffix to avoid false positives on non-credential // vars like GITHUB_WORKSPACE or OPENAI_BASE_URL. -var credentialSuffixRe = regexp.MustCompile(`(?i)(KEY|TOKEN|SECRET|PASSWORD|CRED)`) +var credentialSuffixRe = regexp.MustCompile(`(?i)(^|_)(KEY|TOKEN|SECRET|PASSWORD|CRED)(S?)(_|$)`) // providerNamePatterns matches env var names containing a known provider keyword. // These only produce a finding when the name also matches credentialSuffixRe. 
@@ -39,7 +39,7 @@ var providerNamePatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)DEEPSEEK`), regexp.MustCompile(`(?i)PERPLEXITY`), regexp.MustCompile(`(?i)CEREBRAS`), - regexp.MustCompile(`(?i)XAI`), + regexp.MustCompile(`(?i)(^|_)XAI_`), // (^|_) avoids TAXI_KEY, PROXAI_TOKEN while matching XAI_API_KEY, MY_XAI_KEY regexp.MustCompile(`(?i)ASSEMBLYAI`), regexp.MustCompile(`(?i)AI21`), regexp.MustCompile(`(?i)NVIDIA_NIM`), diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index dd2035e..b609585 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -639,6 +639,30 @@ func TestAPIKeyScanner_NameRegex_NewAIProviders(t *testing.T) { } } +func TestAPIKeyScanner_NameRegex_XAI_Anchored(t *testing.T) { + clearAllEnv(t) + // XAI embedded mid-word with no credential suffix — should NOT be flagged. + t.Setenv("PROXAI_ENDPOINT", "https://api.proxai.com") + t.Setenv("RELAXAI_MODE", "true") + // These SHOULD be flagged. + t.Setenv("XAI_API_KEY", "real-xai-key") + t.Setenv("MY_XAI_KEY", "also-real-xai-key") + + s := newScannerWithHome(t.TempDir()) + result := s.Scan() + + assertResource(t, result.Findings, "XAI_API_KEY") + assertResource(t, result.Findings, "MY_XAI_KEY") + for _, f := range result.Findings { + if f.Resource == "PROXAI_ENDPOINT" { + t.Error("PROXAI_ENDPOINT should not be flagged by XAI pattern") + } + if f.Resource == "RELAXAI_MODE" { + t.Error("RELAXAI_MODE should not be flagged by XAI pattern") + } + } +} + func TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex(t *testing.T) { const key = "MY_OPENAI_KEY" // matches OPENAI nameRegexPattern AND is in ExtraEnvKeys t.Setenv(key, "sk-test-value") @@ -692,13 +716,17 @@ func TestAPIKeyScanner_NameRegex_ProviderWithoutSuffix_NotFlagged(t *testing.T) t.Setenv("GITHUB_ACTIONS", "true") t.Setenv("OPENAI_BASE_URL", "https://api.openai.com") t.Setenv("STRIPE_WEBHOOK_ENDPOINT", "https://example.com/webhook") + // Substring false positives: MONKEY contains KEY, 
DONKEY contains KEY. + t.Setenv("GITHUB_MONKEY", "banana") + t.Setenv("OPENAI_DONKEY", "hee-haw") s := newScannerWithHome(t.TempDir()) result := s.Scan() for _, f := range result.Findings { switch f.Resource { - case "GITHUB_WORKSPACE", "GITHUB_ACTIONS", "OPENAI_BASE_URL", "STRIPE_WEBHOOK_ENDPOINT": + case "GITHUB_WORKSPACE", "GITHUB_ACTIONS", "OPENAI_BASE_URL", "STRIPE_WEBHOOK_ENDPOINT", + "GITHUB_MONKEY", "OPENAI_DONKEY": t.Errorf("%s should not be flagged (provider keyword without credential suffix)", f.Resource) } } From 14a9589d5231f5d8bfc4e1928cf00dbd7e10dac6 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 11:06:34 +0100 Subject: [PATCH 12/17] =?UTF-8?q?refactor(tests):=20simplify=20apikeys=5Ft?= =?UTF-8?q?est=20=E2=80=94=20remove=20duplicates,=20collapse=20anchored-pa?= =?UTF-8?q?ttern=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/scan/apikeys_test.go | 234 ++++++++++------------------------ 1 file changed, 67 insertions(+), 167 deletions(-) diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index b609585..b4131be 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -139,20 +139,6 @@ func TestAPIKeyScanner_CredentialFileContentNotInFindings(t *testing.T) { assertResource(t, result.Findings, credFile) } -func TestAPIKeyScanner_NoCredentialFileNoFinding(t *testing.T) { - clearAllEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - if result.Skipped { - t.Error("APIKeyScanner must never return skipped=true") - } - if len(result.Findings) != 0 { - t.Errorf("expected 0 findings in empty home dir, got %d: %v", len(result.Findings), resourceSet(result.Findings)) - } -} - func TestAPIKeyScanner_GCPCredentialsDirDetected(t *testing.T) { home := t.TempDir() gcloudDir := filepath.Join(home, ".config", "gcloud") @@ -427,28 +413,61 @@ func TestAPIKeyScanner_ValuePatterns(t *testing.T) { } } -func 
TestAPIKeyScanner_NameRegex_FLY_Anchored(t *testing.T) { - clearHighRiskEnv(t) - t.Setenv("FLY_API_TOKEN", "real-token") - t.Setenv("MY_FLY_TOKEN", "also-real-token") - t.Setenv("BUTTERFLY_KEY", "not-a-fly-token") - t.Setenv("FLYWEIGHT_INDEX", "not-a-token") +func TestAPIKeyScanner_NameRegex_AnchoredPatterns(t *testing.T) { + cases := []struct { + name string + shouldMatch map[string]string + shouldNotMatch map[string]string + }{ + { + name: "FLY_", + shouldMatch: map[string]string{"FLY_API_TOKEN": "real-token", "MY_FLY_TOKEN": "also-real"}, + shouldNotMatch: map[string]string{"BUTTERFLY_KEY": "v", "FLYWEIGHT_INDEX": "v"}, + }, + { + name: "NEON_", + shouldMatch: map[string]string{"NEON_API_KEY": "real-neon-key", "MY_NEON_KEY": "also-real"}, + shouldNotMatch: map[string]string{"ANEMONE_CONFIG": "v", "NEONLIGHTS_COLOR": "v"}, + }, + { + name: "LINEAR_", + shouldMatch: map[string]string{"LINEAR_API_KEY": "real-linear-key", "MY_LINEAR_TOKEN": "also-real"}, + shouldNotMatch: map[string]string{"BILINEAR_FILTER": "v"}, + }, + { + name: "PALM_", + shouldMatch: map[string]string{"PALM_API_KEY": "real-palm-key", "MY_PALM_KEY": "also-real"}, + shouldNotMatch: map[string]string{"NAPALM_MODE": "v"}, + }, + { + name: "XAI_", + shouldMatch: map[string]string{"XAI_API_KEY": "real-xai-key", "MY_XAI_KEY": "also-real"}, + shouldNotMatch: map[string]string{"PROXAI_ENDPOINT": "https://api.proxai.com", "RELAXAI_MODE": "true"}, + }, + } - s := newScannerWithHome(t.TempDir()) - result := s.Scan() + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + clearAllEnv(t) + for k, v := range tc.shouldMatch { + t.Setenv(k, v) + } + for k, v := range tc.shouldNotMatch { + t.Setenv(k, v) + } - // FLY_API_TOKEN and MY_FLY_TOKEN must both be flagged. - assertResource(t, result.Findings, "FLY_API_TOKEN") - assertResource(t, result.Findings, "MY_FLY_TOKEN") + s := newScannerWithHome(t.TempDir()) + result := s.Scan() - // BUTTERFLY_KEY and FLYWEIGHT_INDEX must NOT be flagged. 
- for _, f := range result.Findings { - if f.Resource == "BUTTERFLY_KEY" { - t.Error("BUTTERFLY_KEY should not be flagged by FLY_ pattern") - } - if f.Resource == "FLYWEIGHT_INDEX" { - t.Error("FLYWEIGHT_INDEX should not be flagged by FLY_ pattern") - } + for k := range tc.shouldMatch { + assertResource(t, result.Findings, k) + } + for k := range tc.shouldNotMatch { + if contains(result.Findings, k) { + t.Errorf("%s should not be flagged by %s pattern", k, tc.name) + } + } + }) } } @@ -458,18 +477,33 @@ func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) { envVar string value string }{ + // Google / cloud AI {"MY_GEMINI_KEY", "gemini-key-value"}, {"VERTEX_API_KEY", "vertex-key-value"}, {"BEDROCK_ACCESS_KEY", "bedrock-key-value"}, {"AZURE_OPENAI_KEY", "azure-openai-key"}, + // Communication {"RESEND_API_KEY", "resend-key-value"}, {"POSTMARK_TOKEN", "postmark-key-value"}, + // Productivity / project tools {"MY_LINEAR_TOKEN", "linear-key-value"}, {"NOTION_API_KEY", "notion-key-value"}, {"AIRTABLE_KEY", "airtable-key-value"}, + // Database-as-a-service {"SUPABASE_KEY", "supabase-key-value"}, {"NEON_API_KEY", "neon-key-value"}, {"PLANETSCALE_TOKEN", "ps-key-value"}, + // Newer AI providers + {"OPENROUTER_API_KEY", "or-key-value"}, + {"FIREWORKS_API_KEY", "fw-key-value"}, + {"DEEPSEEK_API_KEY", "ds-key-value"}, + {"PERPLEXITY_API_KEY", "pplx-key-value"}, + {"CEREBRAS_API_KEY", "cb-key-value"}, + {"DOPPLER_TOKEN", "dp-token-value"}, + {"XAI_API_KEY", "xai-key-value"}, + {"ASSEMBLYAI_API_KEY", "aai-key-value"}, + {"AI21_API_KEY", "ai21-key-value"}, + {"NVIDIA_NIM_API_KEY", "nim-key-value"}, } for _, tc := range cases { @@ -499,28 +533,6 @@ func TestAPIKeyScanner_ValuePattern_NoMatchWrongLength(t *testing.T) { } } -func TestAPIKeyScanner_ValuePattern_TwilioSID(t *testing.T) { - value := "SK" + strings.Repeat("f", 32) // total 34 chars - t.Setenv("CRED_SID", value) - clearHighRiskEnv(t) - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - 
assertResource(t, result.Findings, "CRED_SID") - for _, f := range result.Findings { - if f.Resource == "CRED_SID" { - if f.Severity != "UNCERTAIN" { - t.Errorf("expected UNCERTAIN severity for Twilio SID (broad SK prefix), got %q", f.Severity) - } - if !strings.Contains(f.Description, "Twilio") { - t.Errorf("expected description to contain %q, got %q", "Twilio", f.Description) - } - } - } - assertNoSecretValue(t, result.Findings, value) -} - func TestAPIKeyScanner_CrossPassDedup_NameRegexWins(t *testing.T) { // CUSTOM_STRIPE_KEY matches the STRIPE name-regex. // sk_live_ + 47 chars matches the Stripe live secret value pattern. @@ -551,118 +563,6 @@ func TestAPIKeyScanner_CrossPassDedup_NameRegexWins(t *testing.T) { } } -func TestAPIKeyScanner_NameRegex_NEON_NarrowedPattern(t *testing.T) { - clearHighRiskEnv(t) - // These should NOT be flagged. - t.Setenv("ANEMONE_CONFIG", "some-value") - t.Setenv("NEONLIGHTS_COLOR", "blue") - // These SHOULD be flagged. - t.Setenv("NEON_API_KEY", "real-neon-key") - t.Setenv("MY_NEON_KEY", "also-real-neon-key") - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "NEON_API_KEY") - assertResource(t, result.Findings, "MY_NEON_KEY") - for _, f := range result.Findings { - if f.Resource == "ANEMONE_CONFIG" { - t.Error("ANEMONE_CONFIG should not be flagged by NEON_ pattern") - } - if f.Resource == "NEONLIGHTS_COLOR" { - t.Error("NEONLIGHTS_COLOR should not be flagged by NEON_ pattern") - } - } -} - -func TestAPIKeyScanner_NameRegex_LINEAR_NarrowedPattern(t *testing.T) { - clearHighRiskEnv(t) - t.Setenv("BILINEAR_FILTER", "some-value") - t.Setenv("LINEAR_API_KEY", "real-linear-key") - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "LINEAR_API_KEY") - for _, f := range result.Findings { - if f.Resource == "BILINEAR_FILTER" { - t.Error("BILINEAR_FILTER should not be flagged by LINEAR_ pattern") - } - } -} - -func 
TestAPIKeyScanner_NameRegex_PALM_NarrowedPattern(t *testing.T) { - clearHighRiskEnv(t) - t.Setenv("NAPALM_MODE", "some-value") - // These SHOULD be flagged. - t.Setenv("PALM_API_KEY", "real-palm-key") - t.Setenv("MY_PALM_KEY", "also-real-palm-key") - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "PALM_API_KEY") - assertResource(t, result.Findings, "MY_PALM_KEY") - for _, f := range result.Findings { - if f.Resource == "NAPALM_MODE" { - t.Error("NAPALM_MODE should not be flagged by PALM_ pattern") - } - } -} - -func TestAPIKeyScanner_NameRegex_NewAIProviders(t *testing.T) { - clearHighRiskEnv(t) - cases := []struct { - envVar string - value string - }{ - {"OPENROUTER_API_KEY", "or-key-value"}, - {"FIREWORKS_API_KEY", "fw-key-value"}, - {"DEEPSEEK_API_KEY", "ds-key-value"}, - {"PERPLEXITY_API_KEY", "pplx-key-value"}, - {"CEREBRAS_API_KEY", "cb-key-value"}, - {"DOPPLER_TOKEN", "dp-token-value"}, - {"XAI_API_KEY", "xai-key-value"}, - {"ASSEMBLYAI_API_KEY", "aai-key-value"}, - {"AI21_API_KEY", "ai21-key-value"}, - {"NVIDIA_NIM_API_KEY", "nim-key-value"}, - } - for _, tc := range cases { - t.Setenv(tc.envVar, tc.value) - } - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - for _, tc := range cases { - assertResource(t, result.Findings, tc.envVar) - } -} - -func TestAPIKeyScanner_NameRegex_XAI_Anchored(t *testing.T) { - clearAllEnv(t) - // XAI embedded mid-word with no credential suffix — should NOT be flagged. - t.Setenv("PROXAI_ENDPOINT", "https://api.proxai.com") - t.Setenv("RELAXAI_MODE", "true") - // These SHOULD be flagged. 
- t.Setenv("XAI_API_KEY", "real-xai-key") - t.Setenv("MY_XAI_KEY", "also-real-xai-key") - - s := newScannerWithHome(t.TempDir()) - result := s.Scan() - - assertResource(t, result.Findings, "XAI_API_KEY") - assertResource(t, result.Findings, "MY_XAI_KEY") - for _, f := range result.Findings { - if f.Resource == "PROXAI_ENDPOINT" { - t.Error("PROXAI_ENDPOINT should not be flagged by XAI pattern") - } - if f.Resource == "RELAXAI_MODE" { - t.Error("RELAXAI_MODE should not be flagged by XAI pattern") - } - } -} - func TestAPIKeyScanner_ExtraEnvKeys_NoDuplicateWithNameRegex(t *testing.T) { const key = "MY_OPENAI_KEY" // matches OPENAI nameRegexPattern AND is in ExtraEnvKeys t.Setenv(key, "sk-test-value") From c7d8160edc6543db8e598e1328a3bebf66683ed0 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 11:12:59 +0100 Subject: [PATCH 13/17] =?UTF-8?q?refactor:=20simplify=20apikeys=20?= =?UTF-8?q?=E2=80=94=20inline=20helpers,=20deduplicate=20env=20iteration,?= =?UTF-8?q?=20trim=20comments?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/scan/apikeys.go | 140 ++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 82 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index f0fe923..5bcb350 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -39,7 +39,7 @@ var providerNamePatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)DEEPSEEK`), regexp.MustCompile(`(?i)PERPLEXITY`), regexp.MustCompile(`(?i)CEREBRAS`), - regexp.MustCompile(`(?i)(^|_)XAI_`), // (^|_) avoids TAXI_KEY, PROXAI_TOKEN while matching XAI_API_KEY, MY_XAI_KEY + regexp.MustCompile(`(?i)(^|_)XAI_`), // (^|_) anchor avoids mid-word false positives regexp.MustCompile(`(?i)ASSEMBLYAI`), regexp.MustCompile(`(?i)AI21`), regexp.MustCompile(`(?i)NVIDIA_NIM`), @@ -48,7 +48,7 @@ var providerNamePatterns = []*regexp.Regexp{ // Google AI (Gemini, Vertex AI, PaLM) regexp.MustCompile(`(?i)GEMINI`), 
regexp.MustCompile(`(?i)VERTEX`), - regexp.MustCompile(`(?i)(^|_)PALM_`), // (^|_) avoids NAPALM_MODE while matching MY_PALM_KEY + regexp.MustCompile(`(?i)(^|_)PALM_`), // (^|_) anchor avoids mid-word false positives // AWS AI regexp.MustCompile(`(?i)BEDROCK`), // Azure AI @@ -80,18 +80,18 @@ var providerNamePatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)CLOUDFLARE`), regexp.MustCompile(`(?i)HEROKU`), regexp.MustCompile(`(?i)RAILWAY`), - regexp.MustCompile(`(?i)(^|_)FLY_`), // (^|_) avoids BUTTERFLY_KEY, FLYWEIGHT_INDEX while matching MY_FLY_TOKEN + regexp.MustCompile(`(?i)(^|_)FLY_`), // (^|_) anchor avoids mid-word false positives // Source control regexp.MustCompile(`(?i)GITHUB`), regexp.MustCompile(`(?i)GITLAB`), regexp.MustCompile(`(?i)BITBUCKET`), // Productivity / project tools - regexp.MustCompile(`(?i)(^|_)LINEAR_`), // (^|_) avoids BILINEAR_FILTER while matching MY_LINEAR_TOKEN + regexp.MustCompile(`(?i)(^|_)LINEAR_`), // (^|_) anchor avoids mid-word false positives regexp.MustCompile(`(?i)NOTION`), regexp.MustCompile(`(?i)AIRTABLE`), // Database-as-a-service regexp.MustCompile(`(?i)SUPABASE`), - regexp.MustCompile(`(?i)(^|_)NEON_`), // (^|_) avoids ANEMONE_CONFIG, NEONLIGHTS_COLOR while matching MY_NEON_KEY + regexp.MustCompile(`(?i)(^|_)NEON_`), // (^|_) anchor avoids mid-word false positives regexp.MustCompile(`(?i)PLANETSCALE`), } @@ -195,11 +195,8 @@ func NewAPIKeyScannerWithConfig(cfg config.Config) *APIKeyScanner { } } -// Name returns the canonical scanner ID. func (s *APIKeyScanner) Name() string { return "api_keys" } -// Scan detects high-risk API keys in env vars and credential file presence. -// Implements Scanner. Never returns skipped=true. 
func (s *APIKeyScanner) Scan() models.ScanResult { var findings []models.Finding // seenEnvNames is shared across all three env-scanning passes so that any variable @@ -234,7 +231,12 @@ func (s *APIKeyScanner) scanEnvKeys(seenEnvNames map[string]bool) []models.Findi for _, key := range keys { if os.Getenv(key) != "" { seenEnvNames[key] = true - findings = append(findings, envKeyFinding(key)) + findings = append(findings, models.Finding{ + Scanner: "api_keys", + Resource: key, + Severity: models.SeverityHigh, + Description: "Can be used to make authenticated API calls.", + }) } } @@ -249,7 +251,12 @@ func (s *APIKeyScanner) scanEnvKeys(seenEnvNames map[string]bool) []models.Findi } if os.Getenv(key) != "" { seenEnvNames[key] = true - findings = append(findings, envKeyFinding(key)) + findings = append(findings, models.Finding{ + Scanner: "api_keys", + Resource: key, + Severity: models.SeverityHigh, + Description: "Can be used to make authenticated API calls.", + }) } } } @@ -257,6 +264,32 @@ func (s *APIKeyScanner) scanEnvKeys(seenEnvNames map[string]bool) []models.Findi return findings } +// envEntry holds a parsed, non-empty environment variable that has not yet been claimed. +type envEntry struct { + name string + value string +} + +// unclaimedEnvEntries returns non-empty env vars that are not in HighRiskEnvKeys and not +// already present in seenEnvNames. It is used by scanNameRegex and scanValuePatterns to +// avoid repeating the same iteration and filtering logic in both methods. 
+func unclaimedEnvEntries(seenEnvNames map[string]bool) []envEntry { + var entries []envEntry + for _, raw := range os.Environ() { + idx := strings.IndexByte(raw, '=') + if idx < 0 { + continue + } + name := raw[:idx] + value := raw[idx+1:] + if HighRiskEnvKeys[name] || value == "" || seenEnvNames[name] { + continue + } + entries = append(entries, envEntry{name: name, value: value}) + } + return entries +} + // scanNameRegex checks env var names against known provider keywords and generic // credential terms. It catches non-standard names like MY_OPENAI_KEY that are // missed by the exact-match HighRiskEnvKeys pass. Key names only are reported; @@ -265,31 +298,11 @@ func (s *APIKeyScanner) scanEnvKeys(seenEnvNames map[string]bool) []models.Findi func (s *APIKeyScanner) scanNameRegex(seenEnvNames map[string]bool) []models.Finding { var findings []models.Finding - for _, entry := range os.Environ() { - idx := strings.IndexByte(entry, '=') - if idx < 0 { - continue - } - name := entry[:idx] - value := entry[idx+1:] - - // Skip if already covered by the exact-match HighRiskEnvKeys pass. - if HighRiskEnvKeys[name] { - continue - } - // Skip if value is empty — key exists but no credential is set. - if value == "" { - continue - } - // Skip if already claimed by a prior pass or earlier in this pass. - if seenEnvNames[name] { - continue - } - + for _, e := range unclaimedEnvEntries(seenEnvNames) { matched := false // Provider patterns require the name to also contain a credential suffix. for _, re := range providerNamePatterns { - if re.MatchString(name) && credentialSuffixRe.MatchString(name) { + if re.MatchString(e.name) && credentialSuffixRe.MatchString(e.name) { matched = true break } @@ -297,17 +310,17 @@ func (s *APIKeyScanner) scanNameRegex(seenEnvNames map[string]bool) []models.Fin // Credential suffix patterns match standalone. 
if !matched { for _, re := range credentialSuffixPatterns { - if re.MatchString(name) { + if re.MatchString(e.name) { matched = true break } } } if matched { - seenEnvNames[name] = true + seenEnvNames[e.name] = true findings = append(findings, models.Finding{ Scanner: "api_keys", - Resource: name, + Resource: e.name, Severity: models.SeverityHigh, Description: "Can be used to make authenticated API calls.", }) @@ -322,33 +335,13 @@ func (s *APIKeyScanner) scanNameRegex(seenEnvNames map[string]bool) []models.Fin func (s *APIKeyScanner) scanValuePatterns(seenEnvNames map[string]bool) []models.Finding { var findings []models.Finding - for _, entry := range os.Environ() { - idx := strings.IndexByte(entry, '=') - if idx < 0 { - continue - } - name := entry[:idx] - value := entry[idx+1:] - - // Skip if already covered by the exact-match HighRiskEnvKeys pass. - if HighRiskEnvKeys[name] { - continue - } - // Skip empty values. - if value == "" { - continue - } - // Skip if already claimed by scanNameRegex or an earlier iteration of this pass. - if seenEnvNames[name] { - continue - } - + for _, e := range unclaimedEnvEntries(seenEnvNames) { for _, p := range valuePatterns { - if strings.HasPrefix(value, p.prefix) && len(value) == p.totalLen { - seenEnvNames[name] = true + if strings.HasPrefix(e.value, p.prefix) && len(e.value) == p.totalLen { + seenEnvNames[e.name] = true findings = append(findings, models.Finding{ Scanner: "api_keys", - Resource: name, // env var NAME, never the value + Resource: e.name, // env var NAME, never the value Severity: p.severity, Description: fmt.Sprintf("Value matches %s API key format.", p.providerTag), }) @@ -365,10 +358,14 @@ func (s *APIKeyScanner) scanValuePatterns(seenEnvNames map[string]bool) []models func (s *APIKeyScanner) scanCredentialFiles() []models.Finding { var findings []models.Finding - // KEYS-02: Built-in credential files. 
// If home directory cannot be resolved, skip all ~-based paths to avoid // scanning incorrect root-relative paths (e.g. /.aws/credentials). - homeDir := s.resolveHomeDir() + homeDir := s.HomeDir + if homeDir == "" { + if h, err := os.UserHomeDir(); err == nil { + homeDir = h + } + } // seenPath deduplicates built-in and extra paths. allCredFiles := append(credentialFiles, s.ExtraCredentialFiles...) seenPath := make(map[string]bool, len(allCredFiles)) @@ -394,27 +391,6 @@ func (s *APIKeyScanner) scanCredentialFiles() []models.Finding { return findings } -func envKeyFinding(key string) models.Finding { - return models.Finding{ - Scanner: "api_keys", - Resource: key, - Severity: models.SeverityHigh, - Description: "Can be used to make authenticated API calls.", - } -} - -// resolveHomeDir returns the effective home directory for credential file expansion. -func (s *APIKeyScanner) resolveHomeDir() string { - if s.HomeDir != "" { - return s.HomeDir - } - home, err := os.UserHomeDir() - if err != nil { - return "" - } - return home -} - // expandHome replaces a leading ~ with the given homeDir. 
func expandHome(path, homeDir string) string { if len(path) == 0 { From 4f2c483269fa37a9ec4de5277eb242f65ed4c1ba Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 11:23:10 +0100 Subject: [PATCH 14/17] =?UTF-8?q?feat:=20expand=20provider=20coverage=20?= =?UTF-8?q?=E2=80=94=2040+=20new=20providers=20across=20AI,=20payments,=20?= =?UTF-8?q?comms,=20auth,=20observability,=20cloud,=20and=20DB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/scan/apikeys.go | 46 ++++++++++++- internal/scan/apikeys_test.go | 70 ++++++++++++++++++++ internal/scan/scan.go | 119 +++++++++++++++++++++++++--------- 3 files changed, 202 insertions(+), 33 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index 5bcb350..a3653e7 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -43,8 +43,15 @@ var providerNamePatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)ASSEMBLYAI`), regexp.MustCompile(`(?i)AI21`), regexp.MustCompile(`(?i)NVIDIA_NIM`), + regexp.MustCompile(`(?i)STABILITY`), // Stability AI — image generation + regexp.MustCompile(`(?i)WANDB`), // Weights & Biases — ML experiment tracking + regexp.MustCompile(`(?i)TAVILY`), // Tavily — AI search, common in agents + regexp.MustCompile(`(?i)LANGCHAIN`), // LangSmith (LangChain tracing) + regexp.MustCompile(`(?i)(^|_)FAL_`), // fal.ai — (^|_) anchor avoids mid-word false positives // Secrets managers regexp.MustCompile(`(?i)DOPPLER`), + regexp.MustCompile(`(?i)VAULT`), // HashiCorp Vault + regexp.MustCompile(`(?i)INFISICAL`), // Infisical secrets manager // Google AI (Gemini, Vertex AI, PaLM) regexp.MustCompile(`(?i)GEMINI`), regexp.MustCompile(`(?i)VERTEX`), @@ -59,6 +66,10 @@ var providerNamePatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)BRAINTREE`), regexp.MustCompile(`(?i)PAYPAL`), regexp.MustCompile(`(?i)SQUARE`), + regexp.MustCompile(`(?i)ADYEN`), // Adyen — enterprise payments + regexp.MustCompile(`(?i)RAZORPAY`), 
// Razorpay — dominant in South/SE Asia + regexp.MustCompile(`(?i)MOLLIE`), // Mollie — dominant in EU + regexp.MustCompile(`(?i)PADDLE`), // Paddle — SaaS subscription billing // Communication / messaging regexp.MustCompile(`(?i)TWILIO`), regexp.MustCompile(`(?i)SENDGRID`), @@ -68,31 +79,59 @@ var providerNamePatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)SPARKPOST`), regexp.MustCompile(`(?i)SLACK`), regexp.MustCompile(`(?i)DISCORD`), + regexp.MustCompile(`(?i)VONAGE`), // Vonage/Nexmo — SMS/voice + regexp.MustCompile(`(?i)KLAVIYO`), // Klaviyo — e-commerce email + regexp.MustCompile(`(?i)MAILCHIMP`), // Mailchimp — customer lists + regexp.MustCompile(`(?i)CUSTOMERIO`), // Customer.io — behavioral marketing + regexp.MustCompile(`(?i)BREVO`), // Brevo/Sendinblue — EU email // Auth / identity regexp.MustCompile(`(?i)OKTA`), regexp.MustCompile(`(?i)AUTH0`), + regexp.MustCompile(`(?i)CLERK`), // Clerk — popular Next.js auth + regexp.MustCompile(`(?i)WORKOS`), // WorkOS — enterprise SSO // Observability regexp.MustCompile(`(?i)DATADOG`), regexp.MustCompile(`(?i)SENTRY`), + regexp.MustCompile(`(?i)NEW_RELIC`), // New Relic — APM/log exfil + regexp.MustCompile(`(?i)GRAFANA`), // Grafana Cloud + regexp.MustCompile(`(?i)HONEYCOMB`), // Honeycomb — observability // Cloud / hosting platforms regexp.MustCompile(`(?i)VERCEL`), regexp.MustCompile(`(?i)NETLIFY`), regexp.MustCompile(`(?i)CLOUDFLARE`), regexp.MustCompile(`(?i)HEROKU`), regexp.MustCompile(`(?i)RAILWAY`), - regexp.MustCompile(`(?i)(^|_)FLY_`), // (^|_) anchor avoids mid-word false positives + regexp.MustCompile(`(?i)(^|_)FLY_`), // (^|_) anchor avoids mid-word false positives + regexp.MustCompile(`(?i)DIGITALOCEAN`), // DigitalOcean — full infra control + regexp.MustCompile(`(?i)LINODE`), // Linode/Akamai — full infra control + regexp.MustCompile(`(?i)RENDER`), // Render — deploy platform + regexp.MustCompile(`(?i)PULUMI`), // Pulumi — IaC state + regexp.MustCompile(`(?i)CLOUDINARY`), // Cloudinary — media 
storage // Source control regexp.MustCompile(`(?i)GITHUB`), regexp.MustCompile(`(?i)GITLAB`), regexp.MustCompile(`(?i)BITBUCKET`), + regexp.MustCompile(`(?i)CIRCLECI`), // CircleCI — CI supply chain // Productivity / project tools regexp.MustCompile(`(?i)(^|_)LINEAR_`), // (^|_) anchor avoids mid-word false positives regexp.MustCompile(`(?i)NOTION`), regexp.MustCompile(`(?i)AIRTABLE`), + regexp.MustCompile(`(?i)ATLASSIAN`), // Atlassian/Jira — project data, PII + regexp.MustCompile(`(?i)JIRA`), // Jira + regexp.MustCompile(`(?i)ZENDESK`), // Zendesk — customer support PII + regexp.MustCompile(`(?i)INTERCOM`), // Intercom — customer chat PII + regexp.MustCompile(`(?i)HUBSPOT`), // HubSpot CRM — customer PII + regexp.MustCompile(`(?i)SALESFORCE`), // Salesforce — enterprise CRM + regexp.MustCompile(`(?i)SHOPIFY`), // Shopify — store orders, customer data + regexp.MustCompile(`(?i)SEGMENT`), // Segment — all customer behavioral events + regexp.MustCompile(`(?i)ALGOLIA`), // Algolia — search index admin // Database-as-a-service regexp.MustCompile(`(?i)SUPABASE`), regexp.MustCompile(`(?i)(^|_)NEON_`), // (^|_) anchor avoids mid-word false positives regexp.MustCompile(`(?i)PLANETSCALE`), + regexp.MustCompile(`(?i)TURSO`), // Turso — SQLite-at-edge + regexp.MustCompile(`(?i)UPSTASH`), // Upstash — serverless Redis/Kafka + regexp.MustCompile(`(?i)ELASTIC`), // Elasticsearch/Elastic Cloud } // credentialSuffixPatterns matches generic credential terms in env var names. @@ -150,6 +189,11 @@ var valuePatterns = []valuePattern{ {prefix: "ghu_", totalLen: 40, severity: models.SeverityHigh, providerTag: "GitHub user token"}, {prefix: "ghs_", totalLen: 40, severity: models.SeverityHigh, providerTag: "GitHub app installation token"}, {prefix: "ghr_", totalLen: 40, severity: models.SeverityHigh, providerTag: "GitHub refresh token"}, + // Tavily — tvly- prefix + 40 chars = 45 total. 
+ {prefix: "tvly-", totalLen: 45, severity: models.SeverityHigh, providerTag: "Tavily search"}, + // LangSmith — lsv2_pt_ (personal access) or lsv2_sk_ (service key) prefix + 40 chars. + {prefix: "lsv2_pt_", totalLen: 48, severity: models.SeverityHigh, providerTag: "LangSmith API key"}, + {prefix: "lsv2_sk_", totalLen: 48, severity: models.SeverityHigh, providerTag: "LangSmith service key"}, } // credentialFiles is the list of credential files/dirs to check. diff --git a/internal/scan/apikeys_test.go b/internal/scan/apikeys_test.go index b4131be..4901d40 100644 --- a/internal/scan/apikeys_test.go +++ b/internal/scan/apikeys_test.go @@ -44,6 +44,18 @@ func TestAPIKeyScanner_HighRiskEnvKeysContainsKnownKeys(t *testing.T) { "GITHUB_TOKEN", "STRIPE_SECRET_KEY", "DATABASE_URL", + // New entries + "STABILITY_API_KEY", + "WANDB_API_KEY", + "VAULT_TOKEN", + "INFISICAL_TOKEN", + "CLERK_SECRET_KEY", + "DIGITALOCEAN_TOKEN", + "PULUMI_ACCESS_TOKEN", + "SHOPIFY_API_SECRET_KEY", + "HUBSPOT_ACCESS_TOKEN", + "TURSO_AUTH_TOKEN", + "UPSTASH_REDIS_REST_TOKEN", } for _, key := range known { if !scan.HighRiskEnvKeys[key] { @@ -387,6 +399,9 @@ func TestAPIKeyScanner_ValuePatterns(t *testing.T) { {"HuggingFace", "ML_MODEL_CRED", "hf_" + strings.Repeat("b", 34), "HIGH", "HuggingFace"}, {"GitHub classic PAT", "WORK_GH_TOKEN", "ghp_" + strings.Repeat("c", 36), "HIGH", "GitHub"}, {"Twilio SID", "CRED_SID", "SK" + strings.Repeat("f", 32), "UNCERTAIN", "Twilio"}, + {"Tavily", "SEARCH_KEY", "tvly-" + strings.Repeat("t", 40), "HIGH", "Tavily"}, + {"LangSmith personal token", "TRACE_KEY", "lsv2_pt_" + strings.Repeat("l", 40), "HIGH", "LangSmith"}, + {"LangSmith service key", "TRACE_SVC_KEY", "lsv2_sk_" + strings.Repeat("l", 40), "HIGH", "LangSmith"}, } for _, tc := range cases { @@ -444,6 +459,11 @@ func TestAPIKeyScanner_NameRegex_AnchoredPatterns(t *testing.T) { shouldMatch: map[string]string{"XAI_API_KEY": "real-xai-key", "MY_XAI_KEY": "also-real"}, shouldNotMatch: 
map[string]string{"PROXAI_ENDPOINT": "https://api.proxai.com", "RELAXAI_MODE": "true"}, }, + { + name: "FAL_", + shouldMatch: map[string]string{"FAL_API_KEY": "real-fal-key", "MY_FAL_KEY": "also-real"}, + shouldNotMatch: map[string]string{"DEFAULT_CONFIG": "v", "HALFLIFE_COUNT": "v"}, + }, } for _, tc := range cases { @@ -504,6 +524,56 @@ func TestAPIKeyScanner_NameRegex_NewProviders(t *testing.T) { {"ASSEMBLYAI_API_KEY", "aai-key-value"}, {"AI21_API_KEY", "ai21-key-value"}, {"NVIDIA_NIM_API_KEY", "nim-key-value"}, + // New AI/ML providers + {"STABILITY_API_KEY", "stability-key-value"}, + {"WANDB_PROJECT_KEY", "wandb-key-value"}, + {"TAVILY_API_KEY", "tavily-key-value"}, + {"LANGCHAIN_API_KEY", "langchain-key-value"}, + {"AZURE_OPENAI_API_KEY", "azure-oai-key"}, + {"FAL_API_KEY", "fal-key-value"}, + // Secrets managers + {"VAULT_API_TOKEN", "vault-key-value"}, + {"INFISICAL_API_TOKEN", "infisical-key-value"}, + // New payment providers + {"ADYEN_API_KEY", "adyen-key-value"}, + {"RAZORPAY_SECRET_KEY", "razorpay-key-value"}, + {"MOLLIE_API_KEY", "mollie-key-value"}, + {"PADDLE_API_KEY", "paddle-key-value"}, + // New communication providers + {"VONAGE_API_SECRET", "vonage-key-value"}, + {"KLAVIYO_API_KEY", "klaviyo-key-value"}, + {"MAILCHIMP_API_KEY", "mailchimp-key-value"}, + {"CUSTOMERIO_API_KEY", "customerio-key-value"}, + {"BREVO_API_KEY", "brevo-key-value"}, + // New auth providers + {"CLERK_SECRET_KEY", "clerk-key-value"}, + {"WORKOS_API_KEY", "workos-key-value"}, + // New observability providers + {"NEW_RELIC_LICENSE_KEY", "newrelic-key-value"}, + {"GRAFANA_API_KEY", "grafana-key-value"}, + {"HONEYCOMB_API_KEY", "honeycomb-key-value"}, + // New cloud / IaC providers + {"DIGITALOCEAN_API_KEY", "do-key-value"}, + {"LINODE_API_TOKEN", "linode-key-value"}, + {"RENDER_API_KEY", "render-key-value"}, + {"PULUMI_ACCESS_TOKEN", "pulumi-key-value"}, + {"CLOUDINARY_API_SECRET", "cloudinary-key-value"}, + // New CI/CD + {"CIRCLE_TOKEN", "circle-key-value"}, + // New dev 
tools / CRM + {"ATLASSIAN_API_TOKEN", "atlassian-key-value"}, + {"JIRA_API_TOKEN", "jira-key-value"}, + {"ZENDESK_API_TOKEN", "zendesk-key-value"}, + {"INTERCOM_ACCESS_TOKEN", "intercom-key-value"}, + {"HUBSPOT_API_KEY", "hubspot-key-value"}, + {"SALESFORCE_CLIENT_SECRET", "sf-key-value"}, + {"SHOPIFY_API_SECRET_KEY", "shopify-key-value"}, + {"SEGMENT_WRITE_KEY", "segment-key-value"}, + {"ALGOLIA_API_KEY", "algolia-key-value"}, + // New database providers + {"TURSO_AUTH_TOKEN", "turso-key-value"}, + {"UPSTASH_REDIS_REST_TOKEN", "upstash-key-value"}, + {"ELASTIC_API_KEY", "elastic-key-value"}, } for _, tc := range cases { diff --git a/internal/scan/scan.go b/internal/scan/scan.go index 79023aa..e8f35fd 100644 --- a/internal/scan/scan.go +++ b/internal/scan/scan.go @@ -156,18 +156,25 @@ var K8SProdPatterns = []string{"prod", "production", "prd", "live"} // only the key name is used. var HighRiskEnvKeys = map[string]bool{ // AI / ML inference - "OPENAI_API_KEY": true, - "ANTHROPIC_API_KEY": true, - "COHERE_API_KEY": true, - "MISTRAL_API_KEY": true, - "REPLICATE_API_KEY": true, - "HUGGINGFACE_TOKEN": true, - "HF_TOKEN": true, // Hugging Face canonical short name (used by huggingface-hub) - "TOGETHER_API_KEY": true, - "GROQ_API_KEY": true, - "VOYAGE_API_KEY": true, - "ELEVEN_LABS_API_KEY": true, - "PINECONE_API_KEY": true, + "OPENAI_API_KEY": true, + "ANTHROPIC_API_KEY": true, + "COHERE_API_KEY": true, + "MISTRAL_API_KEY": true, + "REPLICATE_API_KEY": true, + "HUGGINGFACE_TOKEN": true, + "HF_TOKEN": true, // Hugging Face canonical short name (used by huggingface-hub) + "TOGETHER_API_KEY": true, + "GROQ_API_KEY": true, + "VOYAGE_API_KEY": true, + "ELEVEN_LABS_API_KEY": true, + "PINECONE_API_KEY": true, + "STABILITY_API_KEY": true, // Stability AI — pay-per-image generation + "WANDB_API_KEY": true, // Weights & Biases — model/experiment data + "TAVILY_API_KEY": true, // Tavily search — widely used in LangChain/LangGraph agents + "LANGCHAIN_API_KEY": true, // LangSmith 
tracing (LangChain ecosystem) + "AZURE_OPENAI_API_KEY": true, // Azure OpenAI — distinct from service principal creds + "FAL_KEY": true, // fal.ai — GPU inference, financial risk + "NVIDIA_API_KEY": true, // NVIDIA NIM — enterprise GPU inference // Cloud: env-based credentials "AWS_ACCESS_KEY_ID": true, @@ -178,44 +185,92 @@ var HighRiskEnvKeys = map[string]bool{ "AZURE_CLIENT_ID": true, "AZURE_TENANT_ID": true, + // Secrets managers (key to all other secrets) + "VAULT_TOKEN": true, // HashiCorp Vault — grants access to all managed secrets + "OP_SERVICE_ACCOUNT_TOKEN": true, // 1Password Connect service account + "OP_CONNECT_TOKEN": true, // 1Password Connect API token + "INFISICAL_TOKEN": true, // Infisical secrets manager + // Source control & CI/CD "GITHUB_TOKEN": true, "GITLAB_TOKEN": true, "BITBUCKET_APP_PASSWORD": true, "NPM_TOKEN": true, "PYPI_API_TOKEN": true, + "CIRCLE_TOKEN": true, // CircleCI — CI supply chain attack surface // Payment "STRIPE_SECRET_KEY": true, "BRAINTREE_PRIVATE_KEY": true, "PAYPAL_CLIENT_SECRET": true, "SQUARE_ACCESS_TOKEN": true, - - // Messaging & comms (can exfiltrate data at scale) - "TWILIO_AUTH_TOKEN": true, - "SENDGRID_API_KEY": true, - "MAILGUN_API_KEY": true, - "SLACK_BOT_TOKEN": true, - "DISCORD_BOT_TOKEN": true, + "ADYEN_API_KEY": true, // Adyen — enterprise e-commerce payments + "RAZORPAY_KEY_SECRET": true, // Razorpay — dominant in South/SE Asia + "MOLLIE_API_KEY": true, // Mollie — dominant in EU + "PADDLE_API_KEY": true, // Paddle — SaaS billing + + // Messaging & comms (can exfiltrate data or send spam at scale) + "TWILIO_AUTH_TOKEN": true, + "SENDGRID_API_KEY": true, + "MAILGUN_API_KEY": true, + "SLACK_BOT_TOKEN": true, + "DISCORD_BOT_TOKEN": true, + "VONAGE_API_SECRET": true, // Vonage/Nexmo — SMS/voice telephony + "KLAVIYO_API_KEY": true, // Klaviyo — e-commerce email, customer PII + "MAILCHIMP_API_KEY": true, // Mailchimp — customer lists, PII + "CUSTOMERIO_API_KEY": true, // Customer.io — behavioral marketing, PII 
+ "BREVO_API_KEY": true, // Brevo (Sendinblue) — EU email, PII // Identity & auth "OKTA_API_TOKEN": true, "AUTH0_CLIENT_SECRET": true, - - // Observability & infra - "DATADOG_API_KEY": true, - "SENTRY_AUTH_TOKEN": true, - "VERCEL_TOKEN": true, - "NETLIFY_AUTH_TOKEN": true, - "CLOUDFLARE_API_TOKEN": true, - "HEROKU_API_KEY": true, - "RAILWAY_TOKEN": true, - "FLY_API_TOKEN": true, + "CLERK_SECRET_KEY": true, // Clerk — popular Next.js auth, auth bypass risk + "WORKOS_API_KEY": true, // WorkOS — enterprise SSO + + // Observability + "DATADOG_API_KEY": true, + "SENTRY_AUTH_TOKEN": true, + "NEW_RELIC_LICENSE_KEY": true, // New Relic — APM data, log exfil + "NEW_RELIC_API_KEY": true, // New Relic user/account API key + "GRAFANA_API_KEY": true, // Grafana Cloud + "GRAFANA_TOKEN": true, // Grafana Cloud access policy token + "HONEYCOMB_API_KEY": true, // Honeycomb — trace data + + // Cloud / hosting / IaC + "VERCEL_TOKEN": true, + "NETLIFY_AUTH_TOKEN": true, + "CLOUDFLARE_API_TOKEN": true, + "HEROKU_API_KEY": true, + "RAILWAY_TOKEN": true, + "FLY_API_TOKEN": true, + "DIGITALOCEAN_TOKEN": true, // DigitalOcean — full infra control + "DO_API_TOKEN": true, // DigitalOcean alternative env var name + "LINODE_TOKEN": true, // Linode/Akamai — full infra control + "RENDER_API_KEY": true, // Render — deploy platform access + "PULUMI_ACCESS_TOKEN": true, // Pulumi — IaC state = all infra secrets + "TFE_TOKEN": true, // Terraform Cloud — IaC state + "CLOUDINARY_API_SECRET": true, // Cloudinary — media storage // Databases (connection strings often embed credentials) - "DATABASE_URL": true, - "MONGODB_URI": true, - "REDIS_URL": true, + "DATABASE_URL": true, + "MONGODB_URI": true, + "REDIS_URL": true, + "MONGODB_ATLAS_PRIVATE_KEY": true, // MongoDB Atlas admin API (separate from connection string) + "TURSO_AUTH_TOKEN": true, // Turso — SQLite-at-edge DB access + "UPSTASH_REDIS_REST_TOKEN": true, // Upstash — serverless Redis/Kafka + "ELASTIC_API_KEY": true, // Elasticsearch — data 
exfil risk + "ELASTIC_CLOUD_API_KEY": true, // Elastic Cloud management API + + // CRM / e-commerce / dev tools + "ATLASSIAN_API_TOKEN": true, // Atlassian/Jira — project data, PII + "JIRA_API_TOKEN": true, // Jira alternative env var + "HUBSPOT_ACCESS_TOKEN": true, // HubSpot CRM — customer PII + sales data + "SALESFORCE_CLIENT_SECRET": true, // Salesforce — enterprise CRM + "SHOPIFY_API_SECRET_KEY": true, // Shopify — store orders, customer PII + "ZENDESK_API_TOKEN": true, // Zendesk — customer support PII + "INTERCOM_ACCESS_TOKEN": true, // Intercom — customer chat PII + "SEGMENT_WRITE_KEY": true, // Segment — all customer behavioral events + "ALGOLIA_API_KEY": true, // Algolia — search index admin access } // Summarise computes a Summary from a slice of ScanResults. From 33764097c0c4f1553c8c783beee8683097904d2d Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 11:27:53 +0100 Subject: [PATCH 15/17] refactor: move HighRiskEnvKeys to apikeys.go, K8SProdPatterns to local.go --- internal/scan/apikeys.go | 122 ++++++++++++++++++++++++++++++++++++++ internal/scan/local.go | 3 + internal/scan/scan.go | 125 --------------------------------------- 3 files changed, 125 insertions(+), 125 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index a3653e7..78cad9f 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -13,6 +13,128 @@ import ( "github.com/Pringled/agentcheck/internal/models" ) +// HighRiskEnvKeys is the set of environment variable names that indicate +// high-value credentials are present in the shell. Values are NEVER read or logged; +// only the key name is used. 
+var HighRiskEnvKeys = map[string]bool{ + // AI / ML inference + "OPENAI_API_KEY": true, + "ANTHROPIC_API_KEY": true, + "COHERE_API_KEY": true, + "MISTRAL_API_KEY": true, + "REPLICATE_API_KEY": true, + "HUGGINGFACE_TOKEN": true, + "HF_TOKEN": true, // Hugging Face canonical short name (used by huggingface-hub) + "TOGETHER_API_KEY": true, + "GROQ_API_KEY": true, + "VOYAGE_API_KEY": true, + "ELEVEN_LABS_API_KEY": true, + "PINECONE_API_KEY": true, + "STABILITY_API_KEY": true, // Stability AI — pay-per-image generation + "WANDB_API_KEY": true, // Weights & Biases — model/experiment data + "TAVILY_API_KEY": true, // Tavily search — widely used in LangChain/LangGraph agents + "LANGCHAIN_API_KEY": true, // LangSmith tracing (LangChain ecosystem) + "AZURE_OPENAI_API_KEY": true, // Azure OpenAI — distinct from service principal creds + "FAL_KEY": true, // fal.ai — GPU inference, financial risk + "NVIDIA_API_KEY": true, // NVIDIA NIM — enterprise GPU inference + + // Cloud: env-based credentials + "AWS_ACCESS_KEY_ID": true, + "AWS_SECRET_ACCESS_KEY": true, + "AWS_SESSION_TOKEN": true, + "GOOGLE_APPLICATION_CREDENTIALS": true, + "AZURE_CLIENT_SECRET": true, + "AZURE_CLIENT_ID": true, + "AZURE_TENANT_ID": true, + + // Secrets managers (key to all other secrets) + "VAULT_TOKEN": true, // HashiCorp Vault — grants access to all managed secrets + "OP_SERVICE_ACCOUNT_TOKEN": true, // 1Password Connect service account + "OP_CONNECT_TOKEN": true, // 1Password Connect API token + "INFISICAL_TOKEN": true, // Infisical secrets manager + + // Source control & CI/CD + "GITHUB_TOKEN": true, + "GITLAB_TOKEN": true, + "BITBUCKET_APP_PASSWORD": true, + "NPM_TOKEN": true, + "PYPI_API_TOKEN": true, + "CIRCLE_TOKEN": true, // CircleCI — CI supply chain attack surface + + // Payment + "STRIPE_SECRET_KEY": true, + "BRAINTREE_PRIVATE_KEY": true, + "PAYPAL_CLIENT_SECRET": true, + "SQUARE_ACCESS_TOKEN": true, + "ADYEN_API_KEY": true, // Adyen — enterprise e-commerce payments + "RAZORPAY_KEY_SECRET": 
true, // Razorpay — dominant in South/SE Asia + "MOLLIE_API_KEY": true, // Mollie — dominant in EU + "PADDLE_API_KEY": true, // Paddle — SaaS billing + + // Messaging & comms (can exfiltrate data or send spam at scale) + "TWILIO_AUTH_TOKEN": true, + "SENDGRID_API_KEY": true, + "MAILGUN_API_KEY": true, + "SLACK_BOT_TOKEN": true, + "DISCORD_BOT_TOKEN": true, + "VONAGE_API_SECRET": true, // Vonage/Nexmo — SMS/voice telephony + "KLAVIYO_API_KEY": true, // Klaviyo — e-commerce email, customer PII + "MAILCHIMP_API_KEY": true, // Mailchimp — customer lists, PII + "CUSTOMERIO_API_KEY": true, // Customer.io — behavioral marketing, PII + "BREVO_API_KEY": true, // Brevo (Sendinblue) — EU email, PII + + // Identity & auth + "OKTA_API_TOKEN": true, + "AUTH0_CLIENT_SECRET": true, + "CLERK_SECRET_KEY": true, // Clerk — popular Next.js auth, auth bypass risk + "WORKOS_API_KEY": true, // WorkOS — enterprise SSO + + // Observability + "DATADOG_API_KEY": true, + "SENTRY_AUTH_TOKEN": true, + "NEW_RELIC_LICENSE_KEY": true, // New Relic — APM data, log exfil + "NEW_RELIC_API_KEY": true, // New Relic user/account API key + "GRAFANA_API_KEY": true, // Grafana Cloud + "GRAFANA_TOKEN": true, // Grafana Cloud access policy token + "HONEYCOMB_API_KEY": true, // Honeycomb — trace data + + // Cloud / hosting / IaC + "VERCEL_TOKEN": true, + "NETLIFY_AUTH_TOKEN": true, + "CLOUDFLARE_API_TOKEN": true, + "HEROKU_API_KEY": true, + "RAILWAY_TOKEN": true, + "FLY_API_TOKEN": true, + "DIGITALOCEAN_TOKEN": true, // DigitalOcean — full infra control + "DO_API_TOKEN": true, // DigitalOcean alternative env var name + "LINODE_TOKEN": true, // Linode/Akamai — full infra control + "RENDER_API_KEY": true, // Render — deploy platform access + "PULUMI_ACCESS_TOKEN": true, // Pulumi — IaC state = all infra secrets + "TFE_TOKEN": true, // Terraform Cloud — IaC state + "CLOUDINARY_API_SECRET": true, // Cloudinary — media storage + + // Databases (connection strings often embed credentials) + "DATABASE_URL": true, + 
"MONGODB_URI": true, + "REDIS_URL": true, + "MONGODB_ATLAS_PRIVATE_KEY": true, // MongoDB Atlas admin API (separate from connection string) + "TURSO_AUTH_TOKEN": true, // Turso — SQLite-at-edge DB access + "UPSTASH_REDIS_REST_TOKEN": true, // Upstash — serverless Redis/Kafka + "ELASTIC_API_KEY": true, // Elasticsearch — data exfil risk + "ELASTIC_CLOUD_API_KEY": true, // Elastic Cloud management API + + // CRM / e-commerce / dev tools + "ATLASSIAN_API_TOKEN": true, // Atlassian/Jira — project data, PII + "JIRA_API_TOKEN": true, // Jira alternative env var + "HUBSPOT_ACCESS_TOKEN": true, // HubSpot CRM — customer PII + sales data + "SALESFORCE_CLIENT_SECRET": true, // Salesforce — enterprise CRM + "SHOPIFY_API_SECRET_KEY": true, // Shopify — store orders, customer PII + "ZENDESK_API_TOKEN": true, // Zendesk — customer support PII + "INTERCOM_ACCESS_TOKEN": true, // Intercom — customer chat PII + "SEGMENT_WRITE_KEY": true, // Segment — all customer behavioral events + "ALGOLIA_API_KEY": true, // Algolia — search index admin access +} + // credentialSuffixRe matches env var names that contain a credential-related term. // Provider name patterns require this suffix to avoid false positives on non-credential // vars like GITHUB_WORKSPACE or OPENAI_BASE_URL. diff --git a/internal/scan/local.go b/internal/scan/local.go index 4e93a6f..e55a1af 100644 --- a/internal/scan/local.go +++ b/internal/scan/local.go @@ -10,6 +10,9 @@ import ( "github.com/Pringled/agentcheck/internal/models" ) +// K8SProdPatterns is the set of substrings that identify a Kubernetes context as production. +var K8SProdPatterns = []string{"prod", "production", "prd", "live"} + // toolCheck specifies a simple binary-outcome CLI tool check. // A check runs cmd and produces: // - confirmedFinding when rc == 0 (tool confirmed accessible/authenticated). 
diff --git a/internal/scan/scan.go b/internal/scan/scan.go index e8f35fd..d27209a 100644 --- a/internal/scan/scan.go +++ b/internal/scan/scan.go @@ -148,131 +148,6 @@ func runScanner(sc Scanner) (result models.ScanResult) { return sc.Scan() } -// K8SProdPatterns is the set of substrings that identify a Kubernetes context as production. -var K8SProdPatterns = []string{"prod", "production", "prd", "live"} - -// HighRiskEnvKeys is the set of environment variable names that indicate -// high-value credentials are present in the shell. Values are NEVER read or logged; -// only the key name is used. -var HighRiskEnvKeys = map[string]bool{ - // AI / ML inference - "OPENAI_API_KEY": true, - "ANTHROPIC_API_KEY": true, - "COHERE_API_KEY": true, - "MISTRAL_API_KEY": true, - "REPLICATE_API_KEY": true, - "HUGGINGFACE_TOKEN": true, - "HF_TOKEN": true, // Hugging Face canonical short name (used by huggingface-hub) - "TOGETHER_API_KEY": true, - "GROQ_API_KEY": true, - "VOYAGE_API_KEY": true, - "ELEVEN_LABS_API_KEY": true, - "PINECONE_API_KEY": true, - "STABILITY_API_KEY": true, // Stability AI — pay-per-image generation - "WANDB_API_KEY": true, // Weights & Biases — model/experiment data - "TAVILY_API_KEY": true, // Tavily search — widely used in LangChain/LangGraph agents - "LANGCHAIN_API_KEY": true, // LangSmith tracing (LangChain ecosystem) - "AZURE_OPENAI_API_KEY": true, // Azure OpenAI — distinct from service principal creds - "FAL_KEY": true, // fal.ai — GPU inference, financial risk - "NVIDIA_API_KEY": true, // NVIDIA NIM — enterprise GPU inference - - // Cloud: env-based credentials - "AWS_ACCESS_KEY_ID": true, - "AWS_SECRET_ACCESS_KEY": true, - "AWS_SESSION_TOKEN": true, - "GOOGLE_APPLICATION_CREDENTIALS": true, - "AZURE_CLIENT_SECRET": true, - "AZURE_CLIENT_ID": true, - "AZURE_TENANT_ID": true, - - // Secrets managers (key to all other secrets) - "VAULT_TOKEN": true, // HashiCorp Vault — grants access to all managed secrets - "OP_SERVICE_ACCOUNT_TOKEN": true, // 
1Password Connect service account - "OP_CONNECT_TOKEN": true, // 1Password Connect API token - "INFISICAL_TOKEN": true, // Infisical secrets manager - - // Source control & CI/CD - "GITHUB_TOKEN": true, - "GITLAB_TOKEN": true, - "BITBUCKET_APP_PASSWORD": true, - "NPM_TOKEN": true, - "PYPI_API_TOKEN": true, - "CIRCLE_TOKEN": true, // CircleCI — CI supply chain attack surface - - // Payment - "STRIPE_SECRET_KEY": true, - "BRAINTREE_PRIVATE_KEY": true, - "PAYPAL_CLIENT_SECRET": true, - "SQUARE_ACCESS_TOKEN": true, - "ADYEN_API_KEY": true, // Adyen — enterprise e-commerce payments - "RAZORPAY_KEY_SECRET": true, // Razorpay — dominant in South/SE Asia - "MOLLIE_API_KEY": true, // Mollie — dominant in EU - "PADDLE_API_KEY": true, // Paddle — SaaS billing - - // Messaging & comms (can exfiltrate data or send spam at scale) - "TWILIO_AUTH_TOKEN": true, - "SENDGRID_API_KEY": true, - "MAILGUN_API_KEY": true, - "SLACK_BOT_TOKEN": true, - "DISCORD_BOT_TOKEN": true, - "VONAGE_API_SECRET": true, // Vonage/Nexmo — SMS/voice telephony - "KLAVIYO_API_KEY": true, // Klaviyo — e-commerce email, customer PII - "MAILCHIMP_API_KEY": true, // Mailchimp — customer lists, PII - "CUSTOMERIO_API_KEY": true, // Customer.io — behavioral marketing, PII - "BREVO_API_KEY": true, // Brevo (Sendinblue) — EU email, PII - - // Identity & auth - "OKTA_API_TOKEN": true, - "AUTH0_CLIENT_SECRET": true, - "CLERK_SECRET_KEY": true, // Clerk — popular Next.js auth, auth bypass risk - "WORKOS_API_KEY": true, // WorkOS — enterprise SSO - - // Observability - "DATADOG_API_KEY": true, - "SENTRY_AUTH_TOKEN": true, - "NEW_RELIC_LICENSE_KEY": true, // New Relic — APM data, log exfil - "NEW_RELIC_API_KEY": true, // New Relic user/account API key - "GRAFANA_API_KEY": true, // Grafana Cloud - "GRAFANA_TOKEN": true, // Grafana Cloud access policy token - "HONEYCOMB_API_KEY": true, // Honeycomb — trace data - - // Cloud / hosting / IaC - "VERCEL_TOKEN": true, - "NETLIFY_AUTH_TOKEN": true, - "CLOUDFLARE_API_TOKEN": 
true, - "HEROKU_API_KEY": true, - "RAILWAY_TOKEN": true, - "FLY_API_TOKEN": true, - "DIGITALOCEAN_TOKEN": true, // DigitalOcean — full infra control - "DO_API_TOKEN": true, // DigitalOcean alternative env var name - "LINODE_TOKEN": true, // Linode/Akamai — full infra control - "RENDER_API_KEY": true, // Render — deploy platform access - "PULUMI_ACCESS_TOKEN": true, // Pulumi — IaC state = all infra secrets - "TFE_TOKEN": true, // Terraform Cloud — IaC state - "CLOUDINARY_API_SECRET": true, // Cloudinary — media storage - - // Databases (connection strings often embed credentials) - "DATABASE_URL": true, - "MONGODB_URI": true, - "REDIS_URL": true, - "MONGODB_ATLAS_PRIVATE_KEY": true, // MongoDB Atlas admin API (separate from connection string) - "TURSO_AUTH_TOKEN": true, // Turso — SQLite-at-edge DB access - "UPSTASH_REDIS_REST_TOKEN": true, // Upstash — serverless Redis/Kafka - "ELASTIC_API_KEY": true, // Elasticsearch — data exfil risk - "ELASTIC_CLOUD_API_KEY": true, // Elastic Cloud management API - - // CRM / e-commerce / dev tools - "ATLASSIAN_API_TOKEN": true, // Atlassian/Jira — project data, PII - "JIRA_API_TOKEN": true, // Jira alternative env var - "HUBSPOT_ACCESS_TOKEN": true, // HubSpot CRM — customer PII + sales data - "SALESFORCE_CLIENT_SECRET": true, // Salesforce — enterprise CRM - "SHOPIFY_API_SECRET_KEY": true, // Shopify — store orders, customer PII - "ZENDESK_API_TOKEN": true, // Zendesk — customer support PII - "INTERCOM_ACCESS_TOKEN": true, // Intercom — customer chat PII - "SEGMENT_WRITE_KEY": true, // Segment — all customer behavioral events - "ALGOLIA_API_KEY": true, // Algolia — search index admin access -} - // Summarise computes a Summary from a slice of ScanResults. // UNCERTAIN findings contribute to the uncertain count but not to findings_total, // since they represent incomplete checks rather than confirmed findings. 
From 1e5ea56058e773e9f24c706cc88b9e8180920a5a Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 11:32:53 +0100 Subject: [PATCH 16/17] fix: remove AZURE_CLIENT_ID and AZURE_TENANT_ID from HighRiskEnvKeys These are public identifiers, not secrets. AZURE_CLIENT_SECRET remains. Reporting non-secret IDs at HIGH severity produces false positives for any user running az login interactively. --- internal/scan/apikeys.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index 78cad9f..89c1414 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -44,8 +44,6 @@ var HighRiskEnvKeys = map[string]bool{ "AWS_SESSION_TOKEN": true, "GOOGLE_APPLICATION_CREDENTIALS": true, "AZURE_CLIENT_SECRET": true, - "AZURE_CLIENT_ID": true, - "AZURE_TENANT_ID": true, // Secrets managers (key to all other secrets) "VAULT_TOKEN": true, // HashiCorp Vault — grants access to all managed secrets From bcb29e6e70eca621fa779442b54bdf4e3e5f667b Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 7 Mar 2026 11:33:53 +0100 Subject: [PATCH 17/17] fix: drop CIRCLECI regex from providerNamePatterns CIRCLE_TOKEN is the only real CircleCI credential var and is already covered by the exact-match in HighRiskEnvKeys. The CIRCLECI regex pattern added no meaningful coverage. --- internal/scan/apikeys.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/scan/apikeys.go b/internal/scan/apikeys.go index 89c1414..f73dd03 100644 --- a/internal/scan/apikeys.go +++ b/internal/scan/apikeys.go @@ -231,7 +231,6 @@ var providerNamePatterns = []*regexp.Regexp{ regexp.MustCompile(`(?i)GITHUB`), regexp.MustCompile(`(?i)GITLAB`), regexp.MustCompile(`(?i)BITBUCKET`), - regexp.MustCompile(`(?i)CIRCLECI`), // CircleCI — CI supply chain // Productivity / project tools regexp.MustCompile(`(?i)(^|_)LINEAR_`), // (^|_) anchor avoids mid-word false positives regexp.MustCompile(`(?i)NOTION`),