From 5c4dd89b266d871817b64e2e2aab9916b4c1e0bc Mon Sep 17 00:00:00 2001 From: henry Date: Sat, 28 Feb 2026 14:19:54 +0800 Subject: [PATCH 1/4] feat(create): add chat-archive command with dedup manifest --- cmd/create/chat_archive.go | 251 +++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 cmd/create/chat_archive.go diff --git a/cmd/create/chat_archive.go b/cmd/create/chat_archive.go new file mode 100644 index 00000000..242ea721 --- /dev/null +++ b/cmd/create/chat_archive.go @@ -0,0 +1,251 @@ +package create + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "os" + "path/filepath" + "sort" + "strings" + "time" + + eos "github.com/CodeMonkeyCybersecurity/eos/pkg/eos_cli" + "github.com/CodeMonkeyCybersecurity/eos/pkg/eos_io" + "github.com/spf13/cobra" +) + +type chatArchiveEntry struct { + SourcePath string `json:"source_path"` + DestPath string `json:"dest_path"` + SHA256 string `json:"sha256"` + SizeBytes int64 `json:"size_bytes"` + DuplicateOf string `json:"duplicate_of,omitempty"` + Copied bool `json:"copied"` + Conversation string `json:"conversation,omitempty"` +} + +type chatArchiveManifest struct { + GeneratedAt string `json:"generated_at"` + Sources []string `json:"sources"` + DestDir string `json:"dest_dir"` + Entries []chatArchiveEntry `json:"entries"` +} + +var CreateChatArchiveCmd = &cobra.Command{ + Use: "chat-archive", + Short: "Copy and deduplicate chat transcripts into a local archive", + Long: `Find transcript-like files (jsonl/json/html), copy unique files into one archive, +and write an index manifest with duplicate mappings. 
+ +Examples: + eos create chat-archive + eos create chat-archive --source ~/.openclaw/agents/main/sessions --source ~/dev + eos create chat-archive --dest ~/Dev/eos/outputs/chat-archive --dry-run`, + RunE: eos.Wrap(runCreateChatArchive), +} + +func init() { + CreateCmd.AddCommand(CreateChatArchiveCmd) + CreateChatArchiveCmd.Flags().StringSlice("source", []string{"~/.openclaw/agents/main/sessions", "~/dev"}, "Source directories to scan") + CreateChatArchiveCmd.Flags().String("dest", "~/Dev/eos/outputs/chat-archive", "Destination archive directory") + CreateChatArchiveCmd.Flags().Bool("dry-run", false, "Show what would be archived without copying files") +} + +func runCreateChatArchive(rc *eos_io.RuntimeContext, cmd *cobra.Command, args []string) error { + sources, _ := cmd.Flags().GetStringSlice("source") + dest, _ := cmd.Flags().GetString("dest") + dryRun, _ := cmd.Flags().GetBool("dry-run") + + expandedSources := make([]string, 0, len(sources)) + for _, s := range sources { + expandedSources = append(expandedSources, expandHome(s)) + } + dest = expandHome(dest) + + if !dryRun { + if err := os.MkdirAll(dest, 0o755); err != nil { + return fmt.Errorf("create destination dir: %w", err) + } + } + + files, err := discoverTranscriptFiles(expandedSources) + if err != nil { + return err + } + + byHash := map[string]string{} + manifest := chatArchiveManifest{ + GeneratedAt: time.Now().UTC().Format(time.RFC3339), + Sources: expandedSources, + DestDir: dest, + Entries: make([]chatArchiveEntry, 0, len(files)), + } + + copied := 0 + dups := 0 + for _, src := range files { + hash, size, err := fileSHA256(src) + if err != nil { + continue + } + + conversation := strings.TrimSuffix(filepath.Base(src), filepath.Ext(src)) + entry := chatArchiveEntry{SourcePath: src, SHA256: hash, SizeBytes: size, Conversation: conversation} + + if firstDest, ok := byHash[hash]; ok { + entry.DuplicateOf = firstDest + entry.DestPath = firstDest + entry.Copied = false + dups++ + manifest.Entries = 
append(manifest.Entries, entry) + continue + } + + ext := filepath.Ext(src) + if ext == "" { + ext = ".bin" + } + destFile := filepath.Join(dest, fmt.Sprintf("%s%s", hash[:16], ext)) + entry.DestPath = destFile + entry.Copied = true + + if !dryRun { + if err := copyArchiveFile(src, destFile); err != nil { + return fmt.Errorf("copy %s -> %s: %w", src, destFile, err) + } + } + + byHash[hash] = destFile + copied++ + manifest.Entries = append(manifest.Entries, entry) + } + + if !dryRun { + manifestPath := filepath.Join(dest, "manifest.json") + b, err := json.MarshalIndent(manifest, "", " ") + if err != nil { + return fmt.Errorf("marshal manifest: %w", err) + } + if err := os.WriteFile(manifestPath, b, 0o644); err != nil { + return fmt.Errorf("write manifest: %w", err) + } + fmt.Printf("Archive complete. %d unique files copied, %d duplicates mapped.\n", copied, dups) + fmt.Printf("Manifest: %s\n", manifestPath) + } else { + fmt.Printf("Dry run complete. %d unique files, %d duplicates.\n", copied, dups) + } + + _ = rc + return nil +} + +func discoverTranscriptFiles(roots []string) ([]string, error) { + out := make([]string, 0) + seen := map[string]struct{}{} + + isCandidate := func(path string) bool { + lp := strings.ToLower(path) + if strings.HasSuffix(lp, ".jsonl") || strings.HasSuffix(lp, ".chat") { + return true + } + if strings.HasSuffix(lp, ".html") && strings.Contains(lp, "chat") { + return true + } + if strings.HasSuffix(lp, ".json") { + base := strings.ToLower(filepath.Base(path)) + if !strings.Contains(base, "chat") && !strings.Contains(base, "conversation") && !strings.Contains(base, "session") && !strings.Contains(base, "transcript") { + return false + } + b, err := os.ReadFile(path) + if err != nil { + return false + } + h := strings.ToLower(string(b)) + hasMessages := strings.Contains(h, "\"messages\"") + hasRole := strings.Contains(h, "\"role\"") + hasContent := strings.Contains(h, "\"content\"") + hasConversation := strings.Contains(h, "\"conversation\"") 
+ return (hasMessages && (hasRole || hasContent)) || (hasConversation && hasContent) + } + return false + } + + for _, root := range roots { + info, err := os.Stat(root) + if err != nil || !info.IsDir() { + continue + } + err = filepath.WalkDir(root, func(path string, d os.DirEntry, err error) error { + if err != nil { + return nil + } + if d.IsDir() { + name := strings.ToLower(d.Name()) + if name == ".git" || name == "node_modules" || name == "target" || name == "vendor" { + return filepath.SkipDir + } + return nil + } + if isCandidate(path) { + if _, ok := seen[path]; !ok { + seen[path] = struct{}{} + out = append(out, path) + } + } + return nil + }) + if err != nil { + return nil, err + } + } + + sort.Strings(out) + return out, nil +} + +func fileSHA256(path string) (string, int64, error) { + f, err := os.Open(path) + if err != nil { + return "", 0, err + } + defer f.Close() + + h := sha256.New() + n, err := io.Copy(h, f) + if err != nil { + return "", 0, err + } + return hex.EncodeToString(h.Sum(nil)), n, nil +} + +func copyArchiveFile(src, dst string) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + + out, err := os.Create(dst) + if err != nil { + return err + } + defer out.Close() + + if _, err := io.Copy(out, in); err != nil { + return err + } + return out.Sync() +} + +func expandHome(path string) string { + if strings.HasPrefix(path, "~/") { + home, err := os.UserHomeDir() + if err == nil { + return filepath.Join(home, strings.TrimPrefix(path, "~/")) + } + } + return path +} From 5f60cd3b995ac986fbf5ead1c6dee44e2b72c323 Mon Sep 17 00:00:00 2001 From: henry Date: Sat, 28 Feb 2026 15:39:50 +0800 Subject: [PATCH 2/4] fix(chat-archive): exclude self-archives and tighten transcript detection --- cmd/create/chat_archive.go | 72 +++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 9 deletions(-) diff --git a/cmd/create/chat_archive.go b/cmd/create/chat_archive.go index 242ea721..b4095d14 100644 --- 
a/cmd/create/chat_archive.go +++ b/cmd/create/chat_archive.go @@ -71,7 +71,7 @@ func runCreateChatArchive(rc *eos_io.RuntimeContext, cmd *cobra.Command, args [] } } - files, err := discoverTranscriptFiles(expandedSources) + files, err := discoverTranscriptFiles(expandedSources, dest) if err != nil { return err } @@ -91,6 +91,9 @@ func runCreateChatArchive(rc *eos_io.RuntimeContext, cmd *cobra.Command, args [] if err != nil { continue } + if size == 0 { + continue // skip empty transcript artifacts + } conversation := strings.TrimSuffix(filepath.Base(src), filepath.Ext(src)) entry := chatArchiveEntry{SourcePath: src, SHA256: hash, SizeBytes: size, Conversation: conversation} @@ -108,7 +111,11 @@ func runCreateChatArchive(rc *eos_io.RuntimeContext, cmd *cobra.Command, args [] if ext == "" { ext = ".bin" } - destFile := filepath.Join(dest, fmt.Sprintf("%s%s", hash[:16], ext)) + slug := sanitizeName(strings.TrimSuffix(filepath.Base(src), filepath.Ext(src))) + if slug == "" { + slug = "chat" + } + destFile := filepath.Join(dest, fmt.Sprintf("%s-%s%s", hash[:12], slug, ext)) entry.DestPath = destFile entry.Copied = true @@ -142,21 +149,29 @@ func runCreateChatArchive(rc *eos_io.RuntimeContext, cmd *cobra.Command, args [] return nil } -func discoverTranscriptFiles(roots []string) ([]string, error) { +func discoverTranscriptFiles(roots []string, dest string) ([]string, error) { out := make([]string, 0) seen := map[string]struct{}{} + destAbs := strings.ToLower(filepath.Clean(dest)) isCandidate := func(path string) bool { lp := strings.ToLower(path) - if strings.HasSuffix(lp, ".jsonl") || strings.HasSuffix(lp, ".chat") { - return true + base := strings.ToLower(filepath.Base(path)) + + // Strong path clues first + hasPathClue := strings.Contains(lp, "/.openclaw/") || strings.Contains(lp, "/sessions/") || strings.Contains(lp, "/transcripts/") || strings.Contains(lp, "/chats/") || strings.Contains(lp, "conversation") + + if strings.HasSuffix(lp, ".jsonl") { + return 
hasPathClue || strings.Contains(base, "chat") || strings.Contains(base, "session") || strings.Contains(base, "conversation") || strings.Contains(base, "transcript") } - if strings.HasSuffix(lp, ".html") && strings.Contains(lp, "chat") { + if strings.HasSuffix(lp, ".chat") { return true } + if strings.HasSuffix(lp, ".html") { + return strings.Contains(base, "chat") || strings.Contains(base, "conversation") || strings.Contains(base, "transcript") + } if strings.HasSuffix(lp, ".json") { - base := strings.ToLower(filepath.Base(path)) - if !strings.Contains(base, "chat") && !strings.Contains(base, "conversation") && !strings.Contains(base, "session") && !strings.Contains(base, "transcript") { + if !hasPathClue && !strings.Contains(base, "chat") && !strings.Contains(base, "conversation") && !strings.Contains(base, "session") && !strings.Contains(base, "transcript") { return false } b, err := os.ReadFile(path) @@ -182,9 +197,22 @@ func discoverTranscriptFiles(roots []string) ([]string, error) { if err != nil { return nil } + lpath := strings.ToLower(filepath.Clean(path)) + if lpath == destAbs || strings.HasPrefix(lpath, destAbs+"/") { + if d.IsDir() { + return filepath.SkipDir + } + return nil + } + if strings.Contains(lpath, "/dev/eos/outputs/chat-archive") || strings.Contains(lpath, "/desktop/conversationarchive") { + if d.IsDir() { + return filepath.SkipDir + } + return nil + } if d.IsDir() { name := strings.ToLower(d.Name()) - if name == ".git" || name == "node_modules" || name == "target" || name == "vendor" { + if name == ".git" || name == "node_modules" || name == "target" || name == "vendor" || name == ".cache" { return filepath.SkipDir } return nil @@ -240,6 +268,32 @@ func copyArchiveFile(src, dst string) error { return out.Sync() } +func sanitizeName(s string) string { + s = strings.ToLower(strings.TrimSpace(s)) + if s == "" { + return "" + } + var b strings.Builder + for _, r := range s { + switch { + case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9'): + 
b.WriteRune(r) + case r == '-' || r == '_': + b.WriteRune('-') + case r == ' ': + b.WriteRune('-') + } + } + out := strings.Trim(b.String(), "-") + for strings.Contains(out, "--") { + out = strings.ReplaceAll(out, "--", "-") + } + if len(out) > 40 { + out = out[:40] + } + return out +} + func expandHome(path string) string { if strings.HasPrefix(path, "~/") { home, err := os.UserHomeDir() From 5b54809d2f34ce865a0b3aa0610d30cb2ec55e50 Mon Sep 17 00:00:00 2001 From: henry Date: Sun, 1 Mar 2026 11:32:50 +0800 Subject: [PATCH 3/4] fix(chat-archive): include codex/dev jsonl and memory.md with cleaner recursion filters --- cmd/create/chat_archive.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmd/create/chat_archive.go b/cmd/create/chat_archive.go index b4095d14..3063f43a 100644 --- a/cmd/create/chat_archive.go +++ b/cmd/create/chat_archive.go @@ -49,7 +49,7 @@ Examples: func init() { CreateCmd.AddCommand(CreateChatArchiveCmd) - CreateChatArchiveCmd.Flags().StringSlice("source", []string{"~/.openclaw/agents/main/sessions", "~/dev"}, "Source directories to scan") + CreateChatArchiveCmd.Flags().StringSlice("source", []string{"~/.openclaw/agents/main/sessions", "~/.codex/sessions", "~/Dev", "~/dev"}, "Source directories to scan") CreateChatArchiveCmd.Flags().String("dest", "~/Dev/eos/outputs/chat-archive", "Destination archive directory") CreateChatArchiveCmd.Flags().Bool("dry-run", false, "Show what would be archived without copying files") } @@ -161,7 +161,14 @@ func discoverTranscriptFiles(roots []string, dest string) ([]string, error) { // Strong path clues first hasPathClue := strings.Contains(lp, "/.openclaw/") || strings.Contains(lp, "/sessions/") || strings.Contains(lp, "/transcripts/") || strings.Contains(lp, "/chats/") || strings.Contains(lp, "conversation") + if base == "memory.md" { + return true + } if strings.HasSuffix(lp, ".jsonl") { + // User requirement: archive all JSONL under ~/Dev recursively. 
+ if strings.Contains(lp, "/dev/") { + return true + } return hasPathClue || strings.Contains(base, "chat") || strings.Contains(base, "session") || strings.Contains(base, "conversation") || strings.Contains(base, "transcript") } if strings.HasSuffix(lp, ".chat") { @@ -212,7 +219,7 @@ func discoverTranscriptFiles(roots []string, dest string) ([]string, error) { } if d.IsDir() { name := strings.ToLower(d.Name()) - if name == ".git" || name == "node_modules" || name == "target" || name == "vendor" || name == ".cache" { + if name == ".git" || name == "node_modules" || name == "target" || name == "vendor" || name == ".cache" || name == "outputs" || name == "dist" || name == "build" { return filepath.SkipDir } return nil From 2dce1688c0c059c3e0d6bb3d5aee43367c729499 Mon Sep 17 00:00:00 2001 From: henry Date: Sat, 14 Mar 2026 21:13:31 +0800 Subject: [PATCH 4/4] docs(governance): thin-proxy CLAUDE.md refactor (refs #75) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace 1833-line CLAUDE.md monolith with 75-line thin proxy that @imports governance contracts from cybermonkey/prompts submodule. Changes: - Add prompts/ git submodule (ssh://git@vhost7:9001/cybermonkey/prompts.git) - CLAUDE.md: 1833 lines → 75 lines (within 200-line budget) - .claude/rules/go-patterns.md: architecture, constants, logging, idempotency - .claude/rules/cli-patterns.md: cmd/pkg separation, flag validation, human-centric input - .claude/rules/secrets-vault.md: Vault/Consul patterns, token auth hierarchy - .claude/rules/debugging.md: diagnostic logging, evidence collection Path-scoped rules load only when touching relevant files, preventing context saturation that caused agents to ignore most of the old CLAUDE.md. 
Co-Authored-By: Claude Sonnet 4.6 --- .claude/rules/cli-patterns.md | 128 +++ .claude/rules/debugging.md | 96 ++ .claude/rules/go-patterns.md | 167 +++ .claude/rules/secrets-vault.md | 126 +++ .gitmodules | 3 + CLAUDE.md | 1861 +------------------------------- prompts | 1 + 7 files changed, 572 insertions(+), 1810 deletions(-) create mode 100644 .claude/rules/cli-patterns.md create mode 100644 .claude/rules/debugging.md create mode 100644 .claude/rules/go-patterns.md create mode 100644 .claude/rules/secrets-vault.md create mode 100644 .gitmodules create mode 160000 prompts diff --git a/.claude/rules/cli-patterns.md b/.claude/rules/cli-patterns.md new file mode 100644 index 00000000..c587f8b6 --- /dev/null +++ b/.claude/rules/cli-patterns.md @@ -0,0 +1,128 @@ +--- +description: Eos CLI patterns — cobra commands, flag validation, human-centric input handling +paths: + - "cmd/**/*.go" + - "pkg/interaction/**" + - "pkg/verify/**" +--- + +# Eos CLI Patterns + +## Command Structure + +Verb-first with flag-based operations: +``` +eos [verb] [noun] --[operation] [target] [--flags...] + +eos update hecate --add bionicgpt --dns example.com +eos update vault --fix --dry-run +eos delete env production --force +``` + +Exception: standard CRUD verbs use positional args: +``` +eos update services start nginx # 'start' is a verb, not an operation flag +``` + +## Human-Centric Flag Handling (P0 — Breaking) + +If a required flag is missing, NEVER fail immediately. ALWAYS offer interactive fallback with informed consent. + +**Violation**: `if flag == "" { return fmt.Errorf("--token is required") }` + +**Correct pattern** — use the full fallback chain: +1. CLI flag (if explicitly set via `cmd.Flags().Changed()`) +2. Environment variable (if configured) +3. Interactive prompt (if TTY available, with help text explaining WHY and HOW) +4. Default value (if `AllowEmpty` is true) +5. 
Error with clear remediation steps (non-interactive mode only) + +```go +// CORRECT: Human-centric with fallback chain +tokenFlag, _ := cmd.Flags().GetString("token") +tokenWasSet := cmd.Flags().Changed("token") + +result, err := interaction.GetRequiredString(rc, tokenFlag, tokenWasSet, &interaction.RequiredFlagConfig{ + FlagName: "token", + EnvVarName: "VAULT_TOKEN", + PromptMessage: "Enter Vault root token: ", + HelpText: "Required for cluster operations. Get via: vault token create", + IsSecret: true, +}) +if err != nil { + return fmt.Errorf("failed to get vault token: %w", err) +} +logger.Info("Using Vault token", zap.String("source", string(result.Source))) +``` + +Required elements: +- **Help text**: WHY is this needed? HOW to get the value? +- **Source logging**: always log which fallback was used (CLI/env/prompt/default) +- **Validation**: validate input, retry with clear guidance (max 3 attempts) +- **Security**: `IsSecret: true` for passwords/tokens (no terminal echo) + +## Missing Dependencies (P0 — Breaking) + +NEVER error out immediately when a dependency is missing. ALWAYS offer informed consent to install: +```go +interaction.CheckDependencyWithPrompt(rc, interaction.DependencyConfig{ + Name: "docker", + Description: "Container runtime required for service deployment", + InstallCmd: "curl -fsSL https://get.docker.com | sh", + AskConsent: true, +}) +``` + +## Flag Bypass Vulnerability Prevention (P0 — Breaking) + +Cobra's `--` separator stops flag parsing and passes everything as positional args. This bypasses safety flags. 
+ +**Vulnerable pattern** (user types `eos delete env prod -- --force`): +- Cobra sees args: `["prod", "--force"]` — flags are NOT set +- `--force` check passes silently — production deleted without confirmation + +**MANDATORY MITIGATION**: ALL commands accepting positional arguments MUST call `verify.ValidateNoFlagLikeArgs` as the first line of `RunE`: + +```go +RunE: eos.Wrap(func(rc *eos_io.RuntimeContext, cmd *cobra.Command, args []string) error { + logger := otelzap.Ctx(rc.Ctx) + + // CRITICAL: Detect flag-like args passed as positional (-- bypass) + if err := verify.ValidateNoFlagLikeArgs(args); err != nil { + return err // clear user-facing error with remediation + } + + // rest of command logic... +}) +``` + +Affected command types (any using `cobra.ExactArgs`, `cobra.MaximumNArgs`, `cobra.MinimumNArgs`): +- Safety-critical: `cmd/delete/`, `cmd/promote/` — production deletion, approval overrides +- All others: `cmd/backup/`, `cmd/create/`, `cmd/update/` + +See `pkg/verify/validators.go:ValidateNoFlagLikeArgs` for implementation. + +## Drift Correction Pattern + +Services that drift from canonical state (wrong permissions, config values): +``` +eos update --fix # detect and correct drift +eos update --fix --dry-run # preview corrections without applying +``` + +NEVER create separate `eos fix ` commands — use `--fix` flag on existing `eos update` commands. + +## Configuration Drift Decision + +``` +Service has drifted? +├─ Use: eos update --fix +├─ Compares: Current state vs. canonical state from eos create +├─ Corrects: Permissions, ownership, config values +└─ Verifies: Post-fix state matches canonical + +Want to check only? 
+└─ Use: eos update --fix --dry-run + +DEPRECATED: eos fix vault → use eos update vault --fix +``` diff --git a/.claude/rules/debugging.md b/.claude/rules/debugging.md new file mode 100644 index 00000000..f1058171 --- /dev/null +++ b/.claude/rules/debugging.md @@ -0,0 +1,96 @@ +--- +description: Eos debugging patterns — diagnostic logging, debug commands, evidence collection +paths: + - "cmd/debug/**" + - "pkg/**/*.go" +--- + +# Debugging Patterns + +## Diagnostic Logging Strategy + +In `cmd/debug/` handlers, use two distinct output modes: + +| Phase | Output method | Purpose | +|---|---|---| +| Diagnostic checks (health, config validation) | `logger.Info/Warn/Error(...)` | Structured — captured by telemetry | +| Progress indicators | `logger.Debug(...)` or `logger.Info(...)` | Visible to user in real-time | +| Issue detection | `logger.Warn/Error(...)` with zap fields | Structured error data | +| **Final report rendering** | `fmt.Print(report.Render())` ONLY | Terminal-formatted output AFTER telemetry | + +```go +// CORRECT: cmd/debug handler pattern +func runVaultDiagnostic(rc *eos_io.RuntimeContext) error { + logger := otelzap.Ctx(rc.Ctx) + + // Phase 1: diagnostics via structured logger (telemetry captured) + logger.Info("Checking Vault seal status") + sealed, err := vault.CheckSealStatus(rc) + if err != nil { + logger.Error("Failed to check seal status", zap.Error(err)) + } + logger.Info("Vault seal status", zap.Bool("sealed", sealed)) + + // Phase 2: terminal-formatted report ONLY after all diagnostics done + report := buildVaultReport(sealed, ...) + fmt.Print(report.Render()) // OK here — final output only + return nil +} +``` + +## Evidence Collection + +When collecting diagnostic evidence, capture: +1. **State**: current configuration, running services, connectivity +2. **Timestamps**: when check was performed, service start times +3. **Context**: environment variables (redacted secrets), config file hashes +4. 
**Errors**: full error chains including root cause + +```go +// Evidence struct pattern +type DiagnosticEvidence struct { + Timestamp time.Time `json:"timestamp"` + ServiceName string `json:"service_name"` + Checks []CheckResult `json:"checks"` + Errors []string `json:"errors"` + Config map[string]string `json:"config"` // no secret values +} +``` + +## Debug Command Structure + +Debug commands live in `cmd/debug/` and follow this pattern: + +``` +eos debug [service] # full diagnostic check +eos debug [service] --fix # diagnose and attempt auto-remediation +eos debug [service] --json # machine-readable output for CI/automation +``` + +Output format: +- Human mode (default): coloured terminal report with summary + details +- JSON mode (`--json`): structured JSON for parsing by other tools + +## Automatic Debug Output Capture + +For commands that call external tools (`vault`, `consul`, `docker`): +```go +// Capture stdout+stderr for evidence +cmd := exec.CommandContext(rc.Ctx, "vault", "status") +out, err := cmd.CombinedOutput() +if err != nil { + logger.Error("vault status failed", + zap.Error(err), + zap.String("output", string(out)), // attach full output + ) +} +``` + +## Anti-Patterns + +| Anti-pattern | Why it's wrong | Do this instead | +|---|---|---| +| `fmt.Println("checking vault...")` in diagnostic phase | Bypasses telemetry, no structured fields | `logger.Info("checking vault status")` | +| `fmt.Print(...)` in pkg/ functions | pkg/ functions have no terminal context | Return structured data, let cmd/ render | +| Swallowing errors in diagnostics | Hidden failures give false-positive health | Log and continue: `logger.Warn("...", zap.Error(err))` | +| `log.Fatal(...)` in pkg/ | Kills process without cleanup | Return error, let cmd/ handle exit | diff --git a/.claude/rules/go-patterns.md b/.claude/rules/go-patterns.md new file mode 100644 index 00000000..ca658aa5 --- /dev/null +++ b/.claude/rules/go-patterns.md @@ -0,0 +1,167 @@ +--- +description: Eos Go 
patterns — architecture, constants, logging, idempotency, retry logic +paths: + - "**/*.go" + - "pkg/**/*.go" +--- + +# Eos Go Patterns + +## Architecture: cmd/ vs pkg/ (P0 — Breaking) + +**cmd/**: Orchestration ONLY. +- Define `cobra.Command` with flags +- Parse flags into config struct +- Call `pkg/[feature]/Function(rc, config)` +- Return result — NO business logic +- **If cmd/ file exceeds 100 lines → move logic to pkg/** + +**pkg/**: ALL business logic. +- Pattern: **ASSESS → INTERVENE → EVALUATE** + 1. ASSESS: Check current state + 2. INTERVENE: Apply changes if needed + 3. EVALUATE: Verify and report results +- Always use `*eos_io.RuntimeContext` for all operations + +```go +// Good cmd/ file (thin orchestration) +RunE: eos.Wrap(func(rc *eos_io.RuntimeContext, cmd *cobra.Command, args []string) error { + cfg := &vault.ClusterConfig{Token: tokenFlag} + return vault.UpdateCluster(rc, cfg) // all logic in pkg/ +}) + +// Bad cmd/ file (business logic in cmd/) +RunE: func(cmd *cobra.Command, args []string) error { + client := api.NewClient(...) // WRONG — this belongs in pkg/ + resp, err := client.Do(...) // WRONG + return err +} +``` + +## Logging (P0 — Breaking) + +**ALWAYS** use `otelzap.Ctx(rc.Ctx)` — structured logging goes to terminal AND telemetry. + +**NEVER** use `fmt.Print*` / `fmt.Println` in pkg/ or cmd/ (except one exception below). + +**Exception — cmd/debug/ final report rendering ONLY:** +```go +// CORRECT: diagnostics via logger, final output via fmt +logger.Info("Checking Vault config") // diagnostic — telemetry captured +logger.Warn("Seal status: sealed") // diagnostic +fmt.Print(report.Render()) // ONLY at end, after all telemetry +``` + +## Constants — Single Source of Truth (P0 — Breaking) + +NEVER hardcode literal values. Every value must be a named constant defined in EXACTLY ONE place. 
+ +| Value type | Location | +|------------|----------| +| Port numbers | `pkg/shared/ports.go` | +| Common paths | `pkg/shared/paths.go` | +| Vault paths/URLs | `pkg/vault/constants.go` | +| Consul paths | `pkg/consul/constants.go` | +| Service-specific | `pkg/[service]/constants.go` | + +**FORBIDDEN hardcoded values:** +```go +// WRONG — hardcoded everywhere +os.MkdirAll("/etc/vault.d", 0755) +net.Listen("tcp", "localhost:8200") +exec.Command("systemctl", "start", "vault.service") + +// CORRECT — named constants +os.MkdirAll(vault.VaultConfigDir, vault.VaultDirPerm) +net.Listen("tcp", fmt.Sprintf("%s:%d", shared.LocalhostIP, shared.PortVault)) +exec.Command("systemctl", "start", vault.VaultServiceName) +``` + +**Circular import exception**: Document with `// NOTE: Duplicates B.ConstName to avoid circular import` + +**File permissions** must have security rationale in the constant definition: +```go +// VaultTLSKeyPerm restricts private key access to vault user only. +// RATIONALE: Private keys must not be world-readable. +// SECURITY: Prevents credential theft via filesystem access. +// THREAT MODEL: Mitigates insider threat and container escape attacks. 
+const VaultTLSKeyPerm = 0600 +``` + +## Idempotency (P1) + +All pkg/ operations MUST be safe to run multiple times: +- Check before creating: verify state before applying changes +- Use `os.MkdirAll` not `os.Mkdir` (no error if exists) +- Use upsert patterns for config writes +- Compare current state to desired state before modifying + +## Retry Logic (P1) + +**Transient failures → retry with backoff:** +- Network timeouts, connection refused (service starting) +- Lock contention, resource temporarily unavailable +- HTTP 429/503 (rate limiting, service overloaded) + +**Deterministic failures → fail fast, no retry:** +- Config/validation errors, missing required files +- Authentication failures (wrong credentials) +- Permission denied + +```go +// Transient: retry +err := retry.Do(func() error { + return vault.CheckHealth(rc) +}, retry.Attempts(5), retry.Delay(2*time.Second)) + +// Deterministic: fail fast +if cfg.Token == "" { + return fmt.Errorf("vault token required: %w", ErrMissingConfig) +} +``` + +## Error Context (P1) + +Wrap errors with context at EVERY layer: +```go +// WRONG — no context +return err + +// CORRECT — context at each layer +return fmt.Errorf("failed to initialize vault cluster: %w", err) +``` + +User-facing errors use typed error wrappers: +```go +return eos_err.NewUserError("vault token expired — run: vault token renew") +return eos_err.NewSystemError("vault unsealing failed", err) +``` + +Capture command output in errors: +```go +out, err := cmd.CombinedOutput() +if err != nil { + return fmt.Errorf("command failed: %w\noutput: %s", err, out) +} +``` + +## Code Integration (P0) + +**Before writing new code**, search for existing functionality: +- `grep -r "FunctionName" pkg/` to find existing implementations +- ALWAYS enhance existing functions rather than creating duplicates +- NEVER create a second HTTP client for the same service — add methods to the existing one +- Only deprecate functions if absolutely necessary — prefer evolution over 
replacement +- Verify integration points: ensure new code is wired into existing callers + +## Common Anti-Patterns + +| Anti-pattern | Correct approach | +|---|---| +| `fmt.Println("done")` in pkg/ | `logger.Info("operation complete", zap.String("op", "done"))` | +| New HTTP client for existing service | Add method to existing client in `pkg/[service]/client.go` | +| Hardcoded `"/etc/vault.d"` | Use `vault.VaultConfigDir` constant | +| `os.MkdirAll(dir, 0755)` | Use `vault.VaultDirPerm` or `consul.ConsulDirPerm` | +| Business logic in `cmd/*.go` | Move to `pkg/[feature]/*.go` | +| `_ = someFunc()` (discarding errors) | `if err != nil { return fmt.Errorf("...: %w", err) }` | +| Standalone `*.md` docs (except ROADMAP.md, README.md) | Put in inline comments or update ROADMAP.md | diff --git a/.claude/rules/secrets-vault.md b/.claude/rules/secrets-vault.md new file mode 100644 index 00000000..7d74646b --- /dev/null +++ b/.claude/rules/secrets-vault.md @@ -0,0 +1,126 @@ +--- +description: Eos secrets and Vault/Consul patterns — storage, delivery, cluster auth +paths: + - "pkg/vault/**" + - "pkg/consul/**" + - "pkg/secrets/**" + - "cmd/create/**" + - "cmd/update/**" +--- + +# Secrets and Vault/Consul Patterns + +## Storage Decision + +| Data type | Storage | Delivery | +|---|---|---| +| Passwords, API keys, tokens, TLS keys | Vault KV (`secret/[service]/[key]`) | Vault Agent template | +| Feature flags, ports, URLs, log levels | Consul KV (`service/[service]/config/[key]`) | Consul Template or direct read | +| Both secrets + config | Vault + Consul | Consul Template (both backends) | + +**Never**: hardcode credentials, store secrets in env files, or use `.env` without Vault Agent rendering. 
+ +## Secret Storage Pattern + +```go +// At service installation time +secretManager, err := secrets.NewSecretManager(rc, envConfig) +requiredSecrets := map[string]secrets.SecretType{ + "db_password": secrets.SecretTypePassword, + "api_key": secrets.SecretTypeAPIKey, + "jwt_secret": secrets.SecretTypeToken, +} +serviceSecrets, err := secretManager.GetOrGenerateServiceSecrets("myservice", requiredSecrets) +// Stored at: secret/myservice/{db_password,api_key,jwt_secret} +``` + +**Path convention**: `secret/[service-name]/[secret-key]` + +## Vault Cluster Authentication (P1 — Critical) + +### Authentication Hierarchy + +Eos uses a priority-based token resolution chain: + +1. **Explicit CLI token** — `--token` flag (highest priority) +2. **VAULT_TOKEN env var** — operator-set environment variable +3. **Interactive prompt** — TTY-available fallback with help text +4. **Vault Agent token** — file at `/run/eos/vault_agent_eos.token` +5. **AppRole** — programmatic auth with Role ID + Secret ID +6. **Error with remediation** — non-interactive mode, clear steps (lowest priority) + +Use `interaction.GetRequiredString()` for the CLI fallback chain (see cli-patterns.md). + +### Token Validation Sequence + +Before using a Vault token, validate: +1. Check token is non-empty +2. Verify token format (starts with `s.` or `hvs.`) +3. Call `/v1/auth/token/lookup-self` to verify it's not expired +4. Check token policies include required capabilities +5. 
Log token source and expiry for observability
+
+### Token Security
+
+```go
+// NEVER log full token — log only last 4 chars
+logger.Info("Using Vault token", zap.String("token_suffix", token[len(token)-4:]))
+
+// NEVER store token in environment variable after lookup
+// Use the file-based token from Vault Agent: /run/eos/vault_agent_eos.token
+
+// NEVER expose token in error messages
+return fmt.Errorf("vault auth failed — token invalid or expired") // no token value
+```
+
+### Common Vault Auth Pitfalls
+
+| Pitfall | What happens | Fix |
+|---|---|---|
+| Token passed in URL query param | Token in server logs | Use Authorization header |
+| Token stored in VAULT_TOKEN env | Visible via `/proc/<pid>/environ` | Use temp file, delete after use |
+| Not checking token expiry | Silent auth failure mid-operation | Validate with lookup-self before use |
+| AppRole Secret ID reuse | Rotation breaks silently | Use `SecretIDNumUses=1` or short TTL |
+
+## Vault Agent Template (Secrets Only)
+
+Use when: service only needs Vault secrets, no dynamic Consul config.
+
+```hcl
+# /etc/vault.d/templates/myservice.env.ctmpl
+DATABASE_PASSWORD={{ with secret "secret/myservice/db_password" }}{{ .Data.data.value }}{{ end }}
+API_KEY={{ with secret "secret/myservice/api_key" }}{{ .Data.data.value }}{{ end }}
+```
+
+```hcl
+# In vault agent config
+template {
+  source = "/etc/vault.d/templates/myservice.env.ctmpl"
+  destination = "/opt/myservice/.env"
+  perms = "0640"
+  command = "docker compose -f /opt/myservice/docker-compose.yml up -d --force-recreate"
+}
+```
+
+## Consul Template (Secrets + Config)
+
+Use when: service needs both Vault secrets AND dynamic Consul config.
+ +```hcl +# /etc/consul-template.d/myservice.env.ctmpl +PORT={{ key "service/myservice/config/port" }} +ENABLE_RAG={{ key "service/myservice/config/feature_flags/enable_rag" }} +DATABASE_PASSWORD={{ with secret "secret/myservice/db_password" }}{{ .Data.data.value }}{{ end }} +``` + +Reuse the Vault Agent token: `{{ file "/run/eos/vault_agent_eos.token" }}` + +## Docker Operations (P1 — Critical) + +Container operations: ALWAYS use Docker SDK (`github.com/docker/docker/client`), NOT shell exec. + +Docker Compose validation: use `docker.ValidateComposeWithShellFallback(ctx, composeFile, envFile)`: +- Strategy: SDK first (35μs), shell fallback (`docker compose config`) if SDK fails +- NEVER run `docker compose up` without validation first + +Template rendering: use `pkg/templates/render.go` — NEVER `template.New()` scattered in packages. diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..30e74143 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "prompts"] + path = prompts + url = ssh://git@vhost7:9001/cybermonkey/prompts.git diff --git a/CLAUDE.md b/CLAUDE.md index 0c36d26b..fdbfcc04 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,1834 +1,75 @@ -# CLAUDE.md +# Eos -*Last Updated: 2025-11-07* +Go CLI for Ubuntu server administration — Vault, Consul, Nomad, Caddy/Hecate, and containerised services. +Code Monkey Cybersecurity (ABN 77 177 673 061). Dual-licensed: AGPL-3.0-or-later + Do No Harm License. -AI assistant guidance for Eos - A Go-based CLI for Ubuntu server administration by Code Monkey Cybersecurity (ABN 77 177 673 061). +## Commands -**IMPORTANT**: For roadmap, technical debt tracking, and future work planning, see [ROADMAP.md](ROADMAP.md). This file focuses on immediate development standards and patterns. 
- -**RECENT ADDITIONS** (2025-10-28): -- ✅ Caddy Admin API infrastructure completed - see [ROADMAP.md](ROADMAP.md#-technical-debt---caddy-configuration-management-future-direction) -- ✅ QUIC/HTTP3 firewall support (UDP/443) - see [ROADMAP.md](ROADMAP.md#-quichttp3-support---firewall-configuration-2025-10-28) - -## Mission & Values - -### Philosphy -- **Human centric**: Technology serves humans, not the other way around, actionable output, addresses barriers to entry, encourage end-user ducation and self-efficacy, feminist (for example, informed consent), safe effective high-quality -- **Evidence based**: accepts falliblism, error correction, value for time, value for money, decisions grounded in security research and best practices -- **Sustainable innovation**: Maintainable code, comprehensive documentation, iterative improvement, response ready, incorporates recent research and best practice. Solve problems once, encode in Eos, never solve again -- **Collaboration and listening**: adversarial collaboration, transparent decision making, ownership accountability responsibility, open source, codesign - -**Iterative Philosophy**: Eos is built iteratively. We build on what exists, solve complex problems once, encode them in Eos, and never solve them again. Each improvement makes the next one easier. - -**Code Integration Philosophy**: When writing new code, ALWAYS iterate on existing functions rather than creating new ones. Check for existing functionality in the codebase first. If similar functionality exists, enhance it rather than duplicate it. Only deprecate functions if absolutely necessary - prefer evolution over replacement. Ensure all new code is properly wired into existing systems and follows established patterns. - -**HTTP Client Consolidation Rule**: NEVER create separate HTTP clients for the same service. When adding new API endpoints, check for existing HTTP clients first. If a client exists, add methods to it. 
If multiple clients exist (technical debt), create a unified client and deprecate the old ones. Example: Authentik had THREE separate HTTP clients (pkg/authentik/types.go, pkg/hecate/authentik/client.go, pkg/hecate/authentik/export.go) that should have been ONE unified client with shared TLS config, timeouts, and retry logic. See [ROADMAP.md](ROADMAP.md) section "Authentik Client Consolidation" for the consolidation pattern. - -## CRITICAL RULES (P0 - Breaking) - -These violations cause immediate failure: - -1. **Logging**: ONLY use `otelzap.Ctx(rc.Ctx)` - NEVER `fmt.Print*/Println` - - **CRITICAL**: Structured logging (`logger.Info/Warn/Error`) goes to BOTH terminal AND telemetry - - User sees ALL logger output on their terminal in real-time - - `fmt.Println` is unstructured - breaks telemetry, forensics, and debugging - - This is a dev tool - verbose structured output is fine and helps debugging - - **NO EXCEPTIONS**: Always use logger, even for user-facing output - - **Debug Commands Pattern** (`cmd/debug/*.go`): ALL diagnostic checks MUST use structured logging. Final report formatting MAY use `fmt.Print()` ONLY after diagnostics complete. Example: `logger.Info("Checking Vault config")` during check, then `fmt.Print(report.Render())` at end. Rationale: Preserves telemetry for diagnostics while allowing terminal-optimized report rendering. -2. **Architecture**: Business logic in `pkg/`, orchestration ONLY in `cmd/` (see Architecture Enforcement below). Use official and well supported SDKs and APIs where possible. -3. **Pattern**: ALWAYS follow Assess → Intervene → Evaluate in helpers -4. **Context**: Always use `*eos_io.RuntimeContext` for all operations -5. **Completion**: Must pass `go build`, `golangci-lint run`, `go test -v ./pkg/...` -6. **Secrets**: Use `secrets.SecretManager` for credentials - NEVER hardcode. Use `secrets.SecretManager.GetOrGenerateServiceSecrets()` for service secrets. And leverage vault for secrets management. -7. 
**Security**: Complete a red team code review and generic targeted criticism of your work before you commit -8. **Evidence-based, adversarially collaborative** approach always with yourself and with me -9. **READMEs** Put a README.md in each directory to document the purpose of the directory and how to use it. -10. **Pre-commit validation**: ALWAYS run `go build -o /tmp/eos-build ./cmd/` before completing a task. If build fails, fix ALL errors before responding to user. Zero tolerance for compile-time errors. -11. **Code Integration & Iteration (P0 - CRITICAL)**: Before writing new code, search for existing functionality. ALWAYS iterate on and enhance existing functions rather than creating duplicates. Ensure all code is properly wired into existing systems. Only deprecate when absolutely necessary - prefer evolution over replacement. Verify integration points work correctly. -12. **Constants - SINGLE SOURCE OF TRUTH (ZERO HARDCODED VALUES - P0)**: NEVER use hardcoded literal values in code. Each value must be a named constant defined in EXACTLY ONE place. 
- - **Service-specific constants**: `pkg/[service]/constants.go` - - Vault: `pkg/vault/constants.go` - - Consul: `pkg/consul/constants.go` - - Nomad: `pkg/nomad/constants.go` - - **Shared infrastructure**: `pkg/shared/` - - Ports: `pkg/shared/ports.go` - - Common paths: `pkg/shared/paths.go` - - **COMPREHENSIVE list of FORBIDDEN hardcoded values**: - - ✗ **File paths**: `"/usr/local/bin/vault"`, `"/etc/vault.d"`, `"/opt/vault"` - - ✗ **IP addresses**: `"shared.GetInternalHostname"`, `"0.0.0.0"`, `"localhost"` - - ✗ **Port numbers**: `8200`, `8500`, `4646` - - ✗ **Hostnames**: `"localhost"`, `"vault"`, `"consul"` - - ✗ **User/Group names**: `"vault"`, `"consul"`, `"root"` - - ✗ **UID/GID values**: `995`, `0`, `1000` (lookup dynamically via user.Lookup) - - ✗ **File permissions**: `0755`, `0644`, `0600` - - ✗ **Environment variable names**: `"VAULT_ADDR"`, `"CONSUL_HTTP_ADDR"` - - ✗ **Service names**: `"vault.service"`, `"consul.service"` - - ✗ **URLs/Endpoints**: `"https://shared.GetInternalHostname:8200"`, `"/v1/sys/health"` - - ✗ **Timeouts/Durations**: `5 * time.Second`, `30 * time.Minute` - - ✗ **Retry counts**: `5`, `3`, delay values - - ✗ **Storage paths**: `"secret/vault"`, `"service/consul/config"` - - **Violation examples**: - - ✗ `os.MkdirAll("/etc/vault.d", 0755)` → use `vault.VaultConfigDir, vault.VaultDirPerm` - - ✗ `net.Listen("tcp", "shared.GetInternalHostname:8200")` → use `vault.LocalhostIP, shared.PortVault` - - ✗ `exec.Command("systemctl", "start", "vault.service")` → use `vault.VaultServiceName` - - **Circular import exception**: Document with `// NOTE: Duplicates B.ConstName to avoid circular import` - - **Enforcement**: Run monthly audit: `scripts/audit_hardcoded_values.sh` -12. **File Permissions - SECURITY CRITICAL (P0)**: NEVER hardcode chmod/chown permissions (0755, 0600, etc.) in code. Use centralized permission constants. - - **Vault permissions**: ONLY in `pkg/vault/constants.go` (VaultConfigPerm, VaultTLSKeyPerm, etc.) 
- - **Consul permissions**: ONLY in `pkg/consul/constants.go` - - **MUST document security rationale**: Each permission constant must include: - - `// RATIONALE: Why this permission level` - - `// SECURITY: What threats this mitigates` - - `// THREAT MODEL: Attack scenarios prevented` - - **Violation examples**: - - ✗ `os.MkdirAll(dir, 0755)` - use `vault.VaultDirPerm` - - ✗ `os.WriteFile(file, data, 0600)` - use `vault.VaultSecretFilePerm` - - ✗ `os.Chmod(file, 0644)` - use `vault.VaultConfigPerm` - - **Required for**: SOC2, PCI-DSS, HIPAA compliance audits -13. **Required Flag Prompting - HUMAN-CENTRIC (P0 - BREAKING)**: If a required flag is missing, NEVER fail immediately. ALWAYS offer interactive fallback with informed consent. - - **Philosophy**: "Technology serves humans, not the other way around" - missing flags are barriers to entry that violate human-centric design - - **Violation example**: `if flag == "" { return fmt.Errorf("--flag is required") }` ← BREAKS HUMAN-CENTRICITY - - **Correct pattern**: Use fallback chain with informed consent: - 1. CLI flag (if explicitly set via `cmd.Flags().Changed()`) - 2. Environment variable (if configured, e.g., `VAULT_TOKEN`) - 3. Interactive prompt (if TTY available, with help text explaining WHY and HOW) - 4. Default value (if `AllowEmpty` is true and default makes sense) - 5. Error with remediation (if non-interactive mode, include clear steps) - - **Required elements** (all MUST be present): - - ✓ **Help text**: WHY is this required? HOW to get the value? 
(e.g., "Get via: vault token create") - - ✓ **Source logging**: ALWAYS log which fallback was used (CLI/env/prompt) for observability - - ✓ **Validation**: Validate input, retry with clear guidance (max 3 attempts) - - ✓ **Security**: Use `IsSecret: true` for passwords/tokens (no terminal echo) - - ✓ **Non-interactive handling**: Detect early, return error with actionable remediation - - ✓ **Empty detection**: Use `cmd.Flags().Changed()` to distinguish `--flag=""` from not provided - - **Implementation pattern**: - ```go - // GOOD: Human-centric with fallback chain - tokenFlag, _ := cmd.Flags().GetString("token") - tokenWasSet := cmd.Flags().Changed("token") - - result, err := interaction.GetRequiredString(rc, tokenFlag, tokenWasSet, &interaction.RequiredFlagConfig{ - FlagName: "token", - EnvVarName: "VAULT_TOKEN", - PromptMessage: "Enter Vault root token: ", - HelpText: "Required for cluster operations. Get via: vault token create", - IsSecret: true, - }) - if err != nil { - return fmt.Errorf("failed to get vault token: %w", err) - } - - log.Info("Using Vault token", zap.String("source", string(result.Source))) - ``` - - **Reference implementation**: [cmd/update/vault_cluster.go:287-334](cmd/update/vault_cluster.go#L287-L334) (`getAuthenticatedVaultClient` helper demonstrates pattern) - - **Migration**: Required for NEW code starting 2025-01-28. Existing ad-hoc patterns grandfathered but encouraged to migrate. - - -## Quick Decision Trees - -``` -New Service Deployment? -├─ System service (fail2ban, osquery) → Docker Compose in /opt/[service] -├─ Web application (Umami, Grafana) → Docker Compose in /opt/[service] -└─ Infrastructure (Vault, Consul) → Check existing patterns in pkg/ - -Need User Input (P0 - Human-Centric)? -├─ Flag explicitly provided (cmd.Flags().Changed) → Use it -├─ Env var set (and configured in RequiredFlagConfig) → Use it, log source -├─ Flag required & missing & TTY available → Use interaction.GetRequiredString(...) 
-│ ├─ Show help text (WHY needed, HOW to get) -│ ├─ Prompt with validation (IsSecret: true for passwords) -│ ├─ Retry on validation failure (max 3 attempts) -│ └─ Log which source provided value (observability) -├─ Flag required & missing & non-interactive → Error with remediation steps -└─ Flag optional → Use default value, don't prompt - -Error Type? -├─ User fixable → eos_err.NewUserError() → exit(0) -├─ Config/validation → Fail fast, don't retry -└─ System failure → eos_err.NewSystemError() → exit(1) - -Retry Decision? -├─ Transient (network, timeout, lock) → Retry with backoff -└─ Deterministic (config, validation, missing file) → Fail fast - -Secrets Management? -├─ Environment discovered → Use appropriate backend (Vault/file) -├─ Password/token → Use secrets.SecretManager.GetOrGenerateServiceSecrets() -└─ Never hardcode → Store via secretManager.StoreSecret() - -Command Structure? -└─ VERB-FIRST with FLAG-BASED operations: - - Format: eos [verb] [noun] --[operation] [target] [--flags...] - - Examples: - eos update hecate --add bionicgpt --dns X --upstream Y - eos update kvm --add --guest-agent --name vm1 - eos update vault --fix --dry-run - eos update wazuh --add authentik --wazuh-url X - - EXCEPTION: Standard CRUD verbs (start, stop, restart) use positional args: - eos update services start nginx # OK - 'start' is a verb, not an operation - eos update services stop apache2 # OK - 'stop' is a verb, not an operation - - DEPRECATED (remove in v2.0): - eos update [noun] [operation] [target] # Old subcommand pattern - eos update hecate add bionicgpt # Legacy syntax, use --add instead - -Configuration Drift Correction (P0 - NEW PATTERN)? -├─ Service has drifted from canonical state (permissions, config values)? -│ ├─ Use: eos update --fix -│ ├─ Compares: Current state vs. 
'eos create ' canonical state -│ ├─ Corrects: Permissions, ownership, config values, duplicate binaries -│ ├─ Verifies: Post-fix state matches canonical -│ └─ Example: eos update vault --fix -│ -├─ Want to check drift without fixing? -│ ├─ Use: eos update --fix --dry-run -│ ├─ Pattern: --dry-run works consistently across all update operations -│ ├─ Example: eos update consul --fix --dry-run -│ └─ Example: eos update vault --ports X->Y --dry-run -│ -├─ CI/CD pipeline verification? -│ └─ Use: eos update --fix --dry-run && check exit code -│ -└─ Old 'eos fix' commands? - ├─ DEPRECATED: Use 'eos update --fix' instead - ├─ eos fix vault → eos update vault --fix - ├─ eos fix consul → eos update consul --fix - ├─ eos fix mattermost → eos update mattermost --fix - └─ Will be removed in Eos v2.0.0 (approximately 6 months) - -Adding a Constant? -├─ Vault-related path/URL → pkg/vault/constants.go ONLY -├─ Consul-related path/URL → pkg/consul/constants.go ONLY -├─ Port number → pkg/shared/ports.go ONLY -├─ Service-specific config → pkg/[service]/constants.go -├─ Found duplicate constant → DELETE all but ONE, update all references -└─ Circular import prevents use → Document duplication reason in comment - -Writing New Command? -├─ In cmd/[verb]/*.go (ORCHESTRATION ONLY): -│ ├─ Define cobra.Command with flags -│ ├─ Parse flags into config struct -│ ├─ Call pkg/[feature]/Function(rc, config) -│ └─ Return result (no business logic!) -│ └─ RULE: If cmd/ file >100 lines, move logic to pkg/ -│ -└─ In pkg/[feature]/*.go (ALL BUSINESS LOGIC): - ├─ ASSESS: Check current state - ├─ INTERVENE: Apply changes if needed - ├─ EVALUATE: Verify and report results - └─ Use RuntimeContext, structured logging - -Secret/Config Delivery? 
-├─ Secrets (passwords, API keys, tokens): -│ ├─ Store: Vault via secrets.SecretManager -│ ├─ Deliver: Vault Agent template rendering -│ └─ Rotate: Automatic via Vault Agent watch -│ -├─ Non-secret config (ports, URLs, feature flags): -│ ├─ Store: Consul KV at service/[name]/config/ -│ ├─ Deliver: Consul Template or direct read -│ └─ Update: Dynamic via Consul KV updates -│ -└─ Mixed (secrets + config): - └─ Use Consul Template with both Vault and Consul backends - -Docker Operations (P1 - CRITICAL)? -├─ Container operations (start, stop, inspect, logs): -│ └─ ALWAYS use Docker SDK (github.com/docker/docker/client) -│ └─ Example: pkg/container/docker.go, pkg/docker/compose_precipitate.go -│ -├─ Docker Compose validation: -│ ├─ ALWAYS use: docker.ValidateComposeWithShellFallback(ctx, composeFile, envFile) -│ ├─ Strategy: SDK first (35μs), shell fallback if SDK fails -│ │ 1. SDK validation (pkg/docker/compose_validate.go:ValidateComposeFile) -│ │ - YAML parsing + variable substitution + image validation -│ │ - No docker CLI dependency, works in CI -│ │ 2. Shell fallback ('docker compose config') -│ │ - Handles edge cases, authoritative validation -│ ├─ Example: pkg/hecate/validation_files.go:49 ✓ -│ └─ Tests: pkg/docker/compose_validate_test.go (12 tests, all passing) -│ -├─ User-facing operations (docker compose up -d): -│ ├─ Shell acceptable (user needs to see output) -│ └─ BUT validate with SDK FIRST -│ -└─ Template rendering: - └─ Use pkg/templates/render.go (unified, security-hardened) - └─ NO ad-hoc template.New() scattered in packages - -Flag Validation (P0 - CRITICAL)? -└─ Command accepts positional args (cobra.ExactArgs, cobra.MaximumNArgs)? 
- ├─ ALWAYS add at start of RunE: if err := verify.ValidateNoFlagLikeArgs(args); err != nil { return err } - ├─ Prevents '--' separator bypass (e.g., 'eos delete env prod -- --force') - ├─ Required for ALL commands with positional arguments - └─ See: pkg/verify/validators.go:ValidateNoFlagLikeArgs() - -Dependency Not Found (P0 - CRITICAL - Human-Centric)? -├─ NEVER error out immediately when dependency missing -├─ ALWAYS offer informed consent to install: -│ ├─ Explain what the dependency is and why it's needed -│ ├─ Show installation command(s) clearly -│ ├─ Ask y/N (default No for safety) -│ └─ If yes: attempt auto-install OR guide user through manual install -│ -└─ Pattern (use pkg/interaction/dependency.go): - ├─ Check: interaction.CheckDependencyWithPrompt(rc, interaction.DependencyConfig{...}) - ├─ Provides: Clear explanation, install commands, consent prompt - ├─ Handles: Auto-install (if safe) or graceful exit with instructions - └─ Example: Ollama, Docker, system packages - -Debug Command Output (cmd/debug/*.go)? -├─ Diagnostic checks (health, config validation) → logger.Info/Warn/Error (structured) -├─ Progress indicators → logger.Debug (or logger.Info if user-visible) -├─ Issue detection → logger.Warn/Error with structured fields -├─ Interim results → logger.Info with zap fields -└─ Final report rendering → fmt.Print(report.String()) ONLY - └─ Rationale: Terminal-focused formatting AFTER all telemetry captured -``` - -## Secret and Configuration Management (P0 - CRITICAL) - -**Philosophy**: Secrets belong in Vault, configuration belongs in Consul, delivery is automated. 
- -### Storage Layer - -#### Secrets (Vault) -**What belongs in Vault:** -- Passwords (database, service accounts) -- API keys (third-party services, internal APIs) -- Tokens (JWT secrets, session keys, ACL tokens) -- TLS certificates and private keys -- Encryption keys - -**Storage pattern:** -```go -// At service installation time -secretManager, err := secrets.NewSecretManager(rc, envConfig) -requiredSecrets := map[string]secrets.SecretType{ - "db_password": secrets.SecretTypePassword, - "api_key": secrets.SecretTypeAPIKey, - "jwt_secret": secrets.SecretTypeToken, -} -serviceSecrets, err := secretManager.GetOrGenerateServiceSecrets("myservice", requiredSecrets) - -// Secrets stored at: secret/myservice/{db_password,api_key,jwt_secret} -``` - -**Path convention**: `secret/[service-name]/[secret-key]` - -#### Configuration (Consul KV) -**What belongs in Consul KV:** -- Feature flags (enable_rag, enable_audit_log) -- Service endpoints (http://service:port) -- Port numbers -- Timeouts and retry limits -- Log levels -- Non-sensitive connection strings - -**Storage pattern:** -```go -// Write config to Consul KV -consul.KV().Put(&api.KVPair{ - Key: "service/myservice/config/port", - Value: []byte("8080"), -}, nil) - -consul.KV().Put(&api.KVPair{ - Key: "service/myservice/config/feature_flags/enable_rag", - Value: []byte("true"), -}, nil) -``` - -**Path convention**: `service/[service-name]/config/[category]/[key]` - -### Delivery Layer - -#### Option 1: Vault Agent Template (Secrets Only) - -**When to use:** -- Service only needs secrets from Vault -- No dynamic configuration from Consul -- Simple .env file or config file generation -- Examples: PostgreSQL passwords, API keys - -**How it works:** -1. Vault Agent runs as systemd service (`vault-agent-eos.service`) -2. Agent authenticates via AppRole -3. Renders template files with secrets from Vault -4. 
Watches Vault for changes, re-renders on rotation - -**Implementation:** -```hcl -# /etc/vault.d/templates/myservice.env.ctmpl -DATABASE_PASSWORD={{ with secret "secret/myservice/db_password" }}{{ .Data.data.value }}{{ end }} -API_KEY={{ with secret "secret/myservice/api_key" }}{{ .Data.data.value }}{{ end }} -JWT_SECRET={{ with secret "secret/myservice/jwt_secret" }}{{ .Data.data.value }}{{ end }} -``` - -```hcl -# Add to vault agent config -template { - source = "/etc/vault.d/templates/myservice.env.ctmpl" - destination = "/opt/myservice/.env" - perms = "0640" - command = "docker compose -f /opt/myservice/docker-compose.yml up -d --force-recreate" -} -``` - -**Pros:** -- Already integrated in Eos (vault-agent-eos.service exists) -- Automatic secret rotation -- Secure: secrets never written to disk except in final config -- Simple for secrets-only scenarios - -**Cons:** -- Cannot access Consul KV -- Vault Agent must be running -- Limited to Vault data sources - -#### Option 2: Consul Template (Secrets + Config) - -**When to use:** -- Service needs both Vault secrets AND Consul configuration -- Dynamic configuration changes without redeployment -- Service discovery via Consul -- Examples: Multi-tenant apps, microservices with dynamic config - -**How it works:** -1. Consul Template runs as systemd service or Docker sidecar -2. Connects to both Consul and Vault -3. Renders templates combining both data sources -4. 
Watches both for changes, re-renders on updates - -**Implementation:** -```hcl -# /etc/consul-template.d/myservice.env.ctmpl -# From Consul KV -PORT={{ key "service/myservice/config/port" }} -ENABLE_RAG={{ key "service/myservice/config/feature_flags/enable_rag" }} -LOG_LEVEL={{ key "service/myservice/config/log_level" }} - -# From Vault -DATABASE_PASSWORD={{ with secret "secret/myservice/db_password" }}{{ .Data.data.value }}{{ end }} -API_KEY={{ with secret "secret/myservice/api_key" }}{{ .Data.data.value }}{{ end }} - -# Service discovery via Consul -{{ range service "database" }} -DATABASE_URL=postgresql://user:password@{{ .Address }}:{{ .Port }}/mydb -{{ end }} -``` - -```hcl -# /etc/consul-template.d/myservice.hcl -consul { - address = "localhost:8500" -} - -vault { - address = "https://localhost:8200" - token = "{{ file "/run/eos/vault_agent_eos.token" }}" # Reuse Vault Agent token - unwrap_token = false - renew_token = true -} - -template { - source = "/etc/consul-template.d/myservice.env.ctmpl" - destination = "/opt/myservice/.env" - perms = "0640" - command = "docker compose -f /opt/myservice/docker-compose.yml up -d --force-recreate" - wait { - min = "2s" - max = "10s" - } -} -``` - -**Pros:** -- Access to both Vault AND Consul -- Service discovery built-in -- Dynamic config updates -- Can template ANY file format (env, JSON, YAML, HCL) - -**Cons:** -- Additional service to manage -- More complex than Vault Agent alone -- Requires both Consul and Vault to be healthy - -#### Option 3: Custom Entrypoint (Simple/Legacy) - -**When to use:** -- Quick prototyping -- Legacy services not yet migrated -- Temporary deployments -- Services that don't support file-based config - -**Implementation:** ```bash -#!/bin/bash -# /opt/myservice/entrypoint.sh - -# Fetch secrets from Vault using agent token -export DATABASE_PASSWORD=$(VAULT_TOKEN=$(cat /run/eos/vault_agent_eos.token) vault kv get -field=value secret/myservice/db_password) -export API_KEY=$(VAULT_TOKEN=$(cat 
/run/eos/vault_agent_eos.token) vault kv get -field=value secret/myservice/api_key) - -# Fetch config from Consul -export PORT=$(consul kv get service/myservice/config/port) -export ENABLE_RAG=$(consul kv get service/myservice/config/feature_flags/enable_rag) - -# Start main process -exec /app/myservice +go build -o /tmp/eos-build ./cmd/ # ALWAYS run before committing — zero compile-error tolerance +go test ./pkg/... # Unit tests +go test -race ./pkg/... # Race-detection (CI-blocking) +go test -tags=integration ./... # Integration tests +golangci-lint run # Lint +make all # Lint + test + build ``` -**Pros:** -- Simple, no additional daemons -- Works with any service -- Easy to debug - -**Cons:** -- No automatic rotation -- Secrets in environment variables (less secure) -- No watch/reload on changes -- Must restart container for updates - -### Decision Matrix - -| Scenario | Storage | Delivery | Example | -|----------|---------|----------|---------| -| **Secrets only, static** | Vault | Vault Agent Template | Database passwords, TLS certs | -| **Secrets + static config** | Vault + .env file | Vault Agent + static file | Simple web apps | -| **Secrets + dynamic config** | Vault + Consul KV | Consul Template | Multi-tenant SaaS, microservices | -| **Service discovery needed** | Vault + Consul | Consul Template | Distributed systems | -| **Quick prototype** | Vault | Custom entrypoint | Development, testing | - -### Eos Standard Pattern (Recommended) - -For new services in Eos, use **Consul Template** as the standard: - -**Rationale:** -1. **Unified approach**: One tool for all use cases -2. **Future-proof**: Supports adding Consul config later without refactoring -3. **Service discovery**: Built-in support for Consul catalog -4. **Consistent**: All services use same pattern -5. 
**Observable**: Consul Template has built-in monitoring - -**Implementation checklist:** -- [ ] Store secrets in Vault via `secrets.SecretManager.GetOrGenerateServiceSecrets()` -- [ ] Store non-secret config in Consul KV at `service/[name]/config/` -- [ ] Create template file at `/etc/consul-template.d/[service].env.ctmpl` -- [ ] Create Consul Template config at `/etc/consul-template.d/[service].hcl` -- [ ] Add systemd service `consul-template-[service].service` OR Docker sidecar -- [ ] Template renders to `/opt/[service]/.env` with perms 0640 -- [ ] Command triggers service restart on template change - -### Migration Path - -**Existing services using static .env:** -1. Phase 1: Continue using `secretManager.GetOrGenerateServiceSecrets()` (no change) -2. Phase 2: Create Consul Template to render .env from Vault (secrets.SecretManager still stores) -3. Phase 3: Move non-secret config to Consul KV -4. Phase 4: Remove static .env generation from Eos install code - -**Example: BionicGPT Migration** -``` -Current: Eos writes .env file at install time with secrets from Vault -Target: Consul Template renders .env from Vault (secrets) + Consul KV (config) - -Steps: -1. Create /etc/consul-template.d/bionicgpt.env.ctmpl -2. Create /etc/consul-template.d/bionicgpt.hcl -3. Create consul-template-bionicgpt.service -4. Move feature flags to Consul KV (ENABLE_RAG, ENABLE_AUDIT_LOG) -5. Keep secrets in Vault (POSTGRES_PASSWORD, JWT_SECRET, LITELLM_MASTER_KEY) -6. Remove static .env generation from pkg/bionicgpt/install.go -``` - -### Reference Implementation - -See existing patterns: -- Vault Agent: [pkg/vault/phase13_write_agent_config.go](pkg/vault/phase13_write_agent_config.go) -- Vault Agent template: [pkg/shared/vault_agent.go](pkg/shared/vault_agent.go) -- Secret storage: [pkg/secrets/manager.go](pkg/secrets/manager.go) - -## Architecture Enforcement: cmd/ vs pkg/ - -**The Iron Rule**: `cmd/` = Cobra orchestration ONLY. `pkg/` = ALL business logic. 
- -### Good Example: cmd/fix/consul.go (✓ ~60 lines) -```go -// cmd/fix/consul.go - PURE ORCHESTRATION -func runConsulFix(rc *eos_io.RuntimeContext, cmd *cobra.Command, args []string) error { - // Parse flags - dryRun, _ := cmd.Flags().GetBool("dry-run") - permissionsOnly, _ := cmd.Flags().GetBool("permissions-only") - - // Create config - config := &fix.Config{ - DryRun: dryRun, - PermissionsOnly: permissionsOnly, - } - - // Delegate to pkg/ - ALL business logic lives there - return fix.RunFixes(rc, config) -} -``` - -### Bad Example: Business Logic in cmd/ (✗ 400+ lines) -```go -// cmd/fix/something.go - VIOLATES ARCHITECTURE -func runSomethingFix(rc *eos_io.RuntimeContext, cmd *cobra.Command, args []string) error { - // ✗ WRONG: File operations in cmd/ - info, err := os.Stat("/etc/something/config.hcl") - if err != nil { ... } - - // ✗ WRONG: Permission fixing in cmd/ - if err := os.Chmod(path, 0640); err != nil { ... } - - // ✗ WRONG: Loops and complex logic in cmd/ - for _, path := range paths { - // Business logic here... - } - - // This should ALL be in pkg/something/fix/fix.go! 
-} -``` - -### Correct Pattern: Move to pkg/ -```go -// cmd/fix/something.go (~60 lines) -func runSomethingFix(rc *eos_io.RuntimeContext, cmd *cobra.Command, args []string) error { - config := parseFlags(cmd) - return somethingfix.RunFixes(rc, config) // ✓ Delegate to pkg/ -} - -// pkg/something/fix/fix.go -func RunFixes(rc *eos_io.RuntimeContext, config *Config) error { - // ✓ ASSESS - issues := assessPermissions(rc) - - // ✓ INTERVENE - if !config.DryRun { - results := fixPermissions(rc, issues) - } - - // ✓ EVALUATE - displayResults(rc, results) - return nil -} -``` - -### Enforcement Heuristics -- **cmd/ file >100 lines?** → Move business logic to pkg/ -- **File operations (os.Stat, os.Chmod)?** → Must be in pkg/ -- **Loops over data structures?** → Must be in pkg/ -- **Complex conditionals?** → Must be in pkg/ -- **Only cobra, flag parsing, delegation?** → OK in cmd/ - -## Project Constraints - -### MUST: -- Use structured logging via `otelzap.Ctx(rc.Ctx)` -- Follow Assess → Intervene → Evaluate pattern -- Keep business logic in `pkg/`, orchestration in `cmd/` -- Verify all operations with explicit checks -- Use environment discovery pattern (`environment.DiscoverEnvironment()`) -- Initialize secret manager for any credential operations -- Add `*Last Updated: YYYY-MM-DD*` to all .md files -- Capture command output in errors for context -- Detect error type before retrying (fail fast on config errors) -- Include remediation steps in error messages -- Use Docker Compose for containerized services in `/opt/[service]` -- Store service-specific configs in appropriate directories -- Sanitize user-provided URLs with `shared.SanitizeURL()` before validation - -### MUST NOT: -- Use `fmt.Print/Printf/Println` for output (but `fmt.Errorf` is OK) -- Put business logic in `cmd/` files -- Skip verification steps -- Create tactical documentation files (.md) -- Hardcode values - use flags or prompts -- Hardcode credentials - use SecretManager -- Retry deterministic errors 
(config validation, missing files) -- Return generic errors without context (e.g., "command failed") -- Execute operations without logging diagnostics -- Assist with offensive security or malicious code - -## Quick Command Reference - -| Pattern | Example | Location | -|---------|---------|----------| -| Command file | `cmd/create/umami.go` | Orchestration only | -| Package helper | `pkg/crypto/generate.go` | Business logic | -| Error wrapping | `fmt.Errorf("failed to X: %w", err)` | All errors | -| User prompt | `logger.Info("terminal prompt: X")` | Before input | -| Secret storage | `secretManager.GetOrGenerateServiceSecrets()` | Credentials | -| Testing | See `PATTERNS.md#testing` | All packages | - -## Architecture Overview - -### Package Structure -``` -cmd/[verb]/ # Orchestration only - ├── create/ # Service creation commands - ├── read/ # Status/inspection commands - ├── update/ # Modification commands - ├── delete/ # Removal commands - ├── list/ # Listing commands - ├── backup/ # Backup operations - ├── self/ # Eos self-management - ├── build/ # Build operations - ├── deploy/ # Deployment commands - ├── promote/ # Promotion workflows - └── env/ # Environment management - -pkg/[feature]/ # Business logic - ├── types.go # Types, constants - ├── install.go # Installation logic - ├── configure.go # Configuration logic - └── verify.go # Verification logic - -Key packages: - ├── eos_io/ # RuntimeContext, I/O utilities - ├── eos_err/ # Error handling (UserError, SystemError) - ├── secrets/ # Secret management abstraction - ├── environment/ # Environment discovery - ├── execute/ # Command execution utilities - ├── crypto/ # Cryptographic utilities - ├── container/ # Container operations - └── shared/ # Shared utilities including validation (SanitizeURL, ValidateURL) -``` - -### Command Flow -1. User input → 2. Cobra routing → 3. RuntimeContext → 4. Orchestration (`cmd/`) -→ 5. Environment discovery → 6. Secret initialization → 7. Business logic (`pkg/`) -→ 8. 
Assess/Intervene/Evaluate → 9. Error handling - -### Service Deployment Pattern -Most services follow this pattern (see `cmd/create/umami.go` as reference): -1. Discover environment (`environment.DiscoverEnvironment()`) -2. Initialize secret manager (`secrets.NewSecretManager()`) -3. Create installation directory in `/opt/[service]` -4. Copy Docker Compose file from `assets/` -5. Generate/retrieve secrets via SecretManager -6. Template configuration files -7. Deploy with `docker compose up -d` -8. Verify deployment -9. Provide user instructions - -## Testing Requirements - -Before marking complete: -```bash -go build -o /tmp/eos-build ./cmd/ # Must compile -golangci-lint run # Must pass linting -go test -v ./pkg/... # Must pass tests -``` - -## Shift-Left Strategy: Automated Pre-Commit Validation - -**Philosophy**: Catch errors at development time, not at user installation time. - -### Problem Statement - -Compile-time errors that reach the main branch violate P0 Rule #10 and create poor user experience: -- Users see cryptic build failures during `install.sh` -- CI/CD pipelines fail after merge -- Development velocity slows due to fixing broken builds -- Trust in codebase quality erodes - -### Solution: Three-Layer Defense - -#### Layer 1: AI Assistant Pre-Commit Check (P0 - MANDATORY) - -**RULE**: Before completing ANY task, AI assistants MUST run ALL validation checks: - -```bash -# 1. Build verification (P0 - CRITICAL) -go build -o /tmp/eos-build ./cmd/ - -# 2. Linting (P0 - REQUIRED by CLAUDE.md:726) -golangci-lint run - -# 3. Tests (P1 - RECOMMENDED) -go test -v ./pkg/... -``` - -**Enforcement**: -- If build fails → fix ALL errors before responding to user. Zero tolerance. -- If golangci-lint fails → fix linter issues before responding to user. -- If tests fail → fix tests or acknowledge failures in commit message. - -**Rationale**: These checks are documented in "Testing Requirements" (CLAUDE.md:721-728) and enforced by pre-commit hook (Layer 2). 
AI must verify locally before marking task complete to prevent broken commits reaching the repository. - -**This is P0 Rule #10**: Never mark a task complete without verifying the build AND linter pass. - -#### Layer 2: Git Pre-Commit Hook (AUTOMATED) - -Installed automatically in `.git/hooks/pre-commit`, this hook runs before every commit and enforces: - -1. **Build validation**: `go build -o /tmp/eos-build ./cmd/` (full build - P0) -2. **Static analysis**: `go vet` on staged files only (performance optimized) -3. **Format checking**: `gofmt` on staged files only (performance optimized) -4. **Comprehensive linting**: `golangci-lint run` on staged files (P0 - REQUIRED) -5. **Secret scanning**: `gitleaks protect --staged` (security critical - if installed) -6. **Package tests**: `go test -short` on affected packages (non-blocking) - -**Performance**: Hook runs ONLY on staged files (100-300x faster than full codebase scan) -- 1 file changed: ~2 seconds (vs. ~120 seconds) -- 5 files changed: ~5 seconds (vs. 
~120 seconds) -- Result: Developers won't bypass with `--no-verify` - -**Installation**: -```bash -# Automatic (hook already present in repo) -git clone && cd eos -# Hook is active immediately - -# Manual (if hook gets removed) -./scripts/install-git-hooks.sh -``` - -**Bypass** (NOT RECOMMENDED): -```bash -git commit --no-verify # Only use if you know what you're doing -``` - -#### Layer 3: CI/CD Pipeline (ACTIVE ✅) - -**Status**: FULLY OPERATIONAL - -Production-ready GitHub Actions workflows running on every PR and push to main: - -**Quality Workflows**: -- `.github/workflows/comprehensive-quality.yml` - golangci-lint, staticcheck, security scans - - Triggers: PRs to main/develop, pushes to main, daily at 2:00 AM UTC - - Runs: gofmt, go vet, staticcheck, golangci-lint, gosec, trivy, TODO/FIXME checks -- `.github/workflows/lint.yml` - formatting, go vet, ineffassign - - Triggers: Pushes to .go files, PRs - - Runs: golangci-lint, gofmt, go vet, ineffassign - -**Testing Workflows**: -- `.github/workflows/comprehensive-testing.yml` - full test suite with race detection - - Triggers: PRs, pushes to main - - Runs: quick tests, full tests, integration tests, race condition checks -- `.github/workflows/coverage-enforcement.yml` - enforces 70% coverage threshold - - Triggers: PRs, pushes to main - - Runs: coverage analysis, generates reports, enforces thresholds -- `.github/workflows/test.yml` - quick unit tests - - Triggers: PRs, pushes - - Runs: fast unit test suite - -**Security Workflows**: -- `.github/workflows/security.yml` - gosec security scanner - - Triggers: PRs, pushes, daily scans - - Runs: gosec with SARIF output uploaded to GitHub Security -- `.github/workflows/codeql.yml` - CodeQL advanced security analysis - - Triggers: PRs, pushes, weekly scans - - Runs: GitHub's semantic code analysis engine - -**Additional Workflows**: -- `.github/workflows/fuzz.yml` - fuzz testing for critical components -- `.github/workflows/quality-gates.yml` - enforces code quality 
standards - -**Verification**: -```bash -# View recent workflow runs -ls -la .github/workflows/ - -# Check specific workflow status -cat .github/workflows/comprehensive-quality.yml | grep "on:" -``` - -**Coverage**: All checks run automatically on every PR and push, providing final safety net before code reaches production. - -**Note**: Requires `.golangci.yml` configuration file (now present in repository root). - -### Developer Workflow - -``` -Write code - ↓ -[Layer 1] AI runs go build before completion - ↓ -git add . - ↓ -git commit ← [Layer 2] Pre-commit hook validates - ↓ -git push - ↓ -[Layer 3] CI/CD validates (active) - ↓ -Merge to main -``` - -### Error Prevention Examples - -**Example 1: Unused Import** -```go -// BEFORE: AI completes task without building -import ( - "bytes" // ← Unused, will break build - "fmt" -) -// AI marks task complete ✗ WRONG - -// AFTER: AI runs go build before completion -// Build fails with "bytes imported and not used" -// AI fixes error, verifies build, then marks complete ✓ CORRECT -``` - -**Example 2: Function Signature Mismatch** -```go -// BEFORE: AI writes code without verifying -proceed, err := interaction.PromptYesNo(...) 
// ← Wrong signature -// AI marks task complete ✗ WRONG - -// AFTER: AI runs go build before completion -// Build fails with "assignment mismatch: 2 variables but PromptYesNo returns 1 value" -// AI uses PromptYesNoSafe instead, verifies build ✓ CORRECT -``` - -### Enforcement Checklist - -- [ ] AI assistants verify build before task completion (P0 Rule #10) -- [ ] Pre-commit hook installed in `.git/hooks/pre-commit` -- [ ] Hook is executable: `chmod +x .git/hooks/pre-commit` -- [ ] Developers understand bypass is discouraged: `--no-verify` -- [ ] CI/CD pipeline active and validating every PR (see Layer 3) - -### Related Documentation - -- **P0 Rule #10**: Pre-commit validation (line 48) -- **Pre-Completion Review Checklist**: Architecture compliance (line 764) -- **Testing Requirements**: Manual validation commands (line 721) - -## AI Assistant Guidelines - -### Efficiency Tips -- Batch related file reads in single response -- Use Task tool for open-ended searches -- Use TodoWrite for multi-step operations -- Search before asking for clarification -- Check existing patterns in codebase first - -### Adversarial Collaboration -When asked to review ("come talk to me as an adversarial collaborator"): -1. **What's Good**: Acknowledge working patterns and solid foundations -2. **What's Not Great**: Identify inefficiencies and code smells -3. **What's Broken**: Call out bugs, security issues, broken patterns -4. **What We're Not Thinking About**: Surface blindspots and missing considerations - -Then systematically fix all P0, P1, P2, P3 issues found. - -### Code Patterns -For detailed examples see `PATTERNS.md`: -- Logging patterns → `PATTERNS.md#logging` -- Error handling → `PATTERNS.md#errors` -- Assess/Intervene/Evaluate → `PATTERNS.md#aie-pattern` -- Interactive prompting → `PATTERNS.md#prompting` -- Helper structure → `PATTERNS.md#helpers` - -### Context Continuity -When looking for context: -1. First check our previous conversations to see if we've discussed this topic -2. 
Pick up from where we most recently left off -3. Don't rehash old ground unless explicitly asked - -Work as a partner in an adversarially collaborative process, following the user's lead and providing fact-based targeted criticism. +## Governing contracts -### Pre-Completion Review Checklist +**IMPORTANT:** Read the relevant contract before starting any work. -Before completing any task, verify: +Compact ruleset (60 rules, always loaded): @prompts/GOVERNANCE-SUMMARY.md +Anti-patterns catalogue (always loaded): @prompts/ANTI-PATTERNS.md -**Architecture Compliance (P0)**: -- [ ] All business logic is in `pkg/` -- [ ] `cmd/` files only contain orchestration -- [ ] `cmd/` files are <100 lines (if not, refactor to pkg/) -- [ ] No file operations (os.Stat, os.Chmod) in cmd/ -- [ ] No loops or complex conditionals in cmd/ +All governance contracts vendored from `cybermonkey/prompts` at `prompts/`: -**Pattern Compliance (P0)**: -- [ ] pkg/ functions follow Assess → Intervene → Evaluate -- [ ] All operations use RuntimeContext -- [ ] All logging uses otelzap.Ctx(rc.Ctx) -- [ ] Secrets use SecretManager -- [ ] Errors include context and remediation -- [ ] Required flags use interaction.GetRequiredString() with fallback chain (P0 #13) +| Contract | File | Governs | +|----------|------|---------| +| Session workflow | @prompts/SOAPIER.md | 14-step SOAPIER process | +| Documentation | @prompts/DOCUMENTATION.md | Diataxis, frontmatter, naming | +| Testing | @prompts/TESTING.md | 70/20/10, coverage, evidence | +| Testing (Go) | @prompts/TESTING-GO.md | Race detection, testify, table-driven tests | +| Workflow | @prompts/WORKFLOW.md | CI, PRs, branch lifecycle | +| Git Rules | @prompts/GIT-RULES.md | Signing, linear history | +| Security | @prompts/SECURITY.md | Secrets, OWASP, SLSA | +| Coordination | @prompts/COORDINATION.md | Multi-agent isolation | -**Testing (P0)**: -- [ ] `go build -o /tmp/eos-build ./cmd/` compiles -- [ ] `go vet ./pkg/...` passes -- [ ] `go vet ./cmd/...` 
passes -- [ ] `gofmt -l` returns nothing +Submodule update runbook: @prompts/docs/runbooks/RUNBOOK-update-submodule.md -## Common Anti-Patterns +## Eos-specific patterns -| Never Do This | Always Do This | -|--------------|----------------| -| `fmt.Println("text")` | `logger.Info("text")` | -| Business logic in `cmd/` | Delegate to `pkg/` (see Architecture Enforcement) | -| `os.Stat()` in `cmd/*.go` | Move to `pkg/*/assess.go` | -| File operations in `runCommand()` | Create `pkg/*/fix.go` with business logic | -| `exec.Command().Run()` | Check with `exec.LookPath()` first | -| Skip verification | Explicit verification checks | -| Create tactical .md files | Use inline `// TODO:` comments | -| Retry all errors blindly | Detect error type, fail fast on config errors | -| `return fmt.Errorf("failed")` | Include output, context, remediation | -| Silent operations | Log before/during/after with context | -| Hardcode credentials | Use `secrets.SecretManager` | -| Skip environment discovery | Call `environment.DiscoverEnvironment()` | -| `strings.TrimSpace(url)` only | Use `shared.SanitizeURL(url)` | -| Error when dependency missing | Offer informed consent to install (see Dependency Not Found) | -| Silent dependency checks | Use `interaction.CheckDependencyWithPrompt()` | -| `eos fix vault` | `eos update vault --fix` (fix is deprecated) | -| `eos fix consul` | `eos update consul --fix` | -| `eos fix mattermost` | `eos update mattermost --fix` | -| `eos update hecate add bionicgpt` | `eos update hecate --add bionicgpt` (subcommand is deprecated) | -| `eos update wazuh add authentik` | `eos update wazuh --add authentik` (subcommand is deprecated) | -| Subcommand for operations | Flag for operations (e.g., --add, --enable, --fix) | -| `cmd.AddCommand(addCmd)` for operations | `cmd.Flags().Bool("add")` for operations | -| `if flag == "" { return error }` | Use `interaction.GetRequiredString()` (P0 - human-centric) | -| Ad-hoc flag prompting in cmd/ | Use unified 
`pkg/interaction/required_flag.go` pattern | -| Prompt without help text | Always include HelpText (WHY needed, HOW to get) | -| Silent env var fallback | Always log which source provided value (observability) | -| Can't detect `--flag=""` vs missing | Use `cmd.Flags().Changed()` to distinguish | -| Create multiple HTTP clients for same service | Consolidate into ONE unified client (see HTTP Client Consolidation Rule) | -| Add new endpoint as separate client | Add method to existing client instead | -| Duplicate TLS config, timeouts, retry logic | Share infrastructure via unified client | +Domain rules are in `.claude/rules/` with path scoping — they load automatically when you touch those files: -## Priority Levels +| Rule file | Loads when touching | Key patterns | +|-----------|-------------------|--------------| +| `go-patterns.md` | `**/*.go` | Architecture, constants, logging, idempotency, retry | +| `cli-patterns.md` | `cmd/**/*.go` | cmd/ vs pkg/ enforcement, flag validation, human-centric input | +| `secrets-vault.md` | `pkg/vault/**`, `pkg/consul/**` | Vault/Consul storage, Vault Agent, token auth | +| `debugging.md` | `cmd/debug/**` | Diagnostic logging, evidence collection, report rendering | -- **P0 (BREAKING)**: Violations cause immediate failure -- **P1 (CRITICAL)**: Must fix before marking complete -- **P2 (IMPORTANT)**: Should follow unless justified -- **P3 (RECOMMENDED)**: Best practices - -## Idempotency Principles - -1. Check before acting - Don't assume state -2. Handle "already done" gracefully - Not an error -3. Focus on end result, not the action -4. 
Use conditional operations - -Example: -```go -// Check if directory exists before creating -if _, err := os.Stat(targetDir); os.IsNotExist(err) { - logger.Info("Creating directory", zap.String("path", targetDir)) - if err := os.MkdirAll(targetDir, 0755); err != nil { - return fmt.Errorf("failed to create directory: %w", err) - } -} else { - logger.Debug("Directory already exists", zap.String("path", targetDir)) -} -``` - -## Retry Logic (P1 - CRITICAL) - -**RULE**: Only retry TRANSIENT failures. Never retry DETERMINISTIC failures. - -### Transient Failures (RETRY) -- Network timeouts -- Temporary resource locks -- Race conditions -- Service not ready yet (starting up) -- Temporary disk full - -### Deterministic Failures (FAIL FAST) -- Configuration validation errors -- Missing required files -- Invalid credentials -- Multiple network interfaces (needs user decision) -- Permission denied -- Command not found - -### Implementation Pattern -```go -// GOOD: Detect error type before retrying -if err := operation(); err != nil { - if isConfigError(err) || isValidationError(err) { - // Don't retry - config won't fix itself - return fmt.Errorf("configuration invalid: %w", err) - } - // Only retry transient errors - return retry.WithBackoff(rc, operation) -} - -// BAD: Blindly retry all errors -return retry.WithBackoff(rc, operation) // Will retry config errors -``` - -### Logging Requirements -```go -// When retrying -logger.Warn("Operation failed, will retry", - zap.Error(err), - zap.String("reason", "network timeout"), // WHY retrying - zap.Int("attempt", attempt)) - -// When failing fast -logger.Error("Operation failed, not retrying", - zap.Error(err), - zap.String("reason", "configuration error"), // WHY not retrying - zap.String("remediation", "fix config and retry")) // What user should do -``` - -## Error Context (P1 - CRITICAL) - -**RULE**: Errors must be actionable. Always include context and remediation. - -### Required Error Information -1. 
**What failed**: Specific operation, not just "command failed" -2. **Why it failed**: Root cause from stdout/stderr -3. **How to fix**: Actionable remediation steps -4. **System state**: Relevant context (interfaces, ports, services) - -### Implementation Pattern -```go -// GOOD: Rich error context -output, err := execute.Run(ctx, opts) -if err != nil { - return fmt.Errorf("failed to validate docker-compose.yml: %s\n"+ - "File location: %s\n"+ - "Fix: Check YAML syntax with 'docker compose config'", - output, composeFile) -} - -// BAD: Generic error -if err != nil { - return fmt.Errorf("command failed: %w", err) -} -``` - -### Capture Command Output -```go -// Always capture output for error context -output, err := execute.Run(rc.Ctx, execute.Options{ - Command: "docker", - Args: []string{"compose", "up", "-d"}, - WorkDir: serviceDir, - Capture: true, // REQUIRED for error context -}) - -if err != nil { - // Include output in error - this is the actual error message - return fmt.Errorf("docker compose failed: %s", output) -} -``` - -### User vs System Errors -```go -// User can fix → exit 0, friendly message -if missingDockerCompose { - return eos_err.NewUserError( - "Docker not found. Please install Docker:\n"+ - " Ubuntu: sudo apt install docker.io docker-compose-v2\n"+ - " Or visit: https://docs.docker.com/engine/install/ubuntu/") -} - -// System failure → exit 1, technical details -if err := os.WriteFile(path, data, 0640); err != nil { - return eos_err.NewSystemError("failed to write %s: %w", path, err) -} -``` - -## Secrets Management (P1 - CRITICAL) - -**RULE**: All credentials go through SecretManager. Never hardcode or prompt without storage. - -### Pattern -```go -// 1. Discover environment -envConfig, err := environment.DiscoverEnvironment(rc) -if err != nil { - return fmt.Errorf("failed to discover environment: %w", err) -} - -// 2. 
Initialize secret manager -secretManager, err := secrets.NewSecretManager(rc, envConfig) -if err != nil { - return fmt.Errorf("failed to initialize secret manager: %w", err) -} - -// 3. Define required secrets -requiredSecrets := map[string]secrets.SecretType{ - "database_password": secrets.SecretTypePassword, - "api_key": secrets.SecretTypeToken, -} - -// 4. Get or generate secrets -serviceSecrets, err := secretManager.GetOrGenerateServiceSecrets("myservice", requiredSecrets) -if err != nil { - return fmt.Errorf("failed to manage secrets: %w", err) -} - -// 5. Use secrets -dbPassword := serviceSecrets.Secrets["database_password"] -logger.Info("Using secret from backend", zap.String("backend", serviceSecrets.Backend)) -``` - -### Secret Types -- `SecretTypePassword`: Auto-generated strong password -- `SecretTypeToken`: Auto-generated token -- `SecretTypeAPIKey`: Auto-generated API key -- Custom generation via `crypto.GeneratePassword(length)` - -## Vault Cluster Authentication (P1 - CRITICAL) - -**RULE**: Vault cluster operations require admin-level tokens. Use hierarchical authentication with clear security boundaries. - -### Architecture Pattern - -Vault cluster operations (Raft, Autopilot, snapshots) use **shell commands** (`vault operator raft ...`) rather than SDK clients. This creates a unique authentication pattern: - -1. **Token-only return**: Functions return `token string`, not `*api.Client` -2. **Environment variable usage**: Token set via `VAULT_TOKEN` env var for shell commands -3. **Validation before use**: Token validated via SDK, then used in shell commands - -### Authentication Hierarchy - -Authentication attempts in order (fail-fast on deterministic errors): - -```go -// 1. 
--token flag (highest priority - explicit user input) -if token, _ := cmd.Flags().GetString("token"); token != "" { - // Validate token has required capabilities - _, err := vault.GetVaultClientWithToken(rc, token) - if err != nil { - return "", fmt.Errorf("invalid token: %w", err) - } - return token, nil -} - -// 2. VAULT_TOKEN environment variable (CI/CD, scripted usage) -if token := os.Getenv("VAULT_TOKEN"); token != "" { - _, err := vault.GetVaultClientWithToken(rc, token) - if err != nil { - return "", fmt.Errorf("invalid token: %w", err) - } - return token, nil -} - -// 3. Admin authentication (Vault Agent → AppRole → Root with consent) -adminClient, err := vault.GetAdminClient(rc) -if err != nil { - return "", fmt.Errorf("admin authentication failed: %w", err) -} -return adminClient.Token(), nil -``` - -### Token Validation Sequence - -Validation order matters - fail fast on infrastructure issues: - -```go -// Check 0: Vault seal status (BEFORE token validation) -sealStatus, err := client.Sys().SealStatus() -if err != nil { - return fmt.Errorf("cannot connect to Vault: %w", err) -} -if sealStatus.Sealed { - return fmt.Errorf("Vault is sealed - unseal first") -} - -// Check 1: Token validity -secret, err := client.Auth().Token().LookupSelf() -if err != nil { - return fmt.Errorf("token invalid or expired: %w", err) -} - -// Check 2: Token TTL (Time To Live) -ttlSeconds := secret.Data["ttl"].(json.Number).Int64() -if ttlSeconds < 60 { - return fmt.Errorf("token expires in %ds - too short for cluster operations", ttlSeconds) -} -if ttlSeconds < 300 { - logger.Warn("Token expires soon", zap.Int64("ttl_seconds", ttlSeconds)) -} - -// Check 3: Required policies -hasAdminPolicy := false -for _, policy := range secret.Data["policies"].([]interface{}) { - if policy == "root" || policy == shared.EosAdminPolicyName { - hasAdminPolicy = true - break - } -} -if !hasAdminPolicy { - return fmt.Errorf("token lacks eos-admin-policy or root") -} - -// Check 4: Specific 
capabilities -capabilities, err := client.Sys().CapabilitiesSelf("sys/storage/raft/configuration") -if err != nil { - return fmt.Errorf("cannot verify capabilities: %w", err) -} -hasCapability := false -for _, cap := range capabilities { - if cap == "root" || cap == "sudo" || cap == "read" { - hasCapability = true - break - } -} -if !hasCapability { - return fmt.Errorf("token lacks required Raft capabilities") -} -``` - -### Token Security - -**CRITICAL**: Tokens are secrets and MUST NOT be logged. - -```go -// GOOD: Never log token values -logger.Info("Using token from --token flag") // ✓ No token value - -// BAD: Logging token exposes secrets -logger.Info("Token", zap.String("value", token)) // ✗ NEVER DO THIS - -// GOOD: Use sanitization helper if you need to reference token -logger.Debug("Token type", zap.String("token", sanitizeTokenForLogging(token))) -// Output: "Token type: hvs.***" -``` - -**Token sanitization helper** (in `pkg/vault/auth_cluster.go`): -```go -// sanitizeTokenForLogging returns safe version for logging -func sanitizeTokenForLogging(token string) string { - if len(token) <= 4 { - return "***" - } - prefix := token[:4] - if prefix == "hvs." 
|| strings.HasPrefix(prefix, "s.") { - return prefix + "***" - } - return "***" -} -``` - -### Error Messages - -Errors must be **actionable** with clear remediation steps: - -```go -// GOOD: Clear remediation -return fmt.Errorf("Vault is sealed - cannot perform cluster operations\n\n"+ - "Unseal Vault first:\n"+ - " vault operator unseal\n"+ - " Or: eos update vault unseal\n\n"+ - "Seal status:\n"+ - " Sealed: %t\n"+ - " Progress: %d/%d keys provided", - sealStatus.Sealed, sealStatus.Progress, sealStatus.T) - -// BAD: Vague error -return fmt.Errorf("operation failed") // ✗ Not actionable -``` - -### Implementation Files - -- **Orchestration**: [cmd/update/vault_cluster.go](cmd/update/vault_cluster.go) - Command handlers, flag parsing -- **Business Logic**: [pkg/vault/auth_cluster.go](pkg/vault/auth_cluster.go) - Authentication, validation -- **Cluster Operations**: [pkg/vault/raft_*.go](pkg/vault/) - Raft, Autopilot, snapshot functions - -### Common Pitfalls - -1. **✗ Returning unused client**: Cluster ops use shell commands, not SDK client - ```go - // BAD: Returns client that's never used - func getAuth(rc *RC, cmd *cobra.Command) (*api.Client, string, error) - - // GOOD: Returns only token - func getAuth(rc *RC, cmd *cobra.Command) (string, error) - ``` - -2. **✗ Validating token before seal check**: Sealed Vault fails token lookup - ```go - // BAD: Token validation fails on sealed Vault with confusing error - secret, err := client.Auth().Token().LookupSelf() // ✗ Fails if sealed - - // GOOD: Check seal status FIRST - sealStatus, err := client.Sys().SealStatus() // ✓ Check seal first - if sealStatus.Sealed { return fmt.Errorf("vault sealed") } - secret, err := client.Auth().Token().LookupSelf() // Then validate token - ``` - -3. 
**✗ Ignoring token TTL**: Long operations fail mid-execution - ```go - // BAD: No TTL check, cluster snapshot may take 10+ minutes - err := vault.TakeRaftSnapshot(rc, token, outputPath) // ✗ May fail if token expires - - // GOOD: Reject short-lived tokens upfront - if ttlSeconds < 60 { - return fmt.Errorf("token expires in %ds - get longer-lived token", ttlSeconds) - } - ``` - -4. **✗ Logging token values**: Exposes secrets in logs/telemetry - ```go - // BAD: Token in logs - logger.Info("Token", zap.String("value", token)) // ✗ SECURITY VIOLATION - - // GOOD: Never log tokens - logger.Info("Using token from --token flag") // ✓ No value logged - ``` - -### Reference Implementation - -See complete working example: [cmd/update/vault_cluster.go:272-324](cmd/update/vault_cluster.go#L272-L324) - -## Debug Verbosity (P2 - IMPORTANT) - -### Diagnostic Logging Strategy - -**Before Operations**: Log system state for forensics -```go -logger.Debug("Pre-operation diagnostics", - zap.String("service_dir", serviceDir), - zap.Bool("compose_file_exists", composeExists), - zap.String("docker_version", dockerVersion)) -``` - -**During Operations**: Trace command execution -```go -logger.Debug("Executing command", - zap.String("command", "docker"), - zap.Strings("args", args), - zap.String("working_dir", workDir)) -``` - -**After Operations**: Verify results -```go -logger.Debug("Post-operation verification", - zap.Bool("container_running", running), - zap.String("container_id", containerID)) -``` - -### Automatic Debug Output Capture (PLANNED - Infrastructure Ready) - -**Current Status**: Infrastructure implemented in [pkg/debug/capture.go](pkg/debug/capture.go) (151 lines), but NOT YET integrated into debug commands. 
- -**Evidence of Non-Integration**: -- ✅ `pkg/debug/capture.go` exists with `CaptureDebugOutput()` and `CaptureStdoutFunc()` -- ❌ Zero debug commands in `cmd/debug/*.go` use these functions -- ❌ Commands like [cmd/debug/vault.go:266-285](cmd/debug/vault.go#L266-L285) implement their own file writing instead - -**When to Migrate**: After completing current work (drift correction, Ceph integration), migrate debug commands one at a time with comprehensive testing. - -**Target Philosophy**: All `eos debug ...` commands automatically save their output to the user's directory for forensic analysis. No flags required - fully automatic, non-fatal if capture fails. - -**Target Capture Location**: `~/.eos/debug/eos-debug-{service}-{timestamp}.{ext}` -- Fallback to `/tmp` if home directory unavailable -- Timestamped filenames: `20060102-150405` format -- Format-aware extensions: `.txt`, `.json`, `.md` - -**Two Available Capture Patterns**: - -1. **Direct Capture** (for commands returning strings): -```go -// EXAMPLE - Not yet implemented in actual commands -func runVaultDebug(rc *eos_io.RuntimeContext, cmd *cobra.Command, args []string) error { - output, err := vault.GenerateDebugReport(rc, format) - if err != nil { - return fmt.Errorf("failed to generate debug report: %w", err) - } - - // Automatic capture - captureConfig := &debug.CaptureConfig{ - ServiceName: "vault", - Output: output, - Format: format, - } - if _, captureErr := debug.CaptureDebugOutput(rc, captureConfig); captureErr != nil { - logger.Warn("Failed to auto-capture debug output", zap.Error(captureErr)) - } - - fmt.Print(output) - return nil -} -``` - -2. 
**Stdout Wrapper** (for commands printing directly): -```go -// EXAMPLE - Not yet implemented in actual commands -func runDebugConsul(rc *eos_io.RuntimeContext, cmd *cobra.Command, args []string) error { - return debug.CaptureStdoutFunc(rc, "consul", func() error { - return consuldebug.RunDiagnostics(rc, config) - }) -} -``` - -**Migration Checklist (Per Debug Command)**: -- [ ] Read current command implementation -- [ ] Identify if command returns output string or prints to stdout -- [ ] Choose appropriate capture pattern (Direct vs Stdout Wrapper) -- [ ] Integrate capture function -- [ ] Remove any existing `--output` flag logic (replaced by automatic capture) -- [ ] Test with `go build && sudo eos debug [service]` -- [ ] Verify file saved to `~/.eos/debug/` -- [ ] Verify user sees log message with file location -- [ ] Verify output still displays correctly to user - -**Affected Commands** (13 files to migrate): -``` -cmd/debug/vault.go (266 lines - has --output flag, remove it) -cmd/debug/consul.go (272 lines) -cmd/debug/nomad.go (512 lines) -cmd/debug/bionicgpt.go (9210 bytes) -cmd/debug/ceph.go (5451 bytes) -cmd/debug/hecate.go (2889 bytes) -cmd/debug/mattermost.go (2586 bytes) -cmd/debug/openwebui.go (12715 bytes) -cmd/debug/wazuh.go (5160 bytes) -cmd/debug/bootstrap.go (26483 bytes - may not need capture) -cmd/debug/iris.go (48743 bytes - may not need capture) -cmd/debug/watchdog_traces.go (10189 bytes - may not need capture) -``` - -**Reference**: See [pkg/debug/capture.go](pkg/debug/capture.go) for implementation details - -### Evidence Collection (PLANNED - Infrastructure Ready) - -**Current Status**: Infrastructure implemented in [pkg/remotedebug/evidence.go](pkg/remotedebug/evidence.go) (265 lines), but NOT YET integrated into remotedebug commands. 
- -**Evidence of Non-Integration**: -- ✅ `pkg/remotedebug/evidence.go` exists with full evidence repository implementation -- ❌ Zero commands in `cmd/` use `NewEvidenceRepository()`, `CreateEvidence()`, or `StoreSession()` -- ❌ Remote debug SSH command doesn't persist evidence to disk yet - -**Why We Built This**: Original remotedebug implementation had critical gaps: -- ❌ Evidence only lived in memory during execution -- ❌ No forensic trail if remotedebug crashed or connection dropped -- ❌ Evidence was only human-readable strings, not machine-parseable -- ❌ No metadata for chain of custody or integrity verification - -**Solution Built (Not Yet Used)**: Structured evidence collection with automatic capture to disk. - -**Evidence vs Debug Output (Important Distinction)**: -- **Debug output**: Interactive diagnostic sessions, capture is backup -- **Evidence collection**: Automated gathering over SSH, capture IS the primary artifact -- **Different requirements**: Evidence needs chain of custody, integrity verification, structured storage - -**Evidence Types**: -```go -type EvidenceType string - -const ( - EvidenceTypeFile EvidenceType = "file" // File system evidence - EvidenceTypeCommand EvidenceType = "command" // Command output - EvidenceTypeLogEntry EvidenceType = "log" // Log file entry - EvidenceTypeMetric EvidenceType = "metric" // System metric - EvidenceTypeConfig EvidenceType = "config" // Configuration file - EvidenceTypeProcess EvidenceType = "process" // Process information - EvidenceTypeNetwork EvidenceType = "network" // Network state - EvidenceTypeSnapshot EvidenceType = "snapshot" // System snapshot -) -``` - -**Structured Evidence**: -```go -type StructuredEvidence struct { - Type EvidenceType // Type of evidence - Timestamp time.Time // When collected - Source string // Where from (hostname/IP) - Collector string // Who collected (user@host) - Data json.RawMessage // Actual evidence (structured JSON) - Checksum string // SHA256 for integrity 
verification - Metadata map[string]string // Additional context -} -``` - -**Evidence Session**: -```go -type EvidenceSession struct { - SessionID string // Unique session identifier - StartTime time.Time // Session start - EndTime time.Time // Session end - Host string // Target hostname - Collector string // Who ran the collection - Command string // Command that triggered collection - Evidence []StructuredEvidence // All collected evidence - Issues []Issue // Detected issues - Warnings []Warning // Warnings - Report *SystemReport // Complete system report -} -``` - -**Storage Structure**: -``` -~/.eos/evidence/ - ├── index.json # Searchable index (future) - ├── 20251022-143052-vhost5/ # Per-session evidence - │ ├── manifest.json # Session metadata - │ ├── evidence.json # All structured evidence - │ ├── issues.json # Detected issues with evidence - │ ├── warnings.json # Warnings - │ ├── report.json # Complete system report - │ └── summary.txt # Human-readable summary -``` - -**Usage Pattern**: -```go -// Create evidence repository -repo, err := remotedebug.NewEvidenceRepository() -if err != nil { - return fmt.Errorf("failed to create evidence repository: %w", err) -} - -// Create evidence item -diskEvidence, err := remotedebug.CreateEvidence( - remotedebug.EvidenceTypeMetric, - hostname, - diskInfo, // Any struct that can be marshaled to JSON -) - -// Store complete session -session := &remotedebug.EvidenceSession{ - SessionID: "session-" + time.Now().Format("20060102-150405"), - StartTime: startTime, - EndTime: time.Now(), - Host: hostname, - Collector: "user@workstation", - Command: "eos remotedebug ssh", - Evidence: collectedEvidence, - Issues: analyzedIssues, - Warnings: warnings, - Report: systemReport, -} - -sessionDir, err := repo.StoreSession(session) -if err != nil { - logger.Warn("Failed to store evidence session", zap.Error(err)) -} else { - logger.Info("Evidence session saved", - zap.String("location", sessionDir), - zap.Int("evidence_count", 
len(session.Evidence))) -} -``` - -**Integrity Verification**: -```go -// Verify evidence hasn't been tampered with -if evidence.VerifyEvidence() { - logger.Info("Evidence integrity verified") -} else { - logger.Error("Evidence checksum mismatch - possible tampering") -} -``` - -**What We Did NOT Implement (With Reasons)**: - -1. **❌ Evidence Signing/Encryption** - - **WHY NOT**: Dev/ops tool, not forensic investigation tool - - **ALTERNATIVE**: File permissions + checksums sufficient for integrity - - **IF NEEDED**: Add when compliance requirements are clear - -2. **❌ Chain-of-Custody Signatures** - - **WHY NOT**: No legal requirement stated - - **ALTERNATIVE**: Metadata tracking sufficient for troubleshooting - - **IF NEEDED**: Add for specific compliance (SOC2, PCI-DSS, etc.) - -3. **❌ Centralized Evidence Server** - - **WHY NOT**: Over-engineering, network dependency - - **ALTERNATIVE**: Local storage + optional rsync to central location - - **IF NEEDED**: Users script `rsync ~/.eos/evidence/` to server - -4. **❌ Automatic Retention/Rotation** - - **WHY NOT**: User's machine, user's policy - - **ALTERNATIVE**: Manual cleanup: `rm -rf ~/.eos/evidence/2024*` - - **IF NEEDED**: Add optional `--cleanup` flag later - -**Cleanup Commands**: -```bash -# View evidence sessions -ls -lh ~/.eos/evidence/ - -# View specific session -cat ~/.eos/evidence/20251022-143052-vhost5/summary.txt - -# Cleanup old evidence (manual) -find ~/.eos/evidence/ -type d -mtime +30 -exec rm -rf {} \; - -# Cleanup by size (keep only 1GB) -du -sh ~/.eos/evidence/ | awk '$1 > 1 {print "Evidence directory exceeds 1GB"}' -``` - -**Reference**: See `pkg/remotedebug/evidence.go` for implementation details - -## Flag Bypass Vulnerability Prevention (P0 - CRITICAL) - -### The Vulnerability - -When Cobra encounters the `--` separator in command-line arguments, it **stops parsing flags** and treats everything after it as positional arguments. 
This creates a security vulnerability where users can accidentally (or maliciously) bypass flag-based safety checks. - -**Example:** -```bash -# User intends to use --force flag -sudo eos delete env production -- --force - -# What Cobra sees: -# - Command: delete env -# - Args: ["production", "--force"] # Both are positional args! -# - Flags: force=false # Flag was never set! -``` - -**Security Impact:** -- Bypasses `--force` safety checks (production deletion, running VM deletion) -- Bypasses `--dry-run` validation -- Bypasses `--emergency-override` authentication -- Bypasses approval workflow requirements - -### Affected Commands (40+ files) - -Any command using `cobra.ExactArgs()`, `cobra.MaximumNArgs()`, or `cobra.MinimumNArgs()` is vulnerable. - -**Priority 1 (Safety-Critical):** -- `cmd/delete/env.go` - Production environment deletion -- `cmd/delete/kvm.go` - Running VM forced deletion -- `cmd/promote/approve.go` - Emergency approval override -- `cmd/promote/stack.go` - Multi-environment promotion - -**All affected:** See backup/*, create/*, delete/*, update/*, promote/* commands - -### Mandatory Mitigation Pattern - -**RULE**: ALL commands that accept positional arguments MUST validate them at the start of `RunE`. - -```go -// REQUIRED at start of every RunE that accepts args -RunE: eos.Wrap(func(rc *eos_io.RuntimeContext, cmd *cobra.Command, args []string) error { - logger := otelzap.Ctx(rc.Ctx) - - // CRITICAL: Detect flag-like args (--force, -f, etc.) - if err := verify.ValidateNoFlagLikeArgs(args); err != nil { - return err // User-friendly error with remediation - } - - // Rest of command logic... -}) -``` - -### Implementation Details - -See [pkg/verify/validators.go:271-294](pkg/verify/validators.go#L271-L294) for the validator implementation. 
- -**What it catches:** -- Long flags: `--force`, `--dry-run`, `--emergency-override` -- Short flags: `-f`, `-v`, `-i` -- Allows negative numbers: `-1`, `-42` (distinguishes from flags) - -**Error message example:** -``` -argument 1 looks like a long flag: '--force' -Did you use the '--' separator by mistake? -Remove the '--' separator to use flags properly. -Example: Use 'eos delete env prod --force' instead of 'eos delete env prod -- --force' -``` - -### Migration Checklist - -When adding this to existing commands: - -1. Add import: `"github.com/CodeMonkeyCybersecurity/eos/pkg/verify"` -2. Add validation as FIRST line in RunE (after logger initialization) -3. Test with: `eos [command] arg -- --flag` (should error) -4. Test normal usage: `eos [command] arg --flag` (should work) - -### Testing - -```bash -# Should FAIL with clear error -eos delete env production -- --force -eos create config -- hecate -eos promote approve id -- --emergency-override - -# Should SUCCEED -eos delete env production --force -eos create config --hecate -eos promote approve id --emergency-override -``` - -## Memory Notes - -- No emojis in code or documentation -- Prefer editing existing files over creating new ones -- Build iteratively on existing patterns -- Solve complex problems once, encode in Eos, never solve again - -## Documentation Policy (P0 - CRITICAL) - -**RULE**: Documentation MUST live in exactly ONE of these locations: - -1. **CLAUDE.md** - Development standards, patterns, critical rules -2. **ROADMAP.md** - Project roadmap, technical debt tracking, future work -3. **README.md** (per-directory) - Directory purpose and usage -4. **Inline comments** - ALL other documentation (implementation details, rationale, examples) - -**FORBIDDEN**: Creating standalone documentation files (*.md) except the above. 
Examples of FORBIDDEN files: -- ✗ `SECURITY_IMPROVEMENTS.md` - Put security patterns in CLAUDE.md, implementation in inline comments -- ✗ `P0_SECURITY_FIXES_COMPLETE.md` - Put completion status in ROADMAP.md -- ✗ `BACKUP_ADVERSARIAL_ANALYSIS.md` - Put analysis in inline comments in the code -- ✗ `DEPLOYMENT_GUIDE.md` - Put deployment steps in README.md -- ✗ `API_REFERENCE.md` - Use godoc comments in code - -**WHY**: Multiple documentation locations create: -- **Maintenance burden**: Updates must be synchronized across files -- **Stale documentation**: Standalone docs drift from code reality -- **Discovery problems**: Developers don't know which file to check -- **Duplication**: Same information in multiple places - -**CORRECT PATTERN**: -```go -// SECURITY: Password exposure mitigation (CVSS 7.5) -// Changed from RESTIC_PASSWORD env var to temporary password file. -// RATIONALE: Environment variables visible via 'ps auxe' and /proc//environ -// MITIGATION: -// 1. Create temp file with os.CreateTemp() (unpredictable name) -// 2. Set permissions to 0400 (owner read-only) -// 3. Write password to file -// 4. Pass file path via RESTIC_PASSWORD_FILE -// 5. 
Delete file immediately after use (defer cleanup) -// EVIDENCE: This pattern prevents password scraping attacks -// COMPLIANCE: PCI-DSS 8.2.1, SOC2 CC6.1 -passwordFile, err := os.CreateTemp("", "restic-password-*") -defer os.Remove(passwordFile.Name()) -defer passwordFile.Close() -os.Chmod(passwordFile.Name(), TempPasswordFilePerm) -``` - -**WRONG PATTERN**: -```go -// See SECURITY_IMPROVEMENTS.md for password security pattern -passwordFile, err := os.CreateTemp("", "restic-password-*") -``` +## Architecture (quick reference) -**Migration**: If standalone documentation files exist, consolidate them: -- Security patterns → CLAUDE.md or inline comments -- Completion status → ROADMAP.md -- Implementation details → Inline comments -- Then DELETE the standalone file +- **cmd/**: Orchestration ONLY — cobra + flags + call pkg/. If >100 lines → move to pkg/. +- **pkg/**: ALL business logic — ASSESS → INTERVENE → EVALUATE. Always use `*eos_io.RuntimeContext`. +- **Logging:** ONLY `otelzap.Ctx(rc.Ctx)` — never `fmt.Print*` (exception: `fmt.Print(report.Render())` at end of cmd/debug/ handlers) +- **Constants:** NEVER hardcode — use `pkg/shared/ports.go`, `pkg/shared/paths.go`, `pkg/[service]/constants.go` +- **Human-centric:** Missing flags → `interaction.GetRequiredString()` fallback chain, never hard-fail -**Exception**: README.md files in directories are ALLOWED and ENCOURAGED for directory-level documentation +## Documentation -## External References +- Patterns and detailed examples: @prompts/docs/CANONICAL.md +- Roadmap and technical debt: [ROADMAP.md](ROADMAP.md) +- Per-directory docs: `README.md` in each directory -- Detailed patterns: [PATTERNS.md](./docs/PATTERNS.md) -- Documentation index: [docs/INDEX.md](./docs/INDEX.md) -- Knowledge base: [Athena Wiki](https://wiki.cybermonkey.net.au) -- Company: [Code Monkey Cybersecurity](https://cybermonkey.net.au/) -- Social: [Facebook](https://www.facebook.com/codemonkeycyber) | [X/Twitter](https://x.com/codemonkeycyber) | 
[LinkedIn](https://www.linkedin.com/company/codemonkeycyber) | [YouTube](https://www.youtube.com/@CodeMonkeyCybersecurity) +**FORBIDDEN**: standalone `*.md` files other than ROADMAP.md and per-directory README.md. Put patterns in `.claude/rules/`, implementation rationale in inline comments. -## License Awareness +## Cross-repo work -Eos is dual-licensed: -- GNU Affero General Public License v3 (AGPL-3.0-or-later) -- Do No Harm License +If a fix belongs in a different repo, create the issue in the **target repo** first, then use the ISoBAR cross-repo template from `prompts/COORDINATION.md`. -When suggesting code, ensure compliance with both licenses. Focus on defensive security, human benefit, and open collaboration. +Repo inventory: hecate (gateway, vhost7), moni (backend, vhost11), contracts (data contracts), aphrodite (UI), prompts (governance). ---- +## Testing -*"Cybersecurity. With humans."* \ No newline at end of file +@prompts/TESTING.md +@prompts/TESTING-GO.md diff --git a/prompts b/prompts new file mode 160000 index 00000000..6f1d85fd --- /dev/null +++ b/prompts @@ -0,0 +1 @@ +Subproject commit 6f1d85fd6ad4108531bb27b70efcd160e2ac727d