diff --git a/config/phantom.yaml b/config/phantom.yaml index 6d6815a..1350f79 100644 --- a/config/phantom.yaml +++ b/config/phantom.yaml @@ -1,7 +1,7 @@ name: phantom port: 3100 role: swe -model: claude-sonnet-4-6 +model: claude-opus-4-6 effort: max max_budget_usd: 0 timeout_minutes: 240 diff --git a/src/agent/__tests__/prompt-assembler.test.ts b/src/agent/__tests__/prompt-assembler.test.ts index 8c9a180..6029682 100644 --- a/src/agent/__tests__/prompt-assembler.test.ts +++ b/src/agent/__tests__/prompt-assembler.test.ts @@ -7,6 +7,7 @@ const baseConfig: PhantomConfig = { port: 3100, role: "swe", model: "claude-opus-4-6", + model_source: "config", effort: "max", max_budget_usd: 0, timeout_minutes: 240, diff --git a/src/cli/__tests__/init.test.ts b/src/cli/__tests__/init.test.ts index 83e9580..a3c4368 100644 --- a/src/cli/__tests__/init.test.ts +++ b/src/cli/__tests__/init.test.ts @@ -62,7 +62,7 @@ describe("phantom init", () => { expect(config.name).toBe("phantom"); expect(config.role).toBe("swe"); expect(config.port).toBe(3100); - expect(config.model).toBe("claude-sonnet-4-6"); + expect(config.model).toBe("claude-opus-4-6"); }); test("accepts custom name and role", async () => { @@ -313,7 +313,7 @@ describe("phantom init --yes (environment-aware)", () => { const raw = readFileSync("config/phantom.yaml", "utf-8"); const config = YAML.parse(raw); - expect(config.model).toBe("claude-sonnet-4-6"); + expect(config.model).toBe("claude-opus-4-6"); }); test("reads PHANTOM_DOMAIN from environment", async () => { diff --git a/src/cli/doctor.ts b/src/cli/doctor.ts index 98bb6db..cddbdaa 100644 --- a/src/cli/doctor.ts +++ b/src/cli/doctor.ts @@ -1,4 +1,4 @@ -import { existsSync, readFileSync } from "node:fs"; +import { constants, accessSync, existsSync, readFileSync } from "node:fs"; import { parseArgs } from "node:util"; type CheckResult = { @@ -148,6 +148,52 @@ async function checkEvolvedConfig(): Promise { return { name: "Evolved Config", status: "ok", message: "All config files present" }; } +async function checkEvolutionPipeline(): Promise { + const sessionLog = "phantom-config/memory/session-log.jsonl"; + const metricsFile = "phantom-config/meta/metrics.json"; + + if (!existsSync("phantom-config")) { + return { name: "Evolution Pipeline", status: "warn", message: "phantom-config/ not found", fix: "phantom init" }; + } + + // Check session-log.jsonl is writable + if (!existsSync(sessionLog)) { + return { + name: "Evolution Pipeline", + status: "warn", + message: "session-log.jsonl not found", + fix: "Run a session to generate it, or run phantom init", + }; + } + + try { + accessSync(sessionLog, constants.R_OK | constants.W_OK); + } catch { + return { name: "Evolution Pipeline", status: "fail", message: "session-log.jsonl not writable" }; + } + + // Check metrics.json shows pipeline activity + if (!existsSync(metricsFile)) { + return { name: "Evolution Pipeline", status: "warn", message: "metrics.json not found", fix: "phantom init" }; + } + + try { + const raw = readFileSync(metricsFile, "utf-8"); + const metrics = JSON.parse(raw) as { session_count?: number }; + const count = metrics.session_count ?? 0; + if (count === 0) { + return { + name: "Evolution Pipeline", + status: "warn", + message: "No sessions recorded yet (pipeline not yet active)", + }; + } + return { name: "Evolution Pipeline", status: "ok", message: `${count} sessions processed` }; + } catch { + return { name: "Evolution Pipeline", status: "fail", message: "metrics.json not parseable" }; + } +} + async function checkPhantomHealth(port: number): Promise { try { const resp = await fetch(`http://localhost:${port}/health`, { signal: AbortSignal.timeout(3000) }); @@ -194,6 +240,7 @@ export async function runDoctor(args: string[]): Promise { checkMcpConfig(), checkDatabase(), checkEvolvedConfig(), + checkEvolutionPipeline(), checkPhantomHealth(port), ]); diff --git a/src/cli/init.ts b/src/cli/init.ts index db315a1..f83a2b3 100644 --- a/src/cli/init.ts +++ b/src/cli/init.ts @@ -198,7 +198,7 @@ export async function runInit(args: string[]): Promise { name: values.name ?? envName ?? "phantom", role: values.role ?? envRole ?? "swe", port: values.port ? Number.parseInt(values.port, 10) : envPort ? Number.parseInt(envPort, 10) : 3100, - model: envModel ?? "claude-sonnet-4-6", + model: envModel ?? "claude-opus-4-6", domain: envDomain, public_url: envPublicUrl, effort: envEffort, @@ -224,7 +224,7 @@ export async function runInit(args: string[]): Promise { port: values.port ? Number.parseInt(values.port, 10) : Number.parseInt(await prompt(rl, "HTTP port", "3100"), 10), - model: await prompt(rl, "Model (claude-sonnet-4-6, claude-opus-4-6)", "claude-sonnet-4-6"), + model: await prompt(rl, "Model (claude-opus-4-6, claude-sonnet-4-6)", "claude-opus-4-6"), }; console.log("\nSlack setup (optional, press Enter to skip):"); diff --git a/src/cli/status.ts b/src/cli/status.ts index 7342d2f..1e552de 100644 --- a/src/cli/status.ts +++ b/src/cli/status.ts @@ -6,9 +6,16 @@ type HealthResponse = { version: string; agent: string; role: { id: string; name: string }; + model?: string; + model_source?: "config" | "env"; channels: Record; memory: { qdrant: boolean; ollama: boolean }; - evolution: { generation: number }; + evolution: { + generation: number; + session_count?: number; + sessions_since_consolidation?: number; + session_log_depth?: number; + }; onboarding?: string; peers?: Record; }; @@ -74,9 +81,18 @@ export async function runStatus(args: string[]): Promise { const memoryStr = data.memory.qdrant && data.memory.ollama ? "ok" : data.memory.qdrant || data.memory.ollama ? "degraded" : "offline"; + const modelStr = data.model ? `${data.model}${data.model_source === "env" ? " (env override)" : ""}` : "unknown"; + + const evo = data.evolution; + const evoDetail = + evo.session_count != null + ? `gen ${evo.generation} (${evo.session_count} sessions, ${evo.sessions_since_consolidation ?? 0} since consolidation, ${evo.session_log_depth ?? 0} queued)` + : `gen ${evo.generation}`; + console.log( `${data.agent} | ${data.role.name} | v${data.version} | ` + - `gen ${data.evolution.generation} | ` + + `${evoDetail} | ` + + `model: ${modelStr} | ` + `up ${formatUptime(data.uptime)} | ` + `channels: ${channelStr} | ` + `memory: ${memoryStr}`, diff --git a/src/config/__tests__/loader.test.ts b/src/config/__tests__/loader.test.ts index d50f052..69eec0c 100644 --- a/src/config/__tests__/loader.test.ts +++ b/src/config/__tests__/loader.test.ts @@ -106,6 +106,7 @@ model: claude-opus-4-6 process.env.PHANTOM_MODEL = "claude-sonnet-4-6"; const config = loadConfig(path); expect(config.model).toBe("claude-sonnet-4-6"); + expect(config.model_source).toBe("env"); } finally { if (saved !== undefined) { process.env.PHANTOM_MODEL = saved; @@ -116,6 +117,27 @@ model: claude-opus-4-6 } }); + test("model_source defaults to config when no env override", () => { + const path = writeYaml( + "model-source-default.yaml", + ` +name: test-phantom +model: claude-opus-4-6 +`, + ); + const saved = process.env.PHANTOM_MODEL; + try { + process.env.PHANTOM_MODEL = undefined; + const config = loadConfig(path); + expect(config.model_source).toBe("config"); + } finally { + if (saved !== undefined) { + process.env.PHANTOM_MODEL = saved; + } + cleanup(); + } + }); + test("env var overrides YAML domain", () => { const path = writeYaml( "env-domain.yaml", diff --git a/src/config/loader.ts b/src/config/loader.ts index 741db6e..b3600cb 100644 --- a/src/config/loader.ts +++ b/src/config/loader.ts @@ -29,7 +29,12 @@ export function loadConfig(path?: string): PhantomConfig { // Environment variable overrides for runtime flexibility. // These let operators change settings via env without editing YAML. if (process.env.PHANTOM_MODEL) { - config.model = process.env.PHANTOM_MODEL; + const envModel = process.env.PHANTOM_MODEL; + if (envModel !== config.model) { + console.warn(`[config] Model override: ${config.model} (yaml) -> ${envModel} (env)`); + } + config.model = envModel; + config.model_source = "env"; } if (process.env.PHANTOM_DOMAIN) { config.domain = process.env.PHANTOM_DOMAIN; diff --git a/src/config/schemas.ts b/src/config/schemas.ts index dbce22c..5b4de87 100644 --- a/src/config/schemas.ts +++ b/src/config/schemas.ts @@ -13,7 +13,8 @@ export const PhantomConfigSchema = z.object({ public_url: z.string().url().optional(), port: z.number().int().min(1).max(65535).default(3100), role: z.string().min(1).default("swe"), - model: z.string().min(1).default("claude-sonnet-4-6"), + model: z.string().min(1).default("claude-opus-4-6"), + model_source: z.enum(["config", "env"]).default("config"), effort: z.enum(["low", "medium", "high", "max"]).default("max"), max_budget_usd: z.number().min(0).default(0), timeout_minutes: z.number().min(1).default(240), diff --git a/src/core/server.ts b/src/core/server.ts index 6244bb3..dfc2455 100644 --- a/src/core/server.ts +++ b/src/core/server.ts @@ -10,12 +10,20 @@ import { handleUiRequest } from "../ui/serve.ts"; const VERSION = "0.18.2"; type MemoryHealthProvider = () => Promise; +type EvolutionInfo = { + generation: number; + session_count: number; + sessions_since_consolidation: number; + session_log_depth: number; +}; type EvolutionVersionProvider = () => number; +type EvolutionInfoProvider = () => EvolutionInfo; type McpServerProvider = () => PhantomMcpServer | null; type ChannelHealthProvider = () => Record; type RoleInfoProvider = () => { id: string; name: string } | null; type OnboardingStatusProvider = () => string; type WebhookHandler = (req: Request) => Promise; +type ModelInfoProvider = () => { model: string; model_source: "config" | "env" }; type PeerHealthProvider = () => Record; type TriggerDeps = { runtime: AgentRuntime; @@ -25,11 +33,13 @@ type TriggerDeps = { let memoryHealthProvider: MemoryHealthProvider | null = null; let evolutionVersionProvider: EvolutionVersionProvider | null = null; +let evolutionInfoProvider: EvolutionInfoProvider | null = null; let mcpServerProvider: McpServerProvider | null = null; let channelHealthProvider: ChannelHealthProvider | null = null; let roleInfoProvider: RoleInfoProvider | null = null; let onboardingStatusProvider: OnboardingStatusProvider | null = null; let webhookHandler: WebhookHandler | null = null; +let modelInfoProvider: ModelInfoProvider | null = null; let peerHealthProvider: PeerHealthProvider | null = null; let triggerDeps: TriggerDeps | null = null; @@ -41,6 +51,10 @@ export function setEvolutionVersionProvider(provider: EvolutionVersionProvider): evolutionVersionProvider = provider; } +export function setEvolutionInfoProvider(provider: EvolutionInfoProvider): void { + evolutionInfoProvider = provider; +} + export function setMcpServerProvider(provider: McpServerProvider): void { mcpServerProvider = provider; } @@ -61,6 +75,10 @@ export function setWebhookHandler(handler: WebhookHandler): void { webhookHandler = handler; } +export function setModelInfoProvider(provider: ModelInfoProvider): void { + modelInfoProvider = provider; +} + export function setPeerHealthProvider(provider: PeerHealthProvider): void { peerHealthProvider = provider; } @@ -92,11 +110,13 @@ export function startServer(config: PhantomConfig, startedAt: number): ReturnTyp // Both up -> ok. One up -> degraded. Both down + configured -> down. Not configured -> ok. const status = allHealthy ? "ok" : someHealthy ? "degraded" : memory.configured ? "down" : "ok"; const evolutionGeneration = evolutionVersionProvider ? evolutionVersionProvider() : 0; + const evolutionInfo = evolutionInfoProvider ? evolutionInfoProvider() : null; const roleInfo = roleInfoProvider ? roleInfoProvider() : null; const onboardingStatus = onboardingStatusProvider ? onboardingStatusProvider() : null; const peers = peerHealthProvider ? peerHealthProvider() : null; + const modelInfo = modelInfoProvider ? modelInfoProvider() : null; return Response.json({ status, @@ -105,9 +125,10 @@ export function startServer(config: PhantomConfig, startedAt: number): ReturnTyp agent: config.name, ...(config.public_url ? { public_url: config.public_url } : {}), role: roleInfo ?? { id: config.role, name: config.role }, + ...(modelInfo ? { model: modelInfo.model, model_source: modelInfo.model_source } : {}), channels, memory, - evolution: { + evolution: evolutionInfo ?? { generation: evolutionGeneration, }, ...(onboardingStatus ? { onboarding: onboardingStatus } : {}), diff --git a/src/evolution/__tests__/application.test.ts b/src/evolution/__tests__/application.test.ts index 9ac42b9..97699bc 100644 --- a/src/evolution/__tests__/application.test.ts +++ b/src/evolution/__tests__/application.test.ts @@ -62,8 +62,11 @@ describe("Application", () => { const content = readFileSync(`${TEST_DIR}/user-profile.md`, "utf-8"); expect(content).toContain("Prefers TypeScript"); - expect(change.file).toBe("user-profile.md"); - expect(change.content).toBe("- Prefers TypeScript"); + expect(change).not.toBeNull(); + if (change) { + expect(change.file).toBe("user-profile.md"); + expect(change.content).toBe("- Prefers TypeScript"); + } }); test("replaces content in file", () => { @@ -99,6 +102,73 @@ describe("Application", () => { expect(content).not.toContain("Preferences go here."); }); + test("skips append when all content lines already exist", () => { + writeFileSync(`${TEST_DIR}/user-profile.md`, "# User Profile\n\n- Prefers TypeScript\n", "utf-8"); + const delta: ConfigDelta = { + file: "user-profile.md", + type: "append", + content: "- Prefers TypeScript", + rationale: "User said so", + session_ids: ["s1"], + tier: "free", + }; + applyDelta(delta, testConfig()); + + const content = readFileSync(`${TEST_DIR}/user-profile.md`, "utf-8"); + // Should not have duplicated the line + const matches = content.match(/Prefers TypeScript/g); + expect(matches).toHaveLength(1); + }); + + test("appends only new lines from partial overlap", () => { + writeFileSync(`${TEST_DIR}/user-profile.md`, "# User Profile\n\n- Prefers TypeScript\n", "utf-8"); + const delta: ConfigDelta = { + file: "user-profile.md", + type: "append", + content: "- Prefers TypeScript\n- Uses Bun runtime", + rationale: "Mixed new and existing", + session_ids: ["s1"], + tier: "free", + }; + applyDelta(delta, testConfig()); + + const content = readFileSync(`${TEST_DIR}/user-profile.md`, "utf-8"); + const tsMatches = content.match(/Prefers TypeScript/g); + expect(tsMatches).toHaveLength(1); + expect(content).toContain("Uses Bun runtime"); + }); + + test("returns null for no-op append", () => { + writeFileSync(`${TEST_DIR}/user-profile.md`, "# User Profile\n\n- Prefers TypeScript\n", "utf-8"); + const delta: ConfigDelta = { + file: "user-profile.md", + type: "append", + content: "- Prefers TypeScript", + rationale: "Duplicate", + session_ids: ["s1"], + tier: "free", + }; + const result = applyDelta(delta, testConfig()); + expect(result).toBeNull(); + }); + + test("appends when content is genuinely new", () => { + writeFileSync(`${TEST_DIR}/user-profile.md`, "# User Profile\n\n- Prefers TypeScript\n", "utf-8"); + const delta: ConfigDelta = { + file: "user-profile.md", + type: "append", + content: "- Uses Bun runtime", + rationale: "New preference", + session_ids: ["s1"], + tier: "free", + }; + applyDelta(delta, testConfig()); + + const content = readFileSync(`${TEST_DIR}/user-profile.md`, "utf-8"); + expect(content).toContain("Prefers TypeScript"); + expect(content).toContain("Uses Bun runtime"); + }); + test("creates new file if it does not exist", () => { const delta: ConfigDelta = { file: "new-file.md", diff --git a/src/evolution/__tests__/consolidation.test.ts b/src/evolution/__tests__/consolidation.test.ts new file mode 100644 index 0000000..789240f --- /dev/null +++ b/src/evolution/__tests__/consolidation.test.ts @@ -0,0 +1,75 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import type { EvolutionConfig } from "../config.ts"; +import { compressUserProfile } from "../consolidation.ts"; + +const TEST_DIR = "/tmp/phantom-test-consolidation"; + +function testConfig(): EvolutionConfig { + return { + cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 }, + gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 }, + reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 }, + judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 }, + paths: { + config_dir: TEST_DIR, + constitution: `${TEST_DIR}/constitution.md`, + version_file: `${TEST_DIR}/meta/version.json`, + metrics_file: `${TEST_DIR}/meta/metrics.json`, + evolution_log: `${TEST_DIR}/meta/evolution-log.jsonl`, + golden_suite: `${TEST_DIR}/meta/golden-suite.jsonl`, + session_log: `${TEST_DIR}/memory/session-log.jsonl`, + }, + }; +} + +describe("compressUserProfile", () => { + beforeEach(() => { + mkdirSync(TEST_DIR, { recursive: true }); + }); + + afterEach(() => { + rmSync(TEST_DIR, { recursive: true, force: true }); + }); + + test("deduplicates repeated lines when over size limit", () => { + // Generate content that exceeds max_file_lines (200) + const lines = ["# User Profile", ""]; + for (let i = 0; i < 150; i++) { + lines.push(`- Preference ${i}`); + } + // Add duplicates to push over 200 + for (let i = 0; i < 60; i++) { + lines.push(`- Preference ${i}`); + } + writeFileSync(`${TEST_DIR}/user-profile.md`, lines.join("\n"), "utf-8"); + + const result = compressUserProfile(testConfig()); + expect(result).toBe(true); + + const compressed = readFileSync(`${TEST_DIR}/user-profile.md`, "utf-8"); + const prefMatches = compressed.match(/- Preference 0$/gm); + expect(prefMatches).toHaveLength(1); + }); + + test("returns false when under size limit even with duplicates", () => { + const content = ["# User Profile", "", "- Prefers TypeScript", "- Prefers TypeScript"].join("\n"); + writeFileSync(`${TEST_DIR}/user-profile.md`, content, "utf-8"); + + const result = compressUserProfile(testConfig()); + expect(result).toBe(false); + }); + + test("returns false when no duplicates exist", () => { + const content = "# User Profile\n\n- Prefers TypeScript\n- Uses Bun runtime\n"; + writeFileSync(`${TEST_DIR}/user-profile.md`, content, "utf-8"); + + const result = compressUserProfile(testConfig()); + expect(result).toBe(false); + }); + + test("returns false when file does not exist", () => { + const result = compressUserProfile(testConfig()); + expect(result).toBe(false); + }); +}); diff --git a/src/evolution/__tests__/engine.test.ts b/src/evolution/__tests__/engine.test.ts index 7af4012..83d974e 100644 --- a/src/evolution/__tests__/engine.test.ts +++ b/src/evolution/__tests__/engine.test.ts @@ -1,5 +1,5 @@ import { afterEach, beforeEach, describe, expect, test } from "bun:test"; -import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { existsSync, mkdirSync, readFileSync, readdirSync, rmSync, writeFileSync } from "node:fs"; import { EvolutionEngine } from "../engine.ts"; import type { SessionSummary } from "../types.ts"; @@ -148,7 +148,7 @@ describe("EvolutionEngine", () => { test("afterSession with no signals returns current version", async () => { const engine = new EvolutionEngine(CONFIG_PATH); - const session = makeSession({ user_messages: ["What time is it?"] }); + const session = makeSession({ user_messages: ["What time is it?"], outcome: "abandoned", tools_used: [] }); const result = await engine.afterSession(session); expect(result.changes_applied).toHaveLength(0); }); @@ -264,4 +264,40 @@ describe("EvolutionEngine", () => { expect(config.userProfile).toContain("TypeScript"); expect(config.meta.version).toBeGreaterThan(0); }); + + test("backupConfig creates versioned backup directory", async () => { + const engine = new EvolutionEngine(CONFIG_PATH); + const session = makeSession({ + user_messages: ["No, use TypeScript not JavaScript"], + }); + await engine.afterSession(session); + + const backupDir = `${TEST_DIR}/data/config-backups`; + expect(existsSync(backupDir)).toBe(true); + + const version = engine.getCurrentVersion(); + const versionDir = `${backupDir}/v${version}`; + expect(existsSync(versionDir)).toBe(true); + expect(existsSync(`${versionDir}/constitution.md`)).toBe(true); + expect(existsSync(`${versionDir}/persona.md`)).toBe(true); + }); + + test("backupConfig retains only last 5 versions", async () => { + const engine = new EvolutionEngine(CONFIG_PATH); + + // Run 7 sessions that each produce a change + for (let i = 0; i < 7; i++) { + const session = makeSession({ + session_id: `session-backup-${i}`, + user_messages: [`No, prefer tool-${i} over the old one`], + }); + await engine.afterSession(session); + } + + const backupDir = `${TEST_DIR}/data/config-backups`; + if (existsSync(backupDir)) { + const entries = readdirSync(backupDir).filter((e) => e.startsWith("v")); + expect(entries.length).toBeLessThanOrEqual(5); + } + }); }); diff --git a/src/evolution/__tests__/judge-activation.test.ts b/src/evolution/__tests__/judge-activation.test.ts index 59935ce..3cb33ba 100644 --- a/src/evolution/__tests__/judge-activation.test.ts +++ b/src/evolution/__tests__/judge-activation.test.ts @@ -79,10 +79,20 @@ function setupWithJudgeMode(enabled: "auto" | "always" | "never"): void { } let savedApiKey: string | undefined; +let savedAuthToken: string | undefined; +let savedOauthToken: string | undefined; +let savedJudgeKey: string | undefined; describe("Judge Activation", () => { beforeEach(() => { savedApiKey = process.env.ANTHROPIC_API_KEY; + savedAuthToken = process.env.ANTHROPIC_AUTH_TOKEN; + savedOauthToken = process.env.CLAUDE_CODE_OAUTH_TOKEN; + savedJudgeKey = process.env.JUDGE_API_KEY; + // Clear all auth env vars so tests control them explicitly + process.env.ANTHROPIC_AUTH_TOKEN = undefined; + process.env.CLAUDE_CODE_OAUTH_TOKEN = undefined; + process.env.JUDGE_API_KEY = undefined; }); afterEach(() => { @@ -91,6 +101,21 @@ describe("Judge Activation", () => { } else { process.env.ANTHROPIC_API_KEY = undefined; } + if (savedAuthToken !== undefined) { + process.env.ANTHROPIC_AUTH_TOKEN = savedAuthToken; + } else { + process.env.ANTHROPIC_AUTH_TOKEN = undefined; + } + if (savedOauthToken !== undefined) { + process.env.CLAUDE_CODE_OAUTH_TOKEN = savedOauthToken; + } else { + process.env.CLAUDE_CODE_OAUTH_TOKEN = undefined; + } + if (savedJudgeKey !== undefined) { + process.env.JUDGE_API_KEY = savedJudgeKey; + } else { + process.env.JUDGE_API_KEY = undefined; + } rmSync(TEST_DIR, { recursive: true, force: true }); }); @@ -108,6 +133,30 @@ describe("Judge Activation", () => { expect(engine.usesLLMJudges()).toBe(false); }); + test("auto mode enables judges with ANTHROPIC_AUTH_TOKEN alone", () => { + process.env.ANTHROPIC_API_KEY = undefined; + process.env.ANTHROPIC_AUTH_TOKEN = "auth-token-test"; + setupWithJudgeMode("auto"); + const engine = new EvolutionEngine(CONFIG_PATH); + expect(engine.usesLLMJudges()).toBe(true); + }); + + test("auto mode enables judges with CLAUDE_CODE_OAUTH_TOKEN alone", () => { + process.env.ANTHROPIC_API_KEY = undefined; + process.env.CLAUDE_CODE_OAUTH_TOKEN = "oauth-token-test"; + setupWithJudgeMode("auto"); + const engine = new EvolutionEngine(CONFIG_PATH); + expect(engine.usesLLMJudges()).toBe(true); + }); + + test("auto mode enables judges with JUDGE_API_KEY alone", () => { + process.env.ANTHROPIC_API_KEY = undefined; + process.env.JUDGE_API_KEY = "sk-judge-test-key"; + setupWithJudgeMode("auto"); + const engine = new EvolutionEngine(CONFIG_PATH); + expect(engine.usesLLMJudges()).toBe(true); + }); + test("never mode disables judges even when API key is set", () => { process.env.ANTHROPIC_API_KEY = "sk-test-key"; setupWithJudgeMode("never"); diff --git a/src/evolution/__tests__/observation-judge.test.ts b/src/evolution/__tests__/observation-judge.test.ts new file mode 100644 index 0000000..55f8e34 --- /dev/null +++ b/src/evolution/__tests__/observation-judge.test.ts @@ -0,0 +1,73 @@ +import { describe, expect, test } from "bun:test"; +import { toSessionObservations } from "../judges/observation-judge.ts"; +import type { ObservationExtractionResultType } from "../judges/schemas.ts"; + +describe("toSessionObservations", () => { + test("preserves affected_files from judge output", () => { + const judgeResult: ObservationExtractionResultType = { + session_summary: "Test session", + session_outcome: "success", + observations: [ + { + type: "domain_fact_learned", + summary: "API runs on port 8080", + detail: "User mentioned their API configuration", + evidence: "Our API runs on port 8080", + importance: 0.7, + importance_reasoning: "Useful for future tasks", + affected_config_files: ["domain-knowledge.md", "strategies/task-patterns.md"], + }, + ], + implicit_signals: { + user_satisfaction: 0.8, + user_satisfaction_evidence: "User seemed happy", + agent_performance: 0.7, + agent_performance_evidence: "Task completed", + }, + meta: { + total_user_messages: 3, + total_corrections: 0, + tools_used: ["Read"], + primary_task_type: "configuration", + }, + }; + + const observations = toSessionObservations(judgeResult); + expect(observations).toHaveLength(1); + expect(observations[0].affected_files).toEqual(["domain-knowledge.md", "strategies/task-patterns.md"]); + expect(observations[0].type).toBe("domain_fact"); + }); + + test("sets affected_files to undefined when empty", () => { + const judgeResult: ObservationExtractionResultType = { + session_summary: "Test session", + session_outcome: "success", + observations: [ + { + type: "preference_stated", + summary: "User prefers tabs", + detail: "Formatting preference", + evidence: "I prefer tabs", + importance: 0.5, + importance_reasoning: "Style preference", + affected_config_files: [], + }, + ], + implicit_signals: { + user_satisfaction: 0.8, + user_satisfaction_evidence: "OK", + agent_performance: 0.7, + agent_performance_evidence: "OK", + }, + meta: { + total_user_messages: 1, + total_corrections: 0, + tools_used: [], + primary_task_type: "general", + }, + }; + + const observations = toSessionObservations(judgeResult); + expect(observations[0].affected_files).toBeUndefined(); + }); +}); diff --git a/src/evolution/__tests__/reflection.test.ts b/src/evolution/__tests__/reflection.test.ts index 69e359a..8dc6cc9 100644 --- a/src/evolution/__tests__/reflection.test.ts +++ b/src/evolution/__tests__/reflection.test.ts @@ -1,6 +1,6 @@ import { describe, expect, test } from "bun:test"; import { buildCritiqueFromObservations, extractObservations, generateDeltas } from "../reflection.ts"; -import type { EvolvedConfig, SessionSummary } from "../types.ts"; +import type { EvolvedConfig, SessionObservation, SessionSummary } from "../types.ts"; function makeSession(overrides: Partial = {}): SessionSummary { return { @@ -111,12 +111,92 @@ describe("buildCritiqueFromObservations", () => { expect(critique.suggested_changes[0].file).toBe("user-profile.md"); }); - test("produces no changes for simple successful sessions", () => { + test("routes domain_fact observations to domain-knowledge.md", () => { + const session = makeSession({ user_messages: ["Our team uses PostgreSQL for all databases"] }); + const observations = extractObservations(session); + const critique = buildCritiqueFromObservations(observations, session, makeEvolvedConfig()); + + const domainChanges = critique.suggested_changes.filter((c) => c.file === "domain-knowledge.md"); + expect(domainChanges.length).toBeGreaterThan(0); + expect(domainChanges[0].type).toBe("append"); + }); + + test("routes error observations to strategies/error-recovery.md", () => { + const session = makeSession({ outcome: "failure" }); + const observations = extractObservations(session); + const critique = buildCritiqueFromObservations(observations, session, makeEvolvedConfig()); + + const errorChanges = critique.suggested_changes.filter((c) => c.file === "strategies/error-recovery.md"); + expect(errorChanges.length).toBeGreaterThan(0); + }); + + test("routes tool_pattern observations to strategies/tool-preferences.md", () => { + const session = makeSession({ tools_used: ["Read", "Write", "Bash"] }); + const observations = extractObservations(session); + const critique = buildCritiqueFromObservations(observations, session, makeEvolvedConfig()); + + const toolChanges = critique.suggested_changes.filter((c) => c.file === "strategies/tool-preferences.md"); + expect(toolChanges.length).toBeGreaterThan(0); + }); + + test("routes success observations to strategies/task-patterns.md", () => { + const session = makeSession({ outcome: "success" }); + const observations = extractObservations(session); + const critique = buildCritiqueFromObservations(observations, session, makeEvolvedConfig()); + + const successChanges = critique.suggested_changes.filter((c) => c.file === "strategies/task-patterns.md"); + expect(successChanges.length).toBeGreaterThan(0); + }); + + test("rejects path traversal in affected_files", () => { + const observations: SessionObservation[] = [ + { + type: "domain_fact", + content: "Malicious content", + context: "Attacker-controlled", + confidence: 0.8, + source_messages: ["test"], + affected_files: ["../../.env"], + }, + ]; + const session = makeSession(); + const critique = buildCritiqueFromObservations(observations, session, makeEvolvedConfig()); + + // Should fall back to default, not use the traversal path + const changes = critique.suggested_changes.filter((c) => c.file === "domain-knowledge.md"); + expect(changes.length).toBe(1); + expect(critique.suggested_changes.every((c) => !c.file.includes(".."))).toBe(true); + }); + + test("uses affected_files override when present", () => { + const observations: SessionObservation[] = [ + { + type: "domain_fact", + content: "API runs on port 8080", + context: "User shared domain knowledge", + confidence: 0.8, + source_messages: ["Our API runs on port 8080"], + affected_files: ["strategies/task-patterns.md"], + }, + ]; + const session = makeSession(); + const critique = buildCritiqueFromObservations(observations, session, makeEvolvedConfig()); + + const changes = critique.suggested_changes.filter((c) => c.file === "strategies/task-patterns.md"); + expect(changes.length).toBeGreaterThan(0); + expect(changes[0].content).toContain("API runs on port 8080"); + }); + + test("produces only success/tool changes for simple successful sessions", () => { const session = makeSession({ user_messages: ["What is 2+2?"] }); const observations = extractObservations(session); const critique = buildCritiqueFromObservations(observations, session, makeEvolvedConfig()); - expect(critique.suggested_changes.length).toBe(0); + // No correction or preference changes, but success routes to task-patterns + const correctionChanges = critique.suggested_changes.filter((c) => c.file === "user-profile.md"); + expect(correctionChanges.length).toBe(0); + const successChanges = critique.suggested_changes.filter((c) => c.file === "strategies/task-patterns.md"); + expect(successChanges.length).toBeGreaterThan(0); }); test("critique format has all required fields", () => { @@ -151,12 +231,14 @@ describe("generateDeltas", () => { } }); - test("returns empty for critiques with no suggestions", () => { + test("returns only success/tool deltas for simple sessions", () => { const session = makeSession({ user_messages: ["What is 2+2?"] }); const observations = extractObservations(session); const critique = buildCritiqueFromObservations(observations, session, makeEvolvedConfig()); const deltas = generateDeltas(critique, session.session_id); - expect(deltas.length).toBe(0); + // Success observation generates a delta for task-patterns + const userProfileDeltas = deltas.filter((d) => d.file === "user-profile.md"); + expect(userProfileDeltas.length).toBe(0); }); }); diff --git a/src/evolution/application.ts b/src/evolution/application.ts index 6751959..500d39b 100644 --- a/src/evolution/application.ts +++ b/src/evolution/application.ts @@ -6,9 +6,9 @@ import { createNextVersion, readVersion, writeVersion } from "./versioning.ts"; /** * Apply a validated delta to the config file. - * Returns the change record for version tracking. + * Returns the change record for version tracking, or null if the delta was a no-op (content already present). */ -export function applyDelta(delta: ConfigDelta, config: EvolutionConfig): VersionChange { +export function applyDelta(delta: ConfigDelta, config: EvolutionConfig): VersionChange | null { const filePath = join(config.paths.config_dir, delta.file); // Ensure directory exists @@ -27,9 +27,28 @@ export function applyDelta(delta: ConfigDelta, config: EvolutionConfig): Version let newContent: string; switch (delta.type) { - case "append": - newContent = currentContent ? `${currentContent}\n${delta.content}` : delta.content; + case "append": { + const existingLines = new Set( + currentContent + .split("\n") + .map((l) => l.trim().toLowerCase()) + .filter(Boolean), + ); + const uniqueLines = delta.content + .split("\n") + .filter((l) => { + const trimmed = l.trim(); + return !trimmed || !existingLines.has(trimmed.toLowerCase()); + }) + .join("\n") + .trim(); + if (!uniqueLines) { + newContent = currentContent; + } else { + newContent = currentContent ? `${currentContent}\n${uniqueLines}` : uniqueLines; + } break; + } case "replace": if (delta.target && currentContent.includes(delta.target)) { newContent = currentContent.replace(delta.target, delta.content); @@ -49,6 +68,10 @@ export function applyDelta(delta: ConfigDelta, config: EvolutionConfig): Version newContent = currentContent; } + if (newContent === currentContent) { + return null; + } + writeFileSync(filePath, newContent, "utf-8"); return { @@ -83,11 +106,24 @@ export function applyApproved( }; } - // Apply all approved deltas + // Apply all approved deltas (skip no-ops where content already exists) const appliedChanges: VersionChange[] = []; for (const result of approved) { const change = applyDelta(result.delta, config); - appliedChanges.push(change); + if (change) { + appliedChanges.push(change); + } + } + + // All approved deltas were no-ops (content already existed) + if (appliedChanges.length === 0) { + return { + applied: [], + rejected: rejected.map((r) => ({ + change: r.delta, + reasons: r.gates.filter((g) => !g.passed).map((g) => `${g.gate}: ${g.reason}`), + })), + }; } // Update version diff --git a/src/evolution/consolidation.ts b/src/evolution/consolidation.ts index 46cef4e..e53e6cb 100644 --- a/src/evolution/consolidation.ts +++ b/src/evolution/consolidation.ts @@ -34,6 +34,9 @@ export function runConsolidation(config: EvolutionConfig): ConsolidationReport { // Compress corrections.md if it has duplicates const correctionsCompressed = compressCorrections(config); + // Compress user-profile.md if it has duplicates + const profileCompressed = compressUserProfile(config); + // Prune the session log of entries that have been processed const prunedCount = pruneSessionLog(config, sessionLog.length); @@ -43,7 +46,7 @@ export function runConsolidation(config: EvolutionConfig): ConsolidationReport { return { principlesExtracted: principles.length, observationsPruned: prunedCount, - filesCompressed: filesCompressed + (correctionsCompressed ? 1 : 0), + filesCompressed: filesCompressed + (correctionsCompressed ? 1 : 0) + (profileCompressed ? 1 : 0), }; } @@ -218,6 +221,35 @@ function compressCorrections(config: EvolutionConfig): boolean { return deduplicated.length < lines.length; } +export function compressUserProfile(config: EvolutionConfig): boolean { + const profilePath = join(config.paths.config_dir, "user-profile.md"); + + let content: string; + try { + content = readFileSync(profilePath, "utf-8"); + } catch { + return false; + } + + const lines = content.split("\n"); + if (lines.length <= config.gates.max_file_lines) return false; + + const seen = new Set(); + const deduplicated = lines.filter((line) => { + const trimmed = line.trim().toLowerCase(); + if (trimmed === "" || trimmed.startsWith("#")) return true; + if (seen.has(trimmed)) return false; + seen.add(trimmed); + return true; + }); + + if (deduplicated.length < lines.length) { + writeFileSync(profilePath, deduplicated.join("\n"), "utf-8"); + return true; + } + return false; +} + function pruneSessionLog(config: EvolutionConfig, processedCount: number): number { const logPath = config.paths.session_log; @@ -236,7 +268,15 @@ function pruneSessionLog(config: EvolutionConfig, processedCount: number): numbe function compressOversizedFiles(config: EvolutionConfig): number { const maxLines = config.gates.max_file_lines; - const filesToCheck = ["user-profile.md", "domain-knowledge.md", "memory/corrections.md", "memory/principles.md"]; + const filesToCheck = [ + "user-profile.md", + "domain-knowledge.md", + "memory/corrections.md", + "memory/principles.md", + "strategies/task-patterns.md", + "strategies/tool-preferences.md", + "strategies/error-recovery.md", + ]; let compressed = 0; for (const file of filesToCheck) { diff --git a/src/evolution/engine.ts b/src/evolution/engine.ts index c32b4df..22bc5a0 100644 --- a/src/evolution/engine.ts +++ b/src/evolution/engine.ts @@ -1,5 +1,5 @@ -import { readFileSync, writeFileSync } from "node:fs"; -import { join } from "node:path"; +import { cpSync, mkdirSync, readFileSync, readdirSync, rmSync, writeFileSync } from "node:fs"; +import { dirname, join } from "node:path"; import { applyApproved } from "./application.ts"; import { type EvolutionConfig, loadEvolutionConfig } from "./config.ts"; import { recordObservations, runConsolidation } from "./consolidation.ts"; @@ -48,7 +48,12 @@ export class EvolutionEngine { const setting = this.config.judges?.enabled ?? "auto"; if (setting === "never") return false; if (setting === "always") return true; - return !!process.env.ANTHROPIC_API_KEY; + return !!( + process.env.JUDGE_API_KEY || + process.env.ANTHROPIC_API_KEY || + process.env.ANTHROPIC_AUTH_TOKEN || + process.env.CLAUDE_CODE_OAUTH_TOKEN + ); } usesLLMJudges(): boolean { @@ -138,6 +143,8 @@ export class EvolutionEngine { `[evolution] Applied ${applied.length} changes (v${this.getCurrentVersion()}) in ${Date.now() - startTime}ms`, ); + this.backupConfig(); + // Promote successful corrections to golden suite if (session.outcome === "success" && hadCorrections) { for (const change of applied) { @@ -295,6 +302,26 @@ export class EvolutionEngine { } } + private backupConfig(): void { + const backupDir = join(dirname(this.config.paths.config_dir), "data", "config-backups"); + const version = this.getCurrentVersion(); + const dest = join(backupDir, `v${version}`); + try { + mkdirSync(backupDir, { recursive: true }); + cpSync(this.config.paths.config_dir, dest, { recursive: true }); + // Retain only the last 5 backups + const entries = readdirSync(backupDir) + .filter((e) => e.startsWith("v")) + .sort((a, b) => Number.parseInt(a.slice(1), 10) - Number.parseInt(b.slice(1), 10)); + for (const old of entries.slice(0, -5)) { + rmSync(join(backupDir, old), { recursive: true, force: true }); + } + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.warn(`[evolution] Config backup failed: ${msg}`); + } + } + private recordJudgeCosts(costs: JudgeCosts): void { const metricsPath = this.config.paths.metrics_file; try { diff --git a/src/evolution/judges/client.ts b/src/evolution/judges/client.ts index 6254e99..8800e21 100644 --- a/src/evolution/judges/client.ts +++ b/src/evolution/judges/client.ts @@ -14,7 +14,15 @@ let _client: Anthropic | null = null; function getClient(): Anthropic { if (!_client) { - _client = new Anthropic(); + // Prefer dedicated JUDGE_API_KEY for cost isolation and independent rate limits. + // Falls back to ANTHROPIC_API_KEY, then OAuth tokens. + const judgeKey = process.env.JUDGE_API_KEY || process.env.ANTHROPIC_API_KEY; + if (judgeKey) { + _client = new Anthropic({ apiKey: judgeKey }); + } else { + const authToken = process.env.ANTHROPIC_AUTH_TOKEN || process.env.CLAUDE_CODE_OAUTH_TOKEN || undefined; + _client = authToken ? new Anthropic({ authToken }) : new Anthropic(); + } } return _client; } @@ -25,7 +33,12 @@ export function setClient(client: Anthropic | null): void { } export function isJudgeAvailable(): boolean { - return !!process.env.ANTHROPIC_API_KEY; + return !!( + process.env.JUDGE_API_KEY || + process.env.ANTHROPIC_API_KEY || + process.env.ANTHROPIC_AUTH_TOKEN || + process.env.CLAUDE_CODE_OAUTH_TOKEN + ); } /** diff --git a/src/evolution/judges/observation-judge.ts b/src/evolution/judges/observation-judge.ts index 6ad3dbd..65dd153 100644 --- a/src/evolution/judges/observation-judge.ts +++ b/src/evolution/judges/observation-judge.ts @@ -36,6 +36,7 @@ export function toSessionObservations(result: ObservationExtractionResultType): context: obs.detail, confidence: obs.importance, source_messages: [obs.evidence], + affected_files: obs.affected_config_files.length > 0 ? obs.affected_config_files : undefined, })); } diff --git a/src/evolution/reflection.ts b/src/evolution/reflection.ts index bd0d1b8..fba89d0 100644 --- a/src/evolution/reflection.ts +++ b/src/evolution/reflection.ts @@ -168,6 +168,52 @@ export function buildCritiqueFromObservations( }); } + // Domain facts become domain-knowledge changes + const domainFacts = observations.filter((o) => o.type === "domain_fact"); + for (const fact of domainFacts) { + suggestedChanges.push({ + file: sanitizeConfigFile(fact.affected_files?.[0]) ?? "domain-knowledge.md", + type: "append", + content: `- ${fact.content.slice(0, 200)}`, + rationale: `Domain fact from session ${session.session_id}: "${fact.content.slice(0, 100)}"`, + tier: "free", + }); + } + + // Errors become error-recovery strategy changes + for (const error of errors) { + suggestedChanges.push({ + file: sanitizeConfigFile(error.affected_files?.[0]) ?? "strategies/error-recovery.md", + type: "append", + content: `- ${error.content.slice(0, 200)}`, + rationale: `Error pattern from session ${session.session_id}: "${error.content.slice(0, 100)}"`, + tier: "free", + }); + } + + // Tool patterns become tool-preferences strategy changes + const toolPatterns = observations.filter((o) => o.type === "tool_pattern"); + for (const pattern of toolPatterns) { + suggestedChanges.push({ + file: sanitizeConfigFile(pattern.affected_files?.[0]) ?? "strategies/tool-preferences.md", + type: "append", + content: `- ${pattern.content.slice(0, 200)}`, + rationale: `Tool pattern from session ${session.session_id}: "${pattern.content.slice(0, 100)}"`, + tier: "free", + }); + } + + // Successes become task-patterns strategy changes + for (const success of successes) { + suggestedChanges.push({ + file: sanitizeConfigFile(success.affected_files?.[0]) ?? "strategies/task-patterns.md", + type: "append", + content: `- ${success.content.slice(0, 200)}`, + rationale: `Success pattern from session ${session.session_id}: "${success.content.slice(0, 100)}"`, + tier: "free", + }); + } + return { overall_assessment: session.outcome === "success" ? "Session completed successfully." : "Session had issues.", what_worked: successes.map((s) => s.content), @@ -285,6 +331,23 @@ export function getCritiqueJsonSchema(): Record { }; } +const ALLOWED_CONFIG_FILES = new Set([ + "persona.md", + "user-profile.md", + "domain-knowledge.md", + "strategies/task-patterns.md", + "strategies/tool-preferences.md", + "strategies/error-recovery.md", +]); + +/** Validate affected_files values to prevent path traversal from judge output. */ +function sanitizeConfigFile(file: string | undefined): string | undefined { + if (!file) return undefined; + if (file.includes("..") || file.startsWith("/")) return undefined; + if (!ALLOWED_CONFIG_FILES.has(file)) return undefined; + return file; +} + function distillCorrection(message: string): string { // Remove common correction prefixes to get the actual content return message diff --git a/src/evolution/types.ts b/src/evolution/types.ts index d779087..c5e66a8 100644 --- a/src/evolution/types.ts +++ b/src/evolution/types.ts @@ -70,6 +70,7 @@ export type SessionObservation = { context: string; confidence: number; source_messages: string[]; + affected_files?: string[]; }; export type SessionSummary = { diff --git a/src/index.ts b/src/index.ts index a6e0066..6563d94 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,4 @@ -import { existsSync, writeFileSync } from "node:fs"; +import { existsSync, readFileSync, writeFileSync } from "node:fs"; import { join, resolve } from "node:path"; import { createInProcessToolServer } from "./agent/in-process-tools.ts"; import { AgentRuntime } from "./agent/runtime.ts"; @@ -18,9 +18,11 @@ import { loadChannelsConfig, loadConfig } from "./config/loader.ts"; import { installShutdownHandlers, onShutdown } from "./core/graceful.ts"; import { setChannelHealthProvider, + setEvolutionInfoProvider, setEvolutionVersionProvider, setMcpServerProvider, setMemoryHealthProvider, + setModelInfoProvider, setOnboardingStatusProvider, setPeerHealthProvider, setRoleInfoProvider, @@ -43,7 +45,7 @@ import { MemorySystem } from "./memory/system.ts"; import { isFirstRun, isOnboardingInProgress } from "./onboarding/detection.ts"; import { type OnboardingTarget, startOnboarding } from "./onboarding/flow.ts"; import { buildOnboardingPrompt } from "./onboarding/prompt.ts"; -import { getOnboardingStatus } from "./onboarding/state.ts"; +import { getOnboardingStatus, markOnboardingComplete } from "./onboarding/state.ts"; import { createRoleRegistry } from "./roles/registry.ts"; import type { RoleTemplate } from "./roles/types.ts"; import { Scheduler } from "./scheduler/service.ts"; @@ -94,6 +96,7 @@ async function main(): Promise { await memory.initialize(); setMemoryHealthProvider(() => memory.healthCheck()); + setModelInfoProvider(() => ({ model: config.model, model_source: config.model_source })); let evolution: EvolutionEngine | null = null; try { @@ -102,6 +105,27 @@ async function main(): Promise { const judgeMode = evolution.usesLLMJudges() ? "LLM judges" : "heuristic"; console.log(`[evolution] Engine initialized (v${currentVersion}, ${judgeMode})`); setEvolutionVersionProvider(() => evolution?.getCurrentVersion() ?? 0); + setEvolutionInfoProvider(() => { + if (!evolution) return { generation: 0, session_count: 0, sessions_since_consolidation: 0, session_log_depth: 0 }; + const metrics = evolution.getMetrics(); + const evoConfig = evolution.getEvolutionConfig(); + let sessionLogDepth = 0; + try { + const logPath = evoConfig.paths.session_log; + if (existsSync(logPath)) { + const content = readFileSync(logPath, "utf-8").trim(); + sessionLogDepth = content.length > 0 ? content.split("\n").length : 0; + } + } catch { + /* non-critical */ + } + return { + generation: evolution.getCurrentVersion(), + session_count: metrics.session_count, + sessions_since_consolidation: metrics.sessions_since_consolidation, + session_log_depth: sessionLogDepth, + }; + }); } catch (err: unknown) { const msg = err instanceof Error ? err.message : String(err); console.warn(`[evolution] Failed to initialize: ${msg}. Running without self-evolution.`); @@ -499,10 +523,10 @@ async function main(): Promise { if (judgeCost) { evolution?.trackExternalJudgeCost(judgeCost); } - if (result.episodesCreated > 0 || result.factsExtracted > 0) { + if (result.episodesCreated > 0 || result.factsExtracted > 0 || result.proceduresDetected > 0) { console.log( `[memory] Consolidated (LLM): ${result.episodesCreated} episodes, ` + - `${result.factsExtracted} facts (${result.durationMs}ms)`, + `${result.factsExtracted} facts, ${result.proceduresDetected} procedures (${result.durationMs}ms)`, ); } }) @@ -552,6 +576,12 @@ async function main(): Promise { runtime.setEvolvedConfig(updatedConfig); } } + // >= 2 means at least two evolution cycles with applied changes, not just two sessions + if (evolution && evolution.getCurrentVersion() >= 2 && isOnboardingInProgress(db)) { + markOnboardingComplete(db); + runtime.setOnboardingPrompt(null); + console.log("[onboarding] Completed after evolution version >= 2"); + } }) .catch((err: unknown) => { const errMsg = err instanceof Error ? err.message : String(err); diff --git a/src/mcp/__tests__/dynamic-tools.test.ts b/src/mcp/__tests__/dynamic-tools.test.ts index 0e26bee..76a98fa 100644 --- a/src/mcp/__tests__/dynamic-tools.test.ts +++ b/src/mcp/__tests__/dynamic-tools.test.ts @@ -269,6 +269,7 @@ describe("Dynamic Tools via MCP Protocol", () => { port: 3100, role: "swe", model: "claude-opus-4-6", + model_source: "config" as const, effort: "max" as const, max_budget_usd: 0, timeout_minutes: 240, diff --git a/src/mcp/__tests__/scope-enforcement.test.ts b/src/mcp/__tests__/scope-enforcement.test.ts index ed67179..f40505f 100644 --- a/src/mcp/__tests__/scope-enforcement.test.ts +++ b/src/mcp/__tests__/scope-enforcement.test.ts @@ -134,6 +134,7 @@ describe("MCP scope enforcement", () => { port: 3100, role: "swe", model: "claude-opus-4-6", + model_source: "config" as const, effort: "max" as const, max_budget_usd: 0, timeout_minutes: 240, diff --git a/src/mcp/__tests__/server.test.ts b/src/mcp/__tests__/server.test.ts index f7f7d13..489c712 100644 --- a/src/mcp/__tests__/server.test.ts +++ b/src/mcp/__tests__/server.test.ts @@ -119,6 +119,7 @@ describe("PhantomMcpServer", () => { port: 3100, role: "swe", model: "claude-opus-4-6", + model_source: "config" as const, effort: "max" as const, max_budget_usd: 0, timeout_minutes: 240, @@ -334,6 +335,7 @@ describe("PhantomMcpServer", () => { port: 3100, role: "swe", model: "claude-opus-4-6", + model_source: "config" as const, effort: "max" as const, max_budget_usd: 0, timeout_minutes: 240, diff --git a/src/mcp/__tests__/tools-swe.test.ts b/src/mcp/__tests__/tools-swe.test.ts index 8d03f7d..96d292f 100644 --- a/src/mcp/__tests__/tools-swe.test.ts +++ b/src/mcp/__tests__/tools-swe.test.ts @@ -134,6 +134,7 @@ describe("SWE MCP Tools", () => { port: 3100, role: "swe", model: "claude-opus-4-6", + model_source: "config" as const, effort: "max" as const, max_budget_usd: 0, timeout_minutes: 240, diff --git a/src/memory/__tests__/consolidation.test.ts b/src/memory/__tests__/consolidation.test.ts index ed088ef..010ed39 100644 --- a/src/memory/__tests__/consolidation.test.ts +++ b/src/memory/__tests__/consolidation.test.ts @@ -1,5 +1,5 @@ import { describe, expect, mock, test } from "bun:test"; -import { type SessionData, consolidateSession } from "../consolidation.ts"; +import { type SessionData, consolidateSession, consolidateSessionWithLLM } from "../consolidation.ts"; import type { MemorySystem } from "../system.ts"; function makeTestSessionData(overrides?: Partial): SessionData { @@ -23,9 +23,11 @@ function createMockMemory(): { memory: MemorySystem; storedEpisodes: Array>; storedFacts: Array>; + storedProcedures: Array>; } { const storedEpisodes: Array> = []; const storedFacts: Array> = []; + const storedProcedures: Array> = []; const memory = { storeEpisode: mock((episode: Record) => { @@ -36,9 +38,13 @@ function createMockMemory(): { storedFacts.push(fact); return Promise.resolve(fact.id as string); }), + storeProcedure: mock((procedure: Record) => { + storedProcedures.push(procedure); + return Promise.resolve(procedure.id as string); + }), } as unknown as MemorySystem; - return { memory, storedEpisodes, storedFacts }; + return { memory, storedEpisodes, storedFacts, storedProcedures }; } describe("consolidateSession", () => { @@ -157,3 +163,109 @@ describe("consolidateSession", () => { expect(typeof result.durationMs).toBe("number"); }); }); + +// Mock the consolidation judge for LLM path tests +mock.module("../../evolution/judges/consolidation-judge.ts", () => ({ + runConsolidationJudge: mock(), +})); + +import { runConsolidationJudge } from "../../evolution/judges/consolidation-judge.ts"; +const mockedRunConsolidationJudge = runConsolidationJudge as ReturnType; + +function makeJudgeResult(overrides?: { + detected_procedures?: Array<{ + name: string; + description: string; + trigger: string; + steps: string[]; + confidence: number; + evidence: string; + }>; + extracted_facts?: Array>; +}) { + return { + data: { + reasoning: "test reasoning", + extracted_facts: overrides?.extracted_facts ?? [], + detected_procedures: overrides?.detected_procedures ?? [], + episode_importance: 0.5, + episode_importance_reasoning: "test", + contradiction_alerts: [], + key_takeaways: ["test"], + }, + costUsd: 0.01, + inputTokens: 100, + outputTokens: 50, + }; +} + +describe("consolidateSessionWithLLM - procedure storage", () => { + test("stores detected procedures from judge output", async () => { + const { memory, storedProcedures } = createMockMemory(); + mockedRunConsolidationJudge.mockResolvedValueOnce( + makeJudgeResult({ + detected_procedures: [ + { + name: "deploy-staging", + description: "Deploy to staging environment", + trigger: "User asks to deploy staging", + steps: ["Run tests", "Build artifacts", "Deploy"], + confidence: 0.8, + evidence: "User deployed staging", + }, + ], + }), + ); + + const { result } = await consolidateSessionWithLLM(memory, makeTestSessionData(), ""); + + expect(result.proceduresDetected).toBe(1); + expect(storedProcedures.length).toBe(1); + expect(storedProcedures[0].name).toBe("deploy-staging"); + expect(storedProcedures[0].description).toBe("Deploy to staging environment"); + expect(storedProcedures[0].trigger).toBe("User asks to deploy staging"); + expect(storedProcedures[0].confidence).toBe(0.8); + + const steps = storedProcedures[0].steps as Array<{ order: number; action: string }>; + expect(steps.length).toBe(3); + expect(steps[0].order).toBe(1); + expect(steps[0].action).toBe("Run tests"); + expect(steps[2].order).toBe(3); + expect(steps[2].action).toBe("Deploy"); + }); + + test("seeds success_count from session outcome", async () => { + const { memory: memSuccess, storedProcedures: procsSuccess } = createMockMemory(); + const { memory: memFailure, storedProcedures: procsFailure } = createMockMemory(); + + const proc = { + name: "test-proc", + description: "test", + trigger: "test", + steps: ["step1"], + confidence: 0.7, + evidence: "test", + }; + + mockedRunConsolidationJudge.mockResolvedValueOnce(makeJudgeResult({ detected_procedures: [proc] })); + await consolidateSessionWithLLM(memSuccess, makeTestSessionData({ outcome: "success" }), ""); + + mockedRunConsolidationJudge.mockResolvedValueOnce(makeJudgeResult({ detected_procedures: [proc] })); + await consolidateSessionWithLLM(memFailure, makeTestSessionData({ outcome: "failure" }), ""); + + expect(procsSuccess[0].success_count).toBe(1); + expect(procsSuccess[0].failure_count).toBe(0); + expect(procsFailure[0].success_count).toBe(0); + expect(procsFailure[0].failure_count).toBe(1); + }); + + test("handles empty procedures array gracefully", async () => { + const { memory, storedProcedures } = createMockMemory(); + mockedRunConsolidationJudge.mockResolvedValueOnce(makeJudgeResult()); + + const { result } = await consolidateSessionWithLLM(memory, makeTestSessionData(), ""); + + expect(result.proceduresDetected).toBe(0); + expect(storedProcedures.length).toBe(0); + }); +}); diff --git a/src/memory/consolidation.ts b/src/memory/consolidation.ts index 35868d7..eef4c6a 100644 --- a/src/memory/consolidation.ts +++ b/src/memory/consolidation.ts @@ -48,6 +48,33 @@ export async function consolidateSessionWithLLM( factsExtracted++; } + // Store detected procedures + for (const procedure of judgeResult.data.detected_procedures) { + await memory.storeProcedure({ + id: crypto.randomUUID(), + name: procedure.name, + description: procedure.description, + trigger: procedure.trigger, + steps: procedure.steps.map((step, i) => ({ + order: i + 1, + action: step, + tool: null, + expected_outcome: "", + error_handling: null, + decision_point: false, + })), + preconditions: [], + postconditions: [], + parameters: {}, + source_episode_ids: [episode.id], + success_count: session.outcome === "success" ? 1 : 0, + failure_count: session.outcome === "failure" ? 1 : 0, + last_used_at: now, + confidence: procedure.confidence, + version: 1, + }); + } + return { result: { episodesCreated: 1,