diff --git a/.gitignore b/.gitignore index 22860799..535aad4d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,10 @@ /ds4_native /ds4_server_test /ds4_test +/tests/ds4_agent_context_test +/tests/ds4_agent_git_test +/tests/ds4_kv_cache_benefit_test +/tests/generated/ /ds4flash.gguf /TODO.md /gguf/ diff --git a/Makefile b/Makefile index 27283ba0..9acffe3b 100644 --- a/Makefile +++ b/Makefile @@ -33,7 +33,7 @@ CPU_CORE_OBJS = ds4_cpu.o METAL_LDLIBS := $(LDLIBS) endif -.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression +.PHONY: all help clean test test-agent-context-loop test-agent-context-compact-canary test-agent-context-self-improvement test-kv-cache-benefit cpu cuda cuda-spark cuda-generic cuda-regression ifeq ($(UNAME_S),Darwin) all: ds4 ds4-server ds4-bench ds4-eval ds4-agent @@ -43,6 +43,14 @@ help: @echo " make Build Metal ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent" @echo " make cpu Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent" @echo " make test Build and run tests" + @echo " make test-agent-context-loop" + @echo " Run slow DS4-generated agent context loop e2e" + @echo " make test-agent-context-compact-canary" + @echo " Run slow DS4-generated compaction canary e2e" + @echo " make test-agent-context-self-improvement" + @echo " Run slow DS4-generated KV self-improvement e2e" + @echo " make test-kv-cache-benefit" + @echo " Run optional KV restore benefit benchmark" @echo " make clean Remove build outputs" ds4: ds4_cli.o linenoise.o $(CORE_OBJS) @@ -57,15 +65,15 @@ ds4-bench: ds4_bench.o $(CORE_OBJS) ds4-eval: ds4_eval.o $(CORE_OBJS) $(CC) $(CFLAGS) -o $@ ds4_eval.o $(CORE_OBJS) $(METAL_LDLIBS) -ds4-agent: ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) - $(CC) $(CFLAGS) -o $@ ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS) +ds4-agent: ds4_agent.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) + $(CC) $(CFLAGS) -o $@ 
ds4_agent.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS) -cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS) +cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS) $(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_kvstore.o rax.o $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-eval ds4_eval_cpu.o $(CPU_CORE_OBJS) $(LDLIBS) - $(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) + $(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) cuda-regression: @echo "cuda-regression requires a CUDA build" @@ -79,6 +87,14 @@ help: @echo " make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value" @echo " make cpu Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent" @echo " make test Build and run tests" + @echo " make test-agent-context-loop" + @echo " Run slow DS4-generated agent context loop e2e" + @echo " make test-agent-context-compact-canary" + @echo " Run slow DS4-generated compaction canary e2e" + @echo " make test-agent-context-self-improvement" + @echo " Run slow DS4-generated KV self-improvement e2e" + @echo " make test-kv-cache-benefit" + @echo " Run optional KV restore benefit benchmark" @echo " make clean Remove build outputs" cuda-spark: @@ -107,15 +123,15 @@ ds4-bench: ds4_bench.o $(CORE_OBJS) ds4-eval: ds4_eval.o $(CORE_OBJS) $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) -ds4-agent: ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) +ds4-agent: ds4_agent.o ds4_agent_context.o ds4_web.o ds4_kvstore.o 
linenoise.o $(CORE_OBJS) $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) -cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS) +cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS) $(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_kvstore.o rax.o $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-eval ds4_eval_cpu.o $(CPU_CORE_OBJS) $(LDLIBS) - $(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) + $(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) cuda-regression: tests/cuda_long_context_smoke ./tests/cuda_long_context_smoke @@ -136,9 +152,12 @@ ds4_bench.o: ds4_bench.c ds4.h ds4_eval.o: ds4_eval.c ds4.h $(CC) $(CFLAGS) -c -o $@ ds4_eval.c -ds4_agent.o: ds4_agent.c ds4.h ds4_kvstore.h ds4_web.h linenoise.h +ds4_agent.o: ds4_agent.c ds4.h ds4_agent_context.h ds4_kvstore.h ds4_web.h linenoise.h $(CC) $(CFLAGS) -c -o $@ ds4_agent.c +ds4_agent_context.o: ds4_agent_context.c ds4_agent_context.h + $(CC) $(CFLAGS) -c -o $@ ds4_agent_context.c + ds4_web.o: ds4_web.c ds4_web.h $(CC) $(CFLAGS) -c -o $@ ds4_web.c @@ -151,6 +170,12 @@ ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_kvstore.h rax.h tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h $(CC) $(CFLAGS) -I. -c -o $@ tests/cuda_long_context_smoke.c +tests/ds4_agent_context_test.o: tests/ds4_agent_context_test.c ds4_agent_context.h + $(CC) $(CFLAGS) -I. -c -o $@ tests/ds4_agent_context_test.c + +tests/ds4_kv_cache_benefit_test.o: tests/ds4_kv_cache_benefit_test.c ds4.h + $(CC) $(CFLAGS) -I. 
-c -o $@ tests/ds4_kv_cache_benefit_test.c + rax.o: rax.c rax.h rax_malloc.h $(CC) $(CFLAGS) -c -o $@ rax.c @@ -172,7 +197,7 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h ds4_eval_cpu.o: ds4_eval.c ds4.h $(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_eval.c -ds4_agent_cpu.o: ds4_agent.c ds4.h ds4_kvstore.h ds4_web.h linenoise.h +ds4_agent_cpu.o: ds4_agent.c ds4.h ds4_agent_context.h ds4_kvstore.h ds4_web.h linenoise.h $(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_agent.c ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS) @@ -184,6 +209,16 @@ ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) +tests/ds4_agent_context_test: tests/ds4_agent_context_test.o ds4_agent_context.o + $(CC) $(CFLAGS) -o $@ tests/ds4_agent_context_test.o ds4_agent_context.o $(LDLIBS) + +tests/ds4_kv_cache_benefit_test: tests/ds4_kv_cache_benefit_test.o $(CORE_OBJS) +ifeq ($(UNAME_S),Darwin) + $(CC) $(CFLAGS) -o $@ tests/ds4_kv_cache_benefit_test.o $(CORE_OBJS) $(METAL_LDLIBS) +else + $(NVCC) $(NVCCFLAGS) -o $@ tests/ds4_kv_cache_benefit_test.o $(CORE_OBJS) $(CUDA_LDLIBS) +endif + ds4_test: ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS) ifeq ($(UNAME_S),Darwin) $(CC) $(CFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS) @@ -191,9 +226,22 @@ else $(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS) endif -test: ds4_test ds4-eval +test: ds4_test ds4-eval tests/ds4_agent_context_test ./ds4-eval --self-test-extractors + ./tests/ds4_agent_context_test ./ds4_test +test-agent-context-loop: ds4-agent tests/ds4_agent_context_test + sh tests/ds4_agent_context_loop_e2e.sh + +test-agent-context-compact-canary: ds4-agent + sh tests/ds4_agent_context_compact_canary_e2e.sh + +test-agent-context-self-improvement: ds4-agent + sh tests/ds4_agent_context_self_improvement_e2e.sh + +test-kv-cache-benefit: tests/ds4_kv_cache_benefit_test + 
./tests/ds4_kv_cache_benefit_test + clean: - rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o + rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o tests/ds4_agent_context_test tests/ds4_agent_context_test.o tests/ds4_kv_cache_benefit_test tests/ds4_kv_cache_benefit_test.o diff --git a/docs/agent-kv-context-fixes.md b/docs/agent-kv-context-fixes.md new file mode 100644 index 00000000..a6095324 --- /dev/null +++ b/docs/agent-kv-context-fixes.md @@ -0,0 +1,291 @@ +# Agent KV Context Hardening Fixes + +This branch isolates a small set of hardening fixes for the agent context and +KV restore path. The goal is not to add more agent features here. The goal is +to make the existing context checkpoint, restore, and compaction behavior fail +more safely, report more honestly, and be easier for the model to call +correctly. + +The fixes were driven by tests. In particular, the compaction canary e2e exposed +that a malformed compaction summary could be accepted as if it were useful +state. The runtime now rejects that case instead of rebuilding the live context +from bad memory. + +## 1. Compaction Rejects Broken Summaries + +This does not mean DS4 stops compacting. + +It means: + +- DS4 tries to compact normally. +- To compact, it asks the model for a useful summary of the current task state. +- If the summary is useful, DS4 uses it. +- If the summary is almost empty or clearly broken, DS4 rejects it. + +Before: + +```text +broken summary -> DS4 uses it anyway -> possible corrupted task memory +``` + +Now: + +```text +broken summary -> DS4 aborts compaction -> corrupted memory is not used +``` + +How it is implemented: + +- `agent_compact_summary_has_signal()` checks that the generated summary has a + minimal amount of real text, not just a few tag-like words. 
+- The internal compaction prompt asks for plain text headings or bullets and + explicitly rejects XML/HTML-like tool markup. +- If the check fails, compaction returns an error and invalidates the live KV + session before any rebuilt context is accepted. + +Verification: + +- `make test-agent-context-compact-canary` +- The test requires DS4 to compact, then write five canary facts only after + compaction. +- The harness checks that compaction really happened, reduced token count, kept + the recent tail late enough, and preserved all canary values. + +## 2. KV Restore Does Not Leave Half-Restored State + +This does not make KV restore more limited. + +It means: + +- DS4 tries to load a saved KV cache. +- Then it checks that the loaded cache matches the context metadata. +- If the cache and metadata match, restore succeeds. +- If they do not match, DS4 avoids leaving the live session in an ambiguous + half-restored state. + +Before: + +```text +partial restore -> mismatch detected later -> possible session/transcript drift +``` + +Now: + +```text +invalid restore -> live KV is invalidated or resynced -> no half-restored state +``` + +How it is implemented: + +- The restore path validates loaded token counts against checkpoint metadata. +- If validation fails after a KV load, the live session is invalidated so the + next operation cannot accidentally continue from the bad KV state. + +Verification: + +- `make test` +- Context unit tests cover checkpoint metadata loading and incompatible restore + handling. + +## 3. Restore Metrics Are Explicit About Expected Versus Actual Savings + +This does not change how the cache itself works. + +It means: + +- DS4 can estimate how many prefill tokens should be avoided by restoring KV. +- After restore, DS4 can also observe what actually happened during sync. +- If expected and actual behavior differ, the model-visible notice should not + hide that difference. 
+ +Before: + +```text +expected savings -> shown as if they definitely happened +``` + +Now: + +```text +expected/actual savings -> reported more clearly -> less misleading feedback +``` + +How it is implemented: + +- Restore bookkeeping tracks expected saved prefill tokens separately from + actual cached-token observations after sync. +- The restore notice and trace avoid presenting estimates as stronger proof + than they are. + +Verification: + +- `make test` +- `make test-kv-cache-benefit` +- The benchmark compares a full prefill against a restored-prefix run and + reports `full_prefill_tokens`, `restored_prefill_tokens`, and + `saved_prefill_tokens`. + +## 4. Tool Schemas List The Allowed Actions + +This does not reduce the tool's capability. + +It means: + +- The context tool accepts a fixed set of action names. +- The schema now tells the model exactly which actions are valid. +- The model has less room to invent plausible but wrong action names. + +Before: + +```text +action = any string -> model may invent an invalid action -> runtime error +``` + +Now: + +```text +action = one of the allowed names -> fewer avoidable tool-call errors +``` + +How it is implemented: + +- The context tool schema uses a JSON `enum` for action values such as + `status`, `checkpoint`, `list`, `restore`, `compact`, and `drop`. + +Verification: + +- `make test` +- The schema is model-visible and the dispatch path still rejects unknown + actions at runtime. + +## 5. Metadata Parsing Is Key-Aware + +This does not change the checkpoint metadata format. + +It means: + +- DS4 still reads the same JSON metadata files. +- It no longer finds fields by blindly searching for a word anywhere in the + file. +- It distinguishes a real key from the same text appearing inside a value. 
+ +Before: + +```text +search raw text -> possible confusion between key and value +``` + +Now: + +```text +read the actual key -> metadata is interpreted more reliably +``` + +How it is implemented: + +- The metadata reader now scans for JSON object keys instead of using a plain + substring search. +- Tests cover pathological values that contain text resembling other keys. + +Verification: + +- `make test` +- The context unit test covers metadata roundtrip and key-aware lookup. + +## 6. Benchmark Build Artifact Is Ignored + +This is only repository hygiene. + +It means: + +- `make test-kv-cache-benefit` may build + `tests/ds4_kv_cache_benefit_test`. +- That generated binary should not make `git status` look dirty after the test. + +Before: + +```text +run benchmark -> generated binary appears as untracked file +``` + +Now: + +```text +run benchmark -> generated binary is ignored -> working tree remains clean +``` + +How it is implemented: + +- `.gitignore` includes `/tests/ds4_kv_cache_benefit_test`. + +Verification: + +- `make test-kv-cache-benefit` +- `git status --short` + +## 7. Adaptive Self-Improvement E2E Scope + +This test demonstrates the agent loop, not a real DS4 code optimization. + +It means: + +- DS4 is given a temporary repository with a small failing Python project. +- DS4 must inspect repository state, fix the bug, run the tests, inspect the + diff, checkpoint the context, restore it, and prove the tests still pass. +- If the native Git tool is available, the prompt asks DS4 to use it for + `status` and `diff`. +- If the native Git tool is not available, the same test falls back to the + existing `bash` path with `git status --short` and `git diff`. + +Before: + +```text +context tools work in isolated calls -> less proof of agent-level usefulness +``` + +Now: + +```text +agent fixes a controlled project -> checkpoints -> restores -> verifies state +``` + +The limitation is intentional. 
This test does not claim that DS4 found and +optimized DS4's own C code. A stronger follow-up test should run against DS4 +itself: ask the agent to inspect the repository, choose one small measurable +optimization, implement it, run the relevant benchmark or e2e check, inspect +the diff, checkpoint, restore, and record whether the metric improved. + +That DS4-on-DS4 loop is the ideal product demonstration, but it is a slower and +less deterministic test than this PR should require by default. The controlled +temporary repository keeps this PR's regression signal clear while preserving a +direct path to the stronger self-optimization loop. + +Verification: + +- `make test-agent-context-self-improvement` +- The generated ledger records `git_status_mode`, `git_diff_mode`, + `context_checkpoint_before`, `context_checkpoint_after`, + `context_restore_used`, `tests_before_restore`, and `tests_after_restore`. + +## Test Plan + +Run: + +```sh +make test +make test-agent-context-compact-canary +make test-kv-cache-benefit +make test-agent-context-self-improvement +git status --short +``` + +Expected result: + +- default C tests pass, +- compaction canary e2e passes, +- context self-improvement e2e passes; it uses native Git tooling when that + tool is present, and falls back to `bash`-run `git status` / `git diff` when + this branch is tested without the Git-tool PR, +- KV benefit benchmark reports a large `saved_prefill_tokens` value, +- `git status --short` shows only intentional source changes before commit, and + is clean after commit. diff --git a/docs/agent-kv-context-tools.md b/docs/agent-kv-context-tools.md new file mode 100644 index 00000000..d31a75cb --- /dev/null +++ b/docs/agent-kv-context-tools.md @@ -0,0 +1,894 @@ +# Agent KV Context Tools: Analysis And Implementation Plan + +## Goal + +Give `ds4-agent` a native tool for controlling its own context state without +exposing raw KV internals to the model. 
+ +The important distinction is that the agent should not read or write arbitrary +KV bytes. The useful feature is semantic control over checkpoints, restore +points, and context compaction. The tool should operate on transcript and +session checkpoints together, preserving the invariant that the visible +conversation and live `ds4_session` state describe the same timeline. + +## Existing System Constraints + +`ds4_session` is one mutable inference timeline. It owns the live KV cache and +logits, while callers provide full token prefixes to `ds4_session_sync()` so the +session can reuse, extend, or rebuild graph state. + +`ds4-agent` already has user-facing slash commands for related operations: + +- `/save` persists the current session under `~/.ds4/kvcache`. +- `/switch` loads a saved session and restores transcript plus KV payload. +- `/compact` asks the model for a durable summary and rebuilds the transcript. +- `/new` resets to the system/tool prompt. +- `/del` deletes a saved session. +- `/strip` removes a persisted KV payload while preserving rendered text. + +Those commands are controlled by the user. The proposed feature gives the model +a narrower tool-level API so it can manage expensive context deliberately during +long autonomous work. + +The relevant invariants are: + +- A checkpoint is valid only if `ds4_session_tokens(w->session)` matches + `w->transcript`. +- Restore must replace transcript and KV state as one operation. +- A context restore does not revert filesystem, process, network, or browser + side effects. +- Active bash jobs are external state and must either block restore or be + explicitly surfaced in the restored transcript. +- Compaction can temporarily put private compaction prompts into live KV; any + failed compaction must invalidate live session state before continuing. +- For server/API usage, exact DSML replay must remain byte-for-byte compatible + with the rendered history. 
For `ds4-agent`, sampled DSML is already preserved + directly in the transcript, but the same principle applies: never rewrite a + tool-call turn into a semantically similar but token-different form. + +## Proposed Tool Surface + +Use one DSML tool named `context` with an `action` parameter instead of many +separate tool names. This keeps the system prompt smaller and makes future +actions easier to add without teaching the model a large new catalog. + +```json +{ + "type": "function", + "function": { + "name": "context", + "description": "Inspect, checkpoint, restore, or compact the agent context.", + "parameters": { + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": ["status", "checkpoint", "list", "restore", "compact", "drop"] + }, + "id": {"type": "string"}, + "label": {"type": "string"}, + "reason": {"type": "string"}, + "allow_side_effect_mismatch": {"type": "boolean"}, + "dry_run": {"type": "boolean"} + }, + "required": ["action"] + } + } +} +``` + +Initial actions: + +- `status`: report transcript length, session position, context size, free + tokens, dirty session state, side-effect epoch, active bash jobs, and known + checkpoints. +- `checkpoint`: save a named restore point at the current stable transcript. +- `list`: list known checkpoints. +- `restore`: restore a checkpoint if side-effect rules allow it. +- `compact`: request the existing compaction path with an explicit reason. +- `drop`: delete checkpoint metadata and its associated payload when safe. + In the first implementation, "safe" means the checkpoint id resolves + unambiguously, paths remain inside the context directory, and no bash job is + running. The tool does not understand semantic roles such as "best baseline"; + callers should use `dry_run=true` before deleting important checkpoints. + +Phase 1 should be disk-backed and reuse the existing agent KV save/load path. 
+That avoids holding multiple huge KV payloads in RAM and keeps the first +experiment close to the existing `/save` and `/switch` implementation. + +## Concrete Use Cases + +1. Deep codebase exploration checkpoint. + +An agent reads architecture files, traces call graphs, and builds a high-value +mental model. Before trying an implementation, it calls +`context action=checkpoint label="repo-map-before-fix"`. If the first +implementation path fails, restore avoids re-prefilling the whole exploration +history. + +2. Alternative patch strategies. + +Before changing a shared subsystem, the agent creates a checkpoint, implements +approach A, runs tests, then restores and tries approach B. This is useful when +both alternatives require long reasoning from the same inspected context. +Filesystem changes still need explicit version-control or file rollback, so +restore must warn when side effects happened after the checkpoint. + +3. Compaction quality recovery. + +The agent checkpoints before forced compaction. If the compacted summary loses +critical details, the agent can restore the pre-compaction checkpoint and retry +with a better compaction reason, smaller tool output, or a manual summary. + +4. Long web research reuse. + +The agent searches and visits several pages, creating large rendered Markdown +observations. A checkpoint lets it try different conclusions or implementation +plans without paying the same browser and prefill cost again. + +5. Risky tool-call loop guard. + +Before a sequence of generated `edit` and `bash` calls, the agent checkpoints. +If it starts following a wrong path, the user or model can restore the reasoning +state while separately deciding whether to keep, revert, or inspect filesystem +effects. + +6. Parser and prompt experiments. + +Developers working on DSML parsing, forced syntax, or tool visualization can +restart from the same prompt frontier and compare generated tool calls under +different prompt wording or sampling knobs. + +7. 
Large session navigation. + +The agent can preserve named frontiers such as "after reading tests", +"after reproducing bug", and "before final refactor". This gives long local +sessions an internal navigation model instead of relying only on `/save` and +manual `/switch`. + +8. Bounded experimental loops. + +The agent can run a disciplined optimization loop from one baseline context: +checkpoint the baseline, write an experiment ledger, propose a hypothesis, +materialize it in code, measure it, and either save the improved state or record +the failed attempt and restore to the baseline. The ledger survives restore, so +failed attempts do not need to remain in the model transcript to remain useful. +This is especially useful for prompt, parser, quality, and performance +experiments where many attempts share the same expensive codebase understanding. + +Example flow: + +```text +context checkpoint label=baseline-before-tool-parser-loop +write experiment.md with goal, metric, max_attempts, current_attempt=0 +attempt 1: record hypothesis in experiment.md, edit code, run tests +if tests improve: record success, checkpoint label=best-attempt-1 +if tests regress: record failure in experiment.md, restore baseline with reason +restore notice tells the model to reread experiment.md before attempt 2 +stop when metric passes or current_attempt reaches max_attempts +``` + +## Safety Model + +The implementation should add a monotonically increasing `world_epoch` owned by +the agent worker. Increment it for successful operations that may change +external state: + +- `write` +- `edit` +- `bash` +- `bash_stop` +- future filesystem mutation tools + +Read-only tools such as `read`, `search`, `list`, `google_search`, and +`visit_page` do not increment `world_epoch`. + +Every context checkpoint stores the current `world_epoch`. 
A restore where the +current epoch differs from the checkpoint epoch should fail by default with a +clear message: + +```text +Tool error: restore would rewind model context from world_epoch=7 to 4, but +external side effects may still exist. Revert or inspect those effects, or call +context restore with allow_side_effect_mismatch=true. +``` + +Even with `allow_side_effect_mismatch=true`, the tool result must say that only +model context was restored. It must not claim that files, commands, browser +state, or network side effects were reverted. + +On agent startup, initialize `world_epoch` from the maximum epoch found in +existing checkpoint metadata. That keeps persisted checkpoints usable after a +restart while ensuring new side effects in the current process advance beyond +the restored baseline. + +Restores should also fail while a bash job is running. A running process is a +live external dependency whose output may still arrive after the restored +transcript. + +### Restore Notice + +A model-initiated restore must not be silent. A raw restore of transcript plus +KV would move the model back to the checkpoint and erase the very reason it +decided to restore. The default tool behavior should therefore be: + +```text +load checkpoint transcript + KV +append synthetic restore notice +continue from restored transcript plus notice +``` + +The restore notice becomes the first event after the restored checkpoint. It +should be inserted as a tool result or equivalent user-visible control message +after the restored transcript has been loaded. 
It must include: + +- checkpoint id and label, +- restore reason supplied by the model or user, +- restored transcript token count, +- checkpoint `world_epoch` and current `world_epoch`, +- whether side-effect mismatch was allowed, +- a warning that files, subprocesses, browser state, network effects, and other + external state were not reverted, +- a compact summary of known post-checkpoint side effects when available, +- an explicit warning when the in-memory side-effect history has been truncated + and older post-checkpoint side effects may have been dropped. + +Example: + +```text +Context restored from checkpoint 7e1c2b1a label=after-repo-map. +Reason: approach A failed because parser regression test X still failed. +Restored model context to 18420 tokens. world_epoch restored=3 current=7. +External side effects were not reverted; inspect or revert files/processes +separately before assuming the workspace matches this checkpoint. +``` + +This means restore creates a coherent continuation, not a perfect time machine. +The agent retains the expensive pre-checkpoint context and receives a short +explanation of why the failed attempt was discarded. + +## Critical Assessment: Agent And Server + +The opportunity is real, but it is not the same feature in `ds4-agent` and +`ds4-server`. + +In `ds4-agent`, the process has one user, one live worker, one transcript, and +one obvious owner of side effects. A context tool can be powerful because it +lets the model preserve expensive frontiers, checkpoint before risky work, +recover from weak compaction, and write durable notes for later reuse. + +In `ds4-server`, the same surface becomes harder for two separate reasons. First, +API requests are stateless, may come from multiple clients, and are serialized +through one live backend session. Second, the server currently returns tool calls +to clients; it does not run native server-side tools in the way `ds4-agent` runs +`read`, `edit`, `bash`, or `context`. 
The server can reuse KV prefixes safely, +but a model-generated `restore` would be a mutation of the single global live +timeline. Without an explicit session owner, a checkpoint is just a global object +in a shared cache. + +The feature therefore has two layers: + +- Computational continuity: checkpoint/restore of transcript plus KV. +- Semantic continuity: structured memory files that record what the agent + learned. + +The first layer saves prefill. The second saves reasoning. Both are needed for +the tool to be genuinely useful. + +### Opportunities + +- Long local coding sessions can avoid repeated high-cost prefill after a repo + exploration or web research phase. +- The agent can create named frontiers before risky edits, prompt experiments, + tool loops, or compaction. +- Structured memory can preserve architecture facts, invariants, decisions, and + open questions even after compaction or restart. + +### Difficulties + +- A KV checkpoint is not semantic memory. It preserves state, but not a compact + object the model can inspect cheaply. +- Restore does not revert the world. Files, subprocesses, browser state, + network effects, and external APIs remain changed. +- The current server has no authenticated tenant, owner, or session namespace. + Adding one is a prerequisite for writeable multi-user context controls. +- Stateless clients may resend a history that disagrees with a server-side + restore. The server must prefer explicit session-control semantics over + implicit tool behavior. +- Exact DSML replay remains fragile if checkpoint movement loses the sampled + tool-call bytes or maps them to the wrong request/session. +- Future concurrent or multi-slot serving can race on checkpoint metadata and + memory files unless writes are serialized per namespace. + +## Structured Memory Storage + +Do not store structured memory inside the KV payload. 
Store it next to the +checkpoint as a separate, readable artifact: + +```text +~/.ds4/kvcache/context/ + <id>.kv + <id>.meta.json + <id>.memory.md +``` + +`<id>.kv` stores transcript plus DS4 session payload. + +`<id>.meta.json` stores machine-readable metadata: + +```json +{ + "id": "7e1c2b1a...", + "label": "repo-map-before-fix", + "created_at": 1780000000, + "world_epoch": 3, + "transcript_tokens": 18420, + "kv_path": "7e1c2b1a.kv", + "memory_path": "7e1c2b1a.memory.md", + "memory_sha1": "..." +} +``` + +`<id>.memory.md` stores model-readable semantic memory: + +```md +# Context Memory + +## Goal +## Files Inspected +## Architecture Facts +## Invariants +## Decisions +## Commands And Results +## Risks +## Open Questions +## Next Steps +``` + +This separation matters because memory can be regenerated, diffed, inspected, +loaded selectively, or retained after a KV payload is stripped. + +For the first experiment, memory files should be created by the normal file +tools or by the existing compaction-style paths. They are useful artifacts, but +they are not required for checkpoint and restore to work. + +## Experiment Ledgers + +Long autonomous improvement loops need a durable record that is not rewound by +context restore. Store that record as a Markdown ledger outside the checkpoint +payload, either as a memory artifact associated with a checkpoint or as a named +experiment file referenced by checkpoint metadata. + +The ledger should be append-oriented and machine-readable enough for the agent +to enforce its own budget: + +```md +# Experiment Loop + +## Goal +Reduce DSML tool-call failures without regressing server behavior. + +## Success Metric +- `./ds4_test --tool-call-quality` improves or stays stable +- `./ds4_test --server` has no regressions + +## Budget +max_attempts: 5 +current_attempt: 2 + +## Baseline +checkpoint: 7e1c2b1a +score: ... + +## Attempts + +### Attempt 1 +Prompt: ... +Hypothesis: ... +Patch: ... +Tests: ... +DS4 response: ...
+Result: failed +Reason: parser regression X +Decision: discard + +### Attempt 2 +Hypothesis: ... +Status: in_progress +``` + +For DS4-generated loop tests, keep both levels of evidence: + +- a compact model-written ledger with `ds4_prompt` and `ds4_response` fields; +- a harness-written report that preserves the exact prompt sent to DS4, the raw + DS4 output, and the generated ledger. + +The budget is not just a suggestion in prose. For the first experiment, the +agent should reread the ledger after each restore and stop when +`current_attempt >= max_attempts`, when the success metric is met, or when +restore safety checks fail. A later loop controller can enforce the same rule +programmatically. After a failed attempt, the expected flow is: + +```text +append failure result to ledger +restore baseline checkpoint +append restore notice that points to the ledger +start the next hypothesis +``` + +After a successful attempt, the agent should update the ledger, save a new +checkpoint, and mark it as the new best state. This turns context checkpoints +into clean restart points and the Markdown ledger into the durable memory of the +search process. + +## Verified Server Session Model + +The current `ds4-server` does not have an explicit remote session or owner +concept. This was verified against the server implementation: + +- `server` owns one `ds4_session *session`, one disk KV cache handle, one tool + memory map, and a small set of live continuation bindings. +- HTTP client threads parse requests and enqueue stack-owned jobs. A single + `worker_main()` dequeues jobs and mutates the one live session. +- `http_request` stores only method, path, body, and body length. Header parsing + reads `Content-Length`; it does not keep `Authorization`, API key, tenant, + session, organization, or user headers. +- `/v1/chat/completions`, `/v1/messages`, and `/v1/completions` parse protocol + payload fields into rendered prompts and skip unknown JSON fields. 
OpenAI + `user`, metadata, or similar caller fields are not retained as identity. +- `/v1/responses` explicitly rejects non-null `previous_response_id` and + `conversation` because DS4 does not implement the durable Responses store. +- Disk KV cache lookup is keyed by rendered byte prefix plus compatibility + checks such as quantization and context size. It is not keyed by user, owner, + tenant, or application session. +- The live Responses, Anthropic, and thinking continuation structures bind + recent tool call ids or visible transcript bytes to the current live token + frontier. They are process-local accelerators, not durable session ownership. + +Therefore, the server currently has stateless API semantics with one mutable +worker-owned timeline. The right server default is prefix-cache reuse, not +server-side conversation ownership. + +## MVP Boundary + +The first experiment is agent-only. `ds4-agent` is the only current runtime with +all required semantics in one place: + +- one transcript owner, +- one worker-owned `ds4_session`, +- slash-command save/switch/compact precedents, +- side-effect visibility for `edit`, `write`, `bash`, and browser tools, +- active bash job tracking, +- a natural place to report restore warnings to the user. + +The implementation should stay close to the existing local KV save/load path. +It may factor small helper functions for metadata, atomic writes, and +compatibility checks, but a general storage abstraction is not required before +the first working tool. + +The first implementation should add only the state needed by the agent worker: +checkpoint metadata, a `world_epoch` counter, and enough recent side-effect +summary text to make restore notices useful. It should not introduce a new +global context subsystem before the native tool proves useful. 
+ +For `ds4-server`, the verified model above is enough guidance for the MVP: keep +automatic prefix-cache reuse as the server behavior, and do not add +model-visible restore semantics to stateless API traffic. + +## Branch Boundary + +This branch intentionally owns only agent context and KV checkpoint support. +Native Git support lives in `feature/agent-git-tools` and can be merged through +`feature/agent-kv-git-integration` when both feature lines need to work +together. + +The context branch must not include or link `ds4_agent_git.*`. When an +integration branch combines both features, mutating Git actions should be +recorded as ordinary side effects in `world_epoch`, just like `write`, `edit`, +and `bash`. + +## Implementation Plan + +### Phase 1: Disk-backed context checkpoints + +Add worker-owned checkpoint state and disk metadata: + +```c +typedef struct agent_context_checkpoint { + char id[41]; + char *label; + char *path; + uint64_t created_at; + uint64_t world_epoch; + int transcript_tokens; + struct agent_context_checkpoint *next; +} agent_context_checkpoint; +``` + +Store checkpoint files below: + +```text +~/.ds4/kvcache/context/<id>.kv +~/.ds4/kvcache/context/<id>.meta.json +``` + +Use existing save/load helpers where possible: + +- Save with `agent_kv_save_path()`. +- Load with `agent_kv_load_path()`. +- Reuse `agent_worker_sync_tokens()` for stripped or text-only rebuild paths. +- Keep the worker thread as the only owner of `w->session` mutation. + +`id` should be generated independently from the display label, for example from +random bytes plus checkpoint metadata. The label is user/model-facing display +text, not the stable identity. + +### Phase 2: Tool dispatch + +Add `context` to the tool schema prompt and dispatch in +`agent_execute_tool_call()`. + +The handler should parse: + +- `action` +- `id` +- `label` +- `reason` +- `allow_side_effect_mismatch` +- `dry_run` + +The action handler should return compact machine-readable text.
Example: + +```text +context action=checkpoint id=7e1c2b1a label=before-parser-refactor tokens=18420 world_epoch=3 +context action=compact status=ok old_tokens=28500 new_tokens=9400 removed_tokens=19100 reduction_percent=67.0 summary_tokens=2100 tail_tokens=7000 +``` + +Restore appends a model-visible notice that includes KV reuse accounting: + +```text +KV restore expected metrics: checkpoint_tokens=18420 expected_restore_notice_tokens=140 expected_restored_tokens=18560 expected_prefill_suffix_tokens=140 expected_full_prefill_tokens_without_kv=18560 expected_saved_prefill_tokens=18420. +``` + +This makes the benefit concrete for both the implementation and the model: +restoring the checkpoint loads the old prefix from KV, then only the synthetic +restore notice is expected to be prefilled. The word `expected` is intentional: +the notice is built before the final sync that appends it, while trace output +records the actual cached/suffix counts observed by `ds4_session_sync()`. + +## Correctness Verification Measures + +1. Transcript and session equality. + +After every checkpoint and restore: + +```text +agent_tokens_equal(ds4_session_tokens(w->session), &w->transcript) == true +ds4_session_pos(w->session) == w->transcript.len +``` + +2. Prefix reuse measurement. + +For a restore from a disk KV payload, the next sync to the same transcript +should report zero prefill suffix. For stripped checkpoints, the suffix may be +non-zero, and the tool result must say it rebuilt from rendered text. + +For model-initiated restores, the actual post-restore transcript should be the +checkpoint transcript plus the synthetic restore notice. Verification should +measure both values separately: zero prefill for loading the checkpoint payload, +then a small append for the notice. + +If payload tokens and metadata tokens disagree after a KV load, restore must not +leave the live session at the loaded payload while the transcript still points +to the previous conversation.
It should resynchronize the live session to the +current transcript or invalidate the session before returning the error. + +The `context status` output should expose the live-cache view as +`cached_tokens` and `prefill_suffix_tokens`, so the agent can tell whether the +current transcript will reuse KV or force a rebuild. + +3. Next-token equivalence. + +Before checkpoint, copy logits with `ds4_session_copy_logits()`. After restore, +copy logits again and compare: + +- exact token position equality, +- same argmax token, +- top-k ids match, +- float deltas are zero or within a backend-specific tolerance. + +4. Side-effect epoch enforcement. + +Create a checkpoint, run `edit` or `bash`, then attempt restore. Expected: + +- restore fails without `allow_side_effect_mismatch=true`, +- restore succeeds with the override, +- restore notice explicitly warns that external effects were not reverted and + names the epoch mismatch. + +5. Active bash job guard. + +Start a long-running bash job, checkpoint or restore depending on policy, and +verify that restore is denied while the job is running. After `bash_stop`, the +same restore should follow normal side-effect rules. + +6. Compaction interaction. + +Checkpoint before compaction, compact, then restore. Expected: + +- transcript returns to the checkpoint token count, +- model-initiated restore appends a restore notice after that checkpoint, +- private compaction prompt text is absent, +- live session is synchronized to restored transcript, +- no stale compaction summary remains unless it was part of the checkpoint. + +7. Corrupt or incompatible checkpoint handling. + +Corrupt a checkpoint file or change quant/context metadata. Expected: + +- restore fails, +- live session is invalidated only if load already touched it, +- transcript is not replaced with partial data, +- error text identifies the reason. + +8. Persistence across restart. 
+ +Save a context checkpoint, exit `ds4-agent`, restart, list checkpoints, restore +the checkpoint, and verify token count plus next-token equivalence where the +same model/backend are available. + +9. DS4-generated experiment loop. + +Run the slow e2e target: + +```sh +make test-agent-context-loop +``` + +This test is intentionally not part of default `make test`: it requires a real +model, a usable backend, and enough time for a short agent turn. The prompt in +`tests/ds4_agent_context_loop_prompt.md` requires DS4 itself to: + +- create an experiment ledger with `write`, +- record the compact prompt and final DS4 response in that ledger, +- measure a DS4-owned helper test with `bash`, +- update the ledger with `edit`, +- create a model-visible `context checkpoint`, +- finish with `LOOP_DONE`. + +The shell harness verifies the generated ledger, the prompt/response report, +and the checkpoint metadata. It also writes +`tests/generated/ds4_agent_context_loop_report.md` plus separate persisted +prompt, response, and ledger files with the exact expanded prompt, the raw DS4 +output, and the generated ledger. It does not synthesize the loop in C; the +point is to test whether the model can operate the new tool surface in the +intended loop shape. + +10. KV cache benefit benchmark. + +Run the optional benchmark target: + +```sh +make test-kv-cache-benefit +``` + +This target is intentionally separate from default `make test` because it opens +the real model and backend. It verifies: + +- a saved KV payload reloads to the same token position, +- restored logits have the same argmax and near-zero delta versus the original + checkpoint state, +- extending the restored session requires prefill only for the suffix, +- a fresh full prefill to the same extended transcript has the same top-1 next + token as KV-restore-plus-suffix, +- the report prints `full_prefill_tokens`, `restored_prefill_tokens`, + `saved_prefill_tokens`, payload bytes, and wall-clock timings. 
+ +The hallucination claim should be phrased conservatively: the deterministic +guard is model-state equivalence. If logits/argmax match after restore, the KV +path has not introduced state drift. Compaction can reduce context pressure, +but factual quality after compaction still depends on the summary and must be +tested with task-specific e2e prompts. + +11. Compaction canary retention e2e. + +Run the optional compaction-quality target: + +```sh +make test-agent-context-compact-canary +``` + +This target is intentionally separate from default `make test`: it asks DS4 to +operate the `context compact` tool, places five canary facts before a long +irrelevant padding block, and then requires DS4 to write the canaries into a +ledger only after compaction. The harness verifies: + +- the trace contains `compacted reason="canary-retention-test"`, +- the compaction trace reports a reduced token count and a late enough recent + tail start, +- the post-compaction ledger exists, +- all five canary values survived, +- the final response marker is present. + +This is still not a general hallucination benchmark. It is a focused task-level +guard that checks whether compaction preserves facts explicitly marked as +critical for the next action while those facts are pushed out of the recent +verbatim tail. + +12. Adaptive context self-improvement e2e. + +Run the optional self-improvement target: + +```sh +make test-agent-context-self-improvement +``` + +This target is intentionally separate from default `make test`: it requires a +real model/backend and asks DS4 to operate a complete agent loop. 
The harness +creates a temporary repository with a small failing Python project, then the +prompt requires DS4 to: + +- create a context checkpoint before changing the project, +- inspect repository status, +- read and fix the failing code, +- run the project test, +- inspect the resulting diff, +- create a second context checkpoint after the test passes, +- restore from that checkpoint, +- inspect repository status again, +- run the test again, +- write a ledger with the exact actions and final result. + +The Git inspection step is adaptive. If the model-visible schemas include the +native Git tool, the prompt asks DS4 to use it for `status` and `diff`. If that +tool is absent, the same prompt requires the existing `bash` path with +`git status --short` and `git diff`. This keeps the context/KV branch +independent from the Git-tool branch while still letting the same test exercise +the native Git path after integration. + +This test is not a claim that DS4 has optimized DS4 itself. It is a controlled +regression test for the agent loop shape: inspect, edit, test, diff, +checkpoint, restore, retest, and record evidence. + +The stronger product demonstration should be a DS4-on-DS4 optimization loop: +ask DS4 to inspect this repository, select one small measurable optimization, +implement it, run the relevant benchmark or e2e check, inspect the source diff, +checkpoint and restore the successful state, and record whether the metric +improved. That is the right next target, but it should remain an optional slow +evaluation because it is more expensive and less deterministic than a controlled +temporary-repository regression. + +### Resume Point: 2026-05-25 + +The DS4-generated context loop was run successfully with: + +```sh +make test-agent-context-loop +``` + +The first sandboxed attempt failed because the sandbox could not access Metal. 
+The successful run was executed outside the sandbox and produced: + +- `tests/generated/ds4_agent_context_loop_report.md` +- `tests/generated/ds4_agent_context_loop_prompt.md` +- `tests/generated/ds4_agent_context_loop_output.txt` +- `tests/generated/ds4_agent_context_loop_ledger.md` + +The generated ledger recorded: + +```text +ds4_prompt=validate DS4's own agent context loop capability +ds4_response=LOOP_DONE +attempt=1 status=pass +attempt=1 metric=ds4_agent_context_test passed +``` + +Useful result: the loop proved that DS4 can operate the intended tool sequence: + +```text +write -> bash -> edit -> context checkpoint -> final response +``` + +It also proved that the harness now captures the full evidence chain: expanded +prompt, raw model response, generated ledger, and checkpoint metadata. + +Observed weakness: the prompt explicitly said `Do not explain the plan in +prose`, but DS4 still emitted conversational text such as: + +```text +I'll execute the loop step by step. +The test succeeded (exit_status=0). Now I'll edit the file to mark success. +The attempt passed. Now I'll checkpoint the context. +``` + +This did not break the current harness because the final ledger and checkpoint +were correct, but it gives the next self-improvement loop a concrete target: +improve DS4's adherence to tool-only execution when the prompt requests no +prose. + +Next loop to run from here: + +1. Inspect baseline state with external version-control commands or with the + independent Git branch after integration. +2. Ask DS4 to propose a small DS4-owned improvement for tool-only adherence. +3. Materialize the hypothesis in a Markdown experiment ledger. +4. Implement one minimal change. +5. Measure with a focused e2e check that fails when raw DS4 output contains + unexpected prose before/between required tool calls. +6. If the metric improves, checkpoint and record the source diff in the ledger. +7. 
If it does not improve, record the failure and restore/retry from the saved + context frontier. + +## Exploration And Implementation Loop + +Use this loop for each action before merging implementation: + +1. Define a concrete agent scenario. + +Write the starting transcript shape, tool calls involved, expected checkpoint +state, and external side effects. + +2. Run the scenario against the current implementation. + +Capture transcript token count, session position, world epoch, active bash job +state, and checkpoint id. + +3. Assert invariants. + +Check token equality, session position, side-effect policy, and tool result +clarity. + +4. Measure cost. + +Record save latency, restore latency, prefill suffix tokens, checkpoint payload +bytes, and whether restore avoided a cold rebuild. + +5. Break it intentionally. + +Try stale ids, corrupt files, active jobs, side-effect mismatch, stripped +payloads, and interrupted compaction. + +6. Tighten the implementation. + +Add the missing guard, simplify the API, or improve the tool result before +moving to the next action. 
+ +## First Loop Batch + +The first implementation pass should cover these scenarios in order: + +| Scenario | Purpose | Expected result | +| --- | --- | --- | +| `status` on fresh sysprompt | establish baseline | reports ctx, pos, transcript tokens, no checkpoints | +| `checkpoint` after one user turn | prove save path | checkpoint id returned, token/session equality holds | +| `restore` with no side effects | prove load path | checkpoint is loaded with zero prefill suffix, then restore notice is appended | +| `restore` after `edit` | prove guard | denied unless override is set | +| `compact` then `restore` | prove compaction safety | restored state has no leaked private summary | +| running `bash` then `restore` | prove live process guard | restore denied until job is stopped | +| failed-attempt retry after restore | prove model usability | model uses restore notice to abandon the failed attempt and try a different strategy | +| DS4-generated loop e2e | prove model tool use | DS4 writes a ledger, records prompt/response, runs a DS4 helper test, records pass/fail, checkpoints passing state | + +## Open Design Decisions + +- Whether `checkpoint` should be allowed while the session is dirty but idle. + The likely answer is yes, after forcing `agent_worker_sync_tokens()`. +- Whether model-initiated `restore` should require user confirmation in + interactive mode. For Phase 1, deny side-effect mismatch by default and do not + prompt from inside the tool. +- Whether to expose an explicit `hard_restore` action for tests and manual + debugging. The default model-visible `restore` should append a restore notice; + hard restore should not be the autonomous path. +- Whether these controls should also be exposed through slash commands. The + initial implementation can keep `/save` and `/switch` unchanged and expose + only the DSML `context` tool. +- How experiment loops coordinate context restore with workspace rollback. 
The + MVP can warn through `world_epoch` and require explicit cleanup before + override. + +## Non-goals + +- No arbitrary KV byte editing. +- No filesystem rollback. +- No promise that browser state or network side effects are restored. +- No multiple live KV sessions in RAM in Phase 1. +- No prompt rewriting that changes sampled DSML history. diff --git a/ds4_agent.c b/ds4_agent.c index 10b76ea3..fdb65703 100644 --- a/ds4_agent.c +++ b/ds4_agent.c @@ -1,4 +1,5 @@ #include "ds4.h" +#include "ds4_agent_context.h" #include "ds4_kvstore.h" #include "ds4_web.h" #include "linenoise.h" @@ -8,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -96,6 +98,7 @@ typedef struct { ds4_session *session; ds4_tokens transcript; char *cache_dir; + char *context_dir; char *sysprompt_path; char session_sha[41]; char *session_title; @@ -139,10 +142,35 @@ typedef struct { bool more_valid; agent_bash_job *bash_jobs; int next_bash_job_id; + uint64_t world_epoch; + ds4_agent_side_effects side_effects; } agent_worker; static unsigned agent_next_prefill_label(void); +typedef struct { + int old_pos; + int prompt_tokens; + int common_prefix; + int cached_tokens; + int prefill_tokens; + double elapsed_sec; + int rc; +} agent_sync_metrics; + +typedef struct { + int old_tokens; + int new_tokens; + int summary_tokens; + int tail_tokens; + int removed_tokens; + double reduction_percent; +} agent_compact_stats; + +static bool agent_worker_compact(agent_worker *w, const char *reason, + char *err, size_t err_len, + agent_compact_stats *stats); + typedef struct agent_tail_capture { char *buf; size_t cap; @@ -803,6 +831,25 @@ static const char agent_tools_prompt_after_edit[] = "{\n" " \"type\": \"function\",\n" " \"function\": {\n" + " \"name\": \"context\",\n" + " \"description\": \"Inspect, checkpoint, restore, compact, list, or drop the agent context state. 
Restore never reverts files, processes, browser state, or network effects.\",\n" + " \"parameters\": {\n" + " \"type\": \"object\",\n" + " \"properties\": {\n" + " \"action\": {\"type\": \"string\", \"enum\": [\"status\", \"checkpoint\", \"list\", \"restore\", \"compact\", \"drop\"]},\n" + " \"id\": {\"type\": \"string\"},\n" + " \"label\": {\"type\": \"string\"},\n" + " \"reason\": {\"type\": \"string\"},\n" + " \"allow_side_effect_mismatch\": {\"type\": \"boolean\"},\n" + " \"dry_run\": {\"type\": \"boolean\"}\n" + " },\n" + " \"required\": [\"action\"]\n" + " }\n" + " }\n" + "}\n\n" + "{\n" + " \"type\": \"function\",\n" + " \"function\": {\n" " \"name\": \"read\",\n" " \"description\": \"Read a text file or a range of lines.\",\n" " \"parameters\": {\n" @@ -3927,14 +3974,24 @@ static void worker_answer_queued_user_drain(agent_worker *w, char *text) { * cache-saving operation: if the requested transcript extends the live session, * only the suffix is prefetched; otherwise the DS4 session rebuilds from the * longest common prefix it can retain. */ -static int agent_worker_sync_tokens(agent_worker *w, const ds4_tokens *tokens, - bool publish_progress, - char *err, size_t err_len) { +static int agent_worker_sync_tokens_ex(agent_worker *w, const ds4_tokens *tokens, + bool publish_progress, + char *err, size_t err_len, + agent_sync_metrics *metrics) { int old_pos = ds4_session_pos(w->session); int common = ds4_session_common_prefix(w->session, tokens); int cached = common == old_pos && tokens->len >= old_pos ? 
common : 0; int suffix = tokens->len - cached; if (suffix < 0) suffix = tokens->len; + if (metrics) { + metrics->old_pos = old_pos; + metrics->prompt_tokens = tokens->len; + metrics->common_prefix = common; + metrics->cached_tokens = cached; + metrics->prefill_tokens = suffix; + metrics->elapsed_sec = 0.0; + metrics->rc = 0; + } if (publish_progress) { pthread_mutex_lock(&w->mu); @@ -3956,12 +4013,25 @@ static int agent_worker_sync_tokens(agent_worker *w, const ds4_tokens *tokens, ds4_session_set_display_progress(w->session, publish_progress ? worker_progress_cb : NULL, publish_progress ? w : NULL); + double t0 = now_sec(); int rc = ds4_session_sync(w->session, tokens, err, err_len); + double elapsed = now_sec() - t0; ds4_session_set_progress(w->session, NULL, NULL); ds4_session_set_display_progress(w->session, NULL, NULL); + if (metrics) { + metrics->elapsed_sec = elapsed; + metrics->rc = rc; + } return rc; } +static int agent_worker_sync_tokens(agent_worker *w, const ds4_tokens *tokens, + bool publish_progress, + char *err, size_t err_len) { + return agent_worker_sync_tokens_ex(w, tokens, publish_progress, + err, err_len, NULL); +} + /* Start a new session at the system/tool prompt. A fixed sysprompt.kv * checkpoint avoids paying this prefill cost repeatedly, but only when the * rendered prompt text still matches the file. The same fixed path is shared @@ -5343,6 +5413,14 @@ static bool agent_tool_result_fits_context(agent_worker *w, const char *result, return tokens + reserve_tokens < w->cfg->gen.ctx_size; } +static void agent_context_note_side_effect(agent_worker *w, const char *kind, + const char *detail) { + if (!w) return; + w->world_epoch = ds4_agent_side_effects_note(&w->side_effects, + w->world_epoch, + kind, detail); +} + /* Read file text for the model. Normal mode shows plain line numbers. Raw * mode is reserved for cases where line decoration would corrupt the payload * being inspected. 
*/ @@ -5440,7 +5518,6 @@ static char *agent_tool_more(agent_worker *w, const agent_tool_call *call) { } static char *agent_tool_write(agent_worker *w, const agent_tool_call *call) { - (void)w; const char *path = agent_tool_arg_value(call, "path"); const char *content = agent_tool_arg_value(call, "content"); if (!path || !path[0]) return xstrdup("Tool error: write requires path\n"); @@ -5463,6 +5540,7 @@ static char *agent_tool_write(agent_worker *w, const agent_tool_call *call) { agent_buf_puts(&b, "\n"); return agent_buf_take(&b); } + agent_context_note_side_effect(w, "write", path); char msg[PATH_MAX + 160]; snprintf(msg, sizeof(msg), "Wrote %zu bytes to %s\n", len, path); return xstrdup(msg); @@ -5920,6 +5998,8 @@ static char *agent_tool_edit(agent_worker *w, const agent_tool_call *call) { new_text, anchored ? "anchored old/new replacement" : "old/new replacement"); + if (strncmp(result, "Tool error:", strlen("Tool error:")) != 0) + agent_context_note_side_effect(w, "edit", path); free(data); return result; } @@ -6721,6 +6801,575 @@ static pid_t agent_tool_pid(const agent_tool_call *call) { return (pid_t)agent_parse_int_default(agent_tool_arg_value(call, "pid"), 0, 0, INT_MAX); } +/* ============================================================================ + * Native Context Tool + * ============================================================================ + */ + +static int agent_context_running_bash_jobs(agent_worker *w) { + int running = 0; + for (agent_bash_job *job = w->bash_jobs; job; job = job->next) { + agent_bash_poll(job); + if (job->running) running++; + } + return running; +} + +static void agent_context_generate_id(char out[41]) { + uint8_t buf[64]; + memset(buf, 0, sizeof(buf)); + int fd = open("/dev/urandom", O_RDONLY); + ssize_t got = fd >= 0 ? 
read(fd, buf, sizeof(buf)) : -1; + if (fd >= 0) close(fd); + if (got != (ssize_t)sizeof(buf)) { + uint64_t v[6]; + v[0] = (uint64_t)time(NULL); + v[1] = (uint64_t)clock(); + v[2] = (uint64_t)getpid(); + v[3] = (uint64_t)(uintptr_t)&buf; + v[4] = (uint64_t)random(); + v[5] = (uint64_t)now_sec(); + memcpy(buf, v, sizeof(v)); + } + ds4_kvstore_sha1_bytes_hex(buf, sizeof(buf), out); +} + +static char *agent_context_checkpoint_result(const char id[41], const char *label, + int tokens, uint64_t epoch, + bool dry_run) { + char *safe = ds4_agent_context_oneline(label, 160); + agent_buf b = {0}; + char line[256]; + snprintf(line, sizeof(line), + "context action=%s id=%.40s label=\"", + dry_run ? "checkpoint-dry-run" : "checkpoint", id); + agent_buf_puts(&b, line); + agent_buf_puts(&b, safe); + snprintf(line, sizeof(line), "\" tokens=%d world_epoch=%" PRIu64 "\n", + tokens, epoch); + agent_buf_puts(&b, line); + free(safe); + return agent_buf_take(&b); +} + +static bool agent_context_project_tool_result(agent_worker *w, const char *result, + ds4_tokens *projected, + int *tokens_out) { + ds4_tokens_free(projected); + ds4_tokens_copy(projected, &w->transcript); + ds4_chat_append_message(w->engine, projected, "tool", result ? result : ""); + if (tokens_out) *tokens_out = projected->len; + return projected->len + 16 < w->cfg->gen.ctx_size; +} + +static char *agent_context_checkpoint(agent_worker *w, const agent_tool_call *call, + bool *already_appended) { + const char *label_arg = agent_tool_arg_value(call, "label"); + bool dry_run = agent_parse_bool_default(agent_tool_arg_value(call, "dry_run"), false); + char *label = ds4_agent_context_limited_strdup(label_arg && label_arg[0] ? 
+ label_arg : "checkpoint", 240); + char id[41]; + char *kv_file = NULL, *meta_file = NULL, *mem_file = NULL; + char *kv_path = NULL, *meta_path = NULL; + for (int i = 0; i < 16; i++) { + agent_context_generate_id(id); + free(kv_file); + free(meta_file); + free(mem_file); + free(kv_path); + free(meta_path); + kv_file = ds4_agent_context_file_name(id, ".kv"); + meta_file = ds4_agent_context_file_name(id, ".meta.json"); + mem_file = ds4_agent_context_file_name(id, ".memory.md"); + kv_path = ds4_agent_context_path_for_file(w->context_dir, kv_file); + meta_path = ds4_agent_context_path_for_file(w->context_dir, meta_file); + if (access(meta_path, F_OK) != 0 && access(kv_path, F_OK) != 0) break; + } + + int tokens = w->transcript.len; + char *result = NULL; + ds4_tokens projected = {0}; + for (int i = 0; i < 4; i++) { + free(result); + result = agent_context_checkpoint_result(id, label, tokens, w->world_epoch, dry_run); + int new_tokens = 0; + if (!agent_context_project_tool_result(w, result, &projected, &new_tokens)) { + ds4_tokens_free(&projected); + free(label); + free(kv_file); + free(meta_file); + free(mem_file); + free(kv_path); + free(meta_path); + return xstrdup("Tool error: checkpoint result would exceed context\n"); + } + if (new_tokens == tokens) break; + tokens = new_tokens; + } + + if (dry_run) { + ds4_tokens_free(&projected); + free(label); + free(kv_file); + free(meta_file); + free(mem_file); + free(kv_path); + free(meta_path); + return result; + } + + char err[256] = {0}; + if (!agent_mkdir_p(w->context_dir)) { + snprintf(err, sizeof(err), "failed to create %s", w->context_dir); + goto fail; + } + if (agent_worker_sync_tokens(w, &projected, false, err, sizeof(err)) != 0) + goto fail; + + char ignored_sha[41]; + if (!agent_kv_save_path(w, kv_path, &projected, "agent-context", + ignored_sha, NULL, 0, err, sizeof(err))) + goto rollback; + + ds4_agent_context_meta meta = {0}; + snprintf(meta.id, sizeof(meta.id), "%s", id); + meta.label = label; + meta.kv_file 
= kv_file; + meta.memory_file = mem_file; + meta.created_at = (uint64_t)time(NULL); + meta.world_epoch = w->world_epoch; + meta.transcript_tokens = projected.len; + if (!ds4_agent_context_write_meta(&meta, meta_path, err, sizeof(err))) { + unlink(kv_path); + goto rollback_no_meta_free; + } + + ds4_tokens_free(&w->transcript); + w->transcript = projected; + memset(&projected, 0, sizeof(projected)); + pthread_mutex_lock(&w->mu); + /* session_dirty tracks the durable /save state, not live KV sync. The + * session was synced to projected above; this marks the visible transcript + * as changed because the checkpoint tool result is now part of it. */ + w->session_dirty = true; + w->user_activity = true; + w->status.ctx_used = w->transcript.len; + agent_wake_locked(w); + pthread_mutex_unlock(&w->mu); + if (already_appended) *already_appended = true; + + free(meta_path); + free(kv_path); + free(meta_file); + ds4_agent_context_meta_free(&meta); + return result; + +rollback: + unlink(kv_path); +rollback_no_meta_free: + { + char sync_err[160] = {0}; + if (agent_worker_sync_tokens(w, &w->transcript, false, + sync_err, sizeof(sync_err)) != 0) + ds4_session_invalidate(w->session); + } +fail: + ds4_tokens_free(&projected); + free(result); + free(label); + free(kv_file); + free(meta_file); + free(mem_file); + free(kv_path); + free(meta_path); + agent_buf b = {0}; + agent_buf_puts(&b, "Tool error: context checkpoint failed: "); + agent_buf_puts(&b, err[0] ? 
err : "unknown error");
+    agent_buf_puts(&b, "\n");
+    return agent_buf_take(&b);
+}
+
+/* Append the side-effect summary recorded since `epoch` to `b`. The summary
+ * string is produced by ds4_agent_side_effects_summary_since() and freed
+ * here. */
+static void agent_context_append_side_effects_since(agent_worker *w, uint64_t epoch,
+                                                    agent_buf *b) {
+    char *summary = ds4_agent_side_effects_summary_since(&w->side_effects, epoch);
+    agent_buf_puts(b, summary);
+    free(summary);
+}
+
+/* Build the tool-role notice that is appended to the transcript after a
+ * restore. It contains, in order: checkpoint id and label, the reason, the
+ * token count and world epochs, the expected-metrics line, the
+ * side_effect_mismatch_allowed flag, a warning that external side effects were
+ * not reverted, and the side-effect summary since the checkpoint epoch.
+ * Label and reason are flattened to one line and capped at 160 and 240 bytes.
+ * Returns a heap string owned by the caller. */
+static char *agent_context_restore_notice(agent_worker *w,
+                                          const ds4_agent_context_meta *meta,
+                                          const char *reason,
+                                          bool allowed_mismatch,
+                                          int checkpoint_tokens,
+                                          int restored_tokens,
+                                          int restore_notice_tokens) {
+    char *safe_label = ds4_agent_context_oneline(meta->label, 160);
+    char *safe_reason = ds4_agent_context_oneline(reason && reason[0] ?
+                                                  reason : "not specified", 240);
+    agent_buf b = {0};
+    char line[320];
+    snprintf(line, sizeof(line),
+             "Context restored from checkpoint %.40s label=\"%s\".\n",
+             meta->id, safe_label);
+    agent_buf_puts(&b, line);
+    agent_buf_puts(&b, "Reason: ");
+    agent_buf_puts(&b, safe_reason);
+    agent_buf_puts(&b, "\n");
+    snprintf(line, sizeof(line),
+             "Restored model context to %d tokens. world_epoch restored=%" PRIu64 " current=%" PRIu64 ".\n",
+             meta->transcript_tokens, meta->world_epoch, w->world_epoch);
+    agent_buf_puts(&b, line);
+    ds4_agent_context_restore_metrics metrics = {
+        .checkpoint_tokens = checkpoint_tokens,
+        .restore_notice_tokens = restore_notice_tokens,
+        .restored_tokens = restored_tokens,
+    };
+    char *metrics_line = ds4_agent_context_restore_expected_metrics_line(&metrics);
+    agent_buf_puts(&b, metrics_line);
+    free(metrics_line);
+    snprintf(line, sizeof(line),
+             "side_effect_mismatch_allowed=%s\n",
+             allowed_mismatch ? "true" : "false");
+    agent_buf_puts(&b, line);
+    agent_buf_puts(&b,
+                   "External side effects were not reverted; inspect or revert files, processes, browser state, and network effects separately before assuming the workspace matches this checkpoint.\n");
+    agent_context_append_side_effects_since(w, meta->world_epoch, &b);
+    free(safe_label);
+    free(safe_reason);
+    return agent_buf_take(&b);
+}
+
+/* Sync the session back to the current w->transcript. On failure the session
+ * is invalidated and false is returned; `err` receives the sync error. */
+static bool agent_context_resync_live_transcript(agent_worker *w,
+                                                 char *err, size_t err_len) {
+    if (agent_worker_sync_tokens(w, &w->transcript, false, err, err_len) != 0) {
+        ds4_session_invalidate(w->session);
+        return false;
+    }
+    return true;
+}
+
+/* context action=restore.
+ *
+ * Reads the tool arguments id, reason, allow_side_effect_mismatch and dry_run,
+ * then:
+ *   1. refuses while bash jobs are running,
+ *   2. resolves the checkpoint via ds4_agent_context_find_checkpoint(),
+ *   3. applies the world-epoch guard (reporting side effects on refusal),
+ *   4. for dry_run, only describes the checkpoint,
+ *   5. otherwise loads the checkpoint KV, checks its token count against the
+ *      metadata, appends a restore notice, syncs the session to the result
+ *      and installs it as w->transcript.
+ *
+ * Returns a heap string: the restore notice on success, otherwise a
+ * "Tool error: ..." message. On success *already_appended is set to true
+ * because the notice is already part of w->transcript. */
+static char *agent_context_restore(agent_worker *w, const agent_tool_call *call,
+                                   bool *already_appended) {
+    const char *id = agent_tool_arg_value(call, "id");
+    const char *reason = agent_tool_arg_value(call, "reason");
+    bool allow = agent_parse_bool_default(
+        agent_tool_arg_value(call, "allow_side_effect_mismatch"), false);
+    bool dry_run = agent_parse_bool_default(agent_tool_arg_value(call, "dry_run"), false);
+
+    int running = agent_context_running_bash_jobs(w);
+    char guard_err[256] = {0};
+    if (!ds4_agent_context_no_running_bash_guard("restore", running,
+                                                 guard_err, sizeof(guard_err))) {
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: ");
+        agent_buf_puts(&b, guard_err);
+        agent_buf_puts(&b, "\n");
+        return agent_buf_take(&b);
+    }
+
+    ds4_agent_context_meta meta = {0};
+    char *meta_path = NULL;
+    char *kv_path = NULL;
+    char err[256] = {0};
+    if (!ds4_agent_context_find_checkpoint(w->context_dir, id, &meta,
+                                           &meta_path, &kv_path,
+                                           err, sizeof(err))) {
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: ");
+        agent_buf_puts(&b, err);
+        agent_buf_puts(&b, "\n");
+        return agent_buf_take(&b);
+    }
+
+    if (!ds4_agent_context_restore_epoch_guard(w->world_epoch, meta.world_epoch,
+                                               allow, guard_err,
+                                               sizeof(guard_err))) {
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: ");
+        agent_buf_puts(&b, guard_err);
+        agent_buf_puts(&b, "\n");
+        agent_context_append_side_effects_since(w, meta.world_epoch, &b);
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        return agent_buf_take(&b);
+    }
+
+    if (dry_run) {
+        char *safe = ds4_agent_context_oneline(meta.label, 160);
+        agent_buf b = {0};
+        /* Worst case is 337 bytes (160-byte label, 40-byte id, two 20-digit
+         * epochs). A 256-byte buffer truncated long labels and dropped the
+         * trailing newline. */
+        char line[512];
+        snprintf(line, sizeof(line),
+                 "context action=restore-dry-run id=%.40s label=\"%s\" tokens=%d world_epoch=%" PRIu64 " current_world_epoch=%" PRIu64 "\n",
+                 meta.id, safe, meta.transcript_tokens, meta.world_epoch, w->world_epoch);
+        agent_buf_puts(&b, line);
+        free(safe);
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        return agent_buf_take(&b);
+    }
+
+    ds4_tokens loaded = {0};
+    if (!agent_kv_load_path(w, kv_path, NULL, NULL, 0, &loaded, NULL,
+                            err, sizeof(err))) {
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: context restore failed: ");
+        agent_buf_puts(&b, err[0] ? err : "failed to load checkpoint");
+        agent_buf_puts(&b, "\n");
+        return agent_buf_take(&b);
+    }
+    if (loaded.len != meta.transcript_tokens) {
+        /* Metadata and KV disagree: abandon the restore and put the session
+         * back on the live transcript. */
+        int meta_tokens = meta.transcript_tokens;
+        int kv_tokens = loaded.len;
+        ds4_tokens_free(&loaded);
+        char sync_err[160] = {0};
+        bool live_resynced = agent_context_resync_live_transcript(w,
+                                                                  sync_err,
+                                                                  sizeof(sync_err));
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        agent_buf b = {0};
+        char line[320];
+        snprintf(line, sizeof(line),
+                 "Tool error: context restore failed: metadata tokens=%d but KV tokens=%d; live_session=%s%s%s\n",
+                 meta_tokens, kv_tokens,
+                 live_resynced ? "resynced" : "invalidated",
+                 !live_resynced && sync_err[0] ? " error=" : "",
+                 !live_resynced && sync_err[0] ? sync_err : "");
+        agent_buf_puts(&b, line);
+        return agent_buf_take(&b);
+    }
+
+    /* The notice reports its own token counts, so build it repeatedly (at
+     * most four passes) until the reported numbers match the tokenized
+     * result. */
+    char *notice = NULL;
+    ds4_tokens restored = {0};
+    int checkpoint_tokens = loaded.len;
+    int notice_tokens = 0;
+    int restored_tokens = checkpoint_tokens;
+    for (int i = 0; i < 4; i++) {
+        free(notice);
+        notice = agent_context_restore_notice(w, &meta, reason, allow,
+                                              checkpoint_tokens,
+                                              restored_tokens,
+                                              notice_tokens);
+        ds4_tokens_free(&restored);
+        ds4_tokens_copy(&restored, &loaded);
+        ds4_chat_append_message(w->engine, &restored, "tool", notice);
+        int new_notice_tokens = restored.len - checkpoint_tokens;
+        int new_restored_tokens = restored.len;
+        if (new_notice_tokens == notice_tokens &&
+            new_restored_tokens == restored_tokens)
+            break;
+        notice_tokens = new_notice_tokens;
+        restored_tokens = new_restored_tokens;
+    }
+    ds4_tokens_free(&loaded);
+    if (restored.len + 16 >= w->cfg->gen.ctx_size) {
+        /* The checkpoint KV has already been loaded, while w->transcript is
+         * still the live one. Resync the session like the token-mismatch path
+         * does; the helper invalidates the session if that fails.
+         * NOTE(review): assumes agent_kv_load_path() replaces live session
+         * state, as the mismatch path implies - confirm. */
+        char sync_err[160] = {0};
+        (void)agent_context_resync_live_transcript(w, sync_err, sizeof(sync_err));
+        ds4_tokens_free(&restored);
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        free(notice);
+        return xstrdup("Tool error: restore notice would exceed context\n");
+    }
+    agent_sync_metrics sync_metrics = {0};
+    if (agent_worker_sync_tokens_ex(w, &restored, false, err, sizeof(err),
+                                    &sync_metrics) != 0) {
+        ds4_session_invalidate(w->session);
+        ds4_tokens_free(&restored);
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        free(notice);
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: context restore failed after load: ");
+        agent_buf_puts(&b, err[0] ? err : "failed to append restore notice");
+        agent_buf_puts(&b, "\n");
+        return agent_buf_take(&b);
+    }
+    agent_trace(w,
+                "context restore id=%.40s checkpoint=%d restored=%d cached=%d suffix=%d elapsed=%.3f",
+                meta.id, checkpoint_tokens, restored.len,
+                sync_metrics.cached_tokens, sync_metrics.prefill_tokens,
+                sync_metrics.elapsed_sec);
+    if (sync_metrics.cached_tokens < checkpoint_tokens ||
+        sync_metrics.prefill_tokens != restored.len - checkpoint_tokens)
+    {
+        agent_trace(w,
+                    "context restore prefill mismatch expected_cached=%d expected_suffix=%d common=%d old_pos=%d",
+                    checkpoint_tokens, restored.len - checkpoint_tokens,
+                    sync_metrics.common_prefix, sync_metrics.old_pos);
+    }
+
+    ds4_tokens_free(&w->transcript);
+    w->transcript = restored;
+    pthread_mutex_lock(&w->mu);
+    w->user_activity = true;
+    /* session_dirty tracks the durable /save state, not live KV sync. The
+     * session was synced to restored above; this marks the visible transcript
+     * as changed because the restore notice is now part of it. */
+    w->session_dirty = true;
+    w->status.ctx_used = w->transcript.len;
+    agent_wake_locked(w);
+    pthread_mutex_unlock(&w->mu);
+    if (already_appended) *already_appended = true;
+
+    ds4_agent_context_meta_free(&meta);
+    free(meta_path);
+    free(kv_path);
+    return notice;
+}
+
+/* context action=list: one line per readable *.meta.json file in
+ * w->context_dir, or "context checkpoints: none" when the directory cannot be
+ * opened or holds no readable metadata. */
+static char *agent_context_list(agent_worker *w) {
+    DIR *d = opendir(w->context_dir);
+    agent_buf out = {0};
+    if (!d) return xstrdup("context checkpoints: none\n");
+    agent_buf_puts(&out, "context checkpoints:\n");
+    int count = 0;
+    struct dirent *de;
+    while ((de = readdir(d)) != NULL) {
+        if (!ds4_agent_context_meta_filename(de->d_name)) continue;
+        char *meta_path = ds4_agent_context_path_for_file(w->context_dir, de->d_name);
+        ds4_agent_context_meta m = {0};
+        char err[160] = {0};
+        if (ds4_agent_context_read_meta_file(meta_path, &m, err, sizeof(err))) {
+            char *safe = ds4_agent_context_oneline(m.label, 120);
+            char line[320];
+            snprintf(line, sizeof(line),
+                     "- id=%.8s tokens=%d world_epoch=%" PRIu64 " created_at=%" PRIu64 " label=\"%s\"\n",
+                     m.id, m.transcript_tokens, m.world_epoch, m.created_at, safe);
+            agent_buf_puts(&out, line);
+            free(safe);
+            count++;
+        }
+        ds4_agent_context_meta_free(&m);
+        free(meta_path);
+    }
+    closedir(d);
+    if (count == 0) {
+        free(out.ptr);
+        return xstrdup("context checkpoints: none\n");
+    }
+    return agent_buf_take(&out);
+}
+
+static char *agent_context_status(agent_worker *w) {
+    int pos = ds4_session_pos(w->session);
+    int ctx = ds4_session_ctx(w->session);
+    int common = ds4_session_common_prefix(w->session, &w->transcript);
+    int cached = common == pos && w->transcript.len >= pos ?
common : 0; + int prefill_suffix = w->transcript.len - cached; + if (prefill_suffix < 0) prefill_suffix = w->transcript.len; + int running = agent_context_running_bash_jobs(w); + int checkpoints = ds4_agent_context_count_checkpoints(w->context_dir); + bool dirty; + pthread_mutex_lock(&w->mu); + dirty = w->session_dirty; + pthread_mutex_unlock(&w->mu); + char msg[640]; + snprintf(msg, sizeof(msg), + "context status transcript_tokens=%d session_pos=%d cached_tokens=%d prefill_suffix_tokens=%d ctx_size=%d free_tokens=%d dirty=%s world_epoch=%" PRIu64 " active_bash_jobs=%d checkpoints=%d\n", + w->transcript.len, pos, cached, prefill_suffix, + ctx, ctx - pos, dirty ? "true" : "false", w->world_epoch, + running, checkpoints); + return xstrdup(msg); +} + +static char *agent_context_drop(agent_worker *w, const agent_tool_call *call) { + const char *id = agent_tool_arg_value(call, "id"); + bool dry_run = agent_parse_bool_default(agent_tool_arg_value(call, "dry_run"), false); + int running = agent_context_running_bash_jobs(w); + char guard_err[256] = {0}; + if (!ds4_agent_context_no_running_bash_guard("drop", running, + guard_err, sizeof(guard_err))) { + agent_buf b = {0}; + agent_buf_puts(&b, "Tool error: "); + agent_buf_puts(&b, guard_err); + agent_buf_puts(&b, "\n"); + return agent_buf_take(&b); + } + ds4_agent_context_meta meta = {0}; + char *meta_path = NULL; + char *kv_path = NULL; + char err[256] = {0}; + if (!ds4_agent_context_find_checkpoint(w->context_dir, id, &meta, + &meta_path, &kv_path, + err, sizeof(err))) { + agent_buf b = {0}; + agent_buf_puts(&b, "Tool error: "); + agent_buf_puts(&b, err); + agent_buf_puts(&b, "\n"); + return agent_buf_take(&b); + } + char *memory_path = ds4_agent_context_full_memory_path(w->context_dir, &meta); + if (dry_run) { + char msg[160]; + snprintf(msg, sizeof(msg), "context action=drop-dry-run id=%.40s\n", meta.id); + ds4_agent_context_meta_free(&meta); + free(meta_path); + free(kv_path); + free(memory_path); + return xstrdup(msg); 
+ } + bool ok = true; + if (unlink(kv_path) != 0 && errno != ENOENT) ok = false; + if (memory_path && unlink(memory_path) != 0 && errno != ENOENT) ok = false; + if (unlink(meta_path) != 0 && errno != ENOENT) ok = false; + char msg[256]; + snprintf(msg, sizeof(msg), "%scontext action=drop id=%.40s\n", + ok ? "" : "Tool error: partial drop failure; ", meta.id); + ds4_agent_context_meta_free(&meta); + free(meta_path); + free(kv_path); + free(memory_path); + return xstrdup(msg); +} + +static char *agent_tool_context(agent_worker *w, const agent_tool_call *call, + bool *already_appended) { + const char *action = agent_tool_arg_value(call, "action"); + if (!action || !action[0]) return xstrdup("Tool error: context requires action\n"); + if (!strcmp(action, "status")) return agent_context_status(w); + if (!strcmp(action, "list")) return agent_context_list(w); + if (!strcmp(action, "checkpoint")) return agent_context_checkpoint(w, call, already_appended); + if (!strcmp(action, "restore")) return agent_context_restore(w, call, already_appended); + if (!strcmp(action, "drop")) return agent_context_drop(w, call); + if (!strcmp(action, "compact")) { + char err[256] = {0}; + const char *reason = agent_tool_arg_value(call, "reason"); + agent_compact_stats stats = {0}; + if (!agent_worker_compact(w, reason && reason[0] ? reason : "context tool", + err, sizeof(err), &stats)) { + agent_buf b = {0}; + agent_buf_puts(&b, "Tool error: context compact failed: "); + agent_buf_puts(&b, err[0] ? 
err : "unknown error"); + agent_buf_puts(&b, "\n"); + return agent_buf_take(&b); + } + char msg[256]; + snprintf(msg, sizeof(msg), + "context action=compact status=ok old_tokens=%d new_tokens=%d removed_tokens=%d reduction_percent=%.1f summary_tokens=%d tail_tokens=%d\n", + stats.old_tokens, stats.new_tokens, stats.removed_tokens, + stats.reduction_percent, stats.summary_tokens, + stats.tail_tokens); + return xstrdup(msg); + } + return xstrdup("Tool error: unknown context action\n"); +} + +static bool agent_tool_call_requires_exclusive_context(const agent_tool_call *call) { + if (!call || !call->name || strcmp(call->name, "context")) return false; + const char *action = agent_tool_arg_value(call, "action"); + return action && + (!strcmp(action, "checkpoint") || + !strcmp(action, "restore") || + !strcmp(action, "compact")); +} + /* ============================================================================ * Tool Dispatch * ============================================================================ @@ -6729,8 +7378,10 @@ static pid_t agent_tool_pid(const agent_tool_call *call) { /* Execute one parsed DSML tool call and return the text that will be appended as * the tool-role result. UI visualization already happened while streaming; this * function is only about side effects and the model-visible observation. 
*/ -static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *call) { +static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *call, + bool *already_appended) { agent_buf result = {0}; + if (already_appended) *already_appended = false; if (!call->name) return xstrdup("Tool error: missing tool name\n"); if (!strcmp(call->name, "read")) return agent_tool_read(w, call); @@ -6741,6 +7392,7 @@ static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *cal if (!strcmp(call->name, "search")) return agent_tool_search(w, call); if (!strcmp(call->name, "google_search")) return agent_tool_google_search(w, call); if (!strcmp(call->name, "visit_page")) return agent_tool_visit_page(w, call); + if (!strcmp(call->name, "context")) return agent_tool_context(w, call, already_appended); if (!strcmp(call->name, "bash")) { const char *cmd = agent_tool_arg_value(call, "command"); @@ -6756,6 +7408,7 @@ static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *cal agent_buf_puts(&result, "\n"); return agent_buf_take(&result); } + agent_context_note_side_effect(w, "bash", cmd); return agent_bash_job_tool_result(w, job, true, refresh, false, true); } @@ -6775,7 +7428,14 @@ static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *cal 60, 1, 3600); bool stop = !strcmp(call->name, "bash_stop"); bool wait = stop; - return agent_bash_job_tool_result(w, job, wait, refresh, stop, true); + char *res = agent_bash_job_tool_result(w, job, wait, refresh, stop, true); + if (stop) { + char detail[160]; + snprintf(detail, sizeof(detail), "job=%d pid=%ld", + job_id, (long)pid); + agent_context_note_side_effect(w, "bash_stop", detail); + } + return res; } { @@ -6791,10 +7451,25 @@ static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *cal /* Execute all tool calls from one DSML block, preserving per-call labels in the * combined result so the model can associate observations with calls. 
*/ -static char *agent_execute_tool_calls(agent_worker *w, const agent_tool_calls *calls) { +static char *agent_execute_tool_calls(agent_worker *w, const agent_tool_calls *calls, + bool *already_appended) { agent_buf all = {0}; + if (already_appended) *already_appended = false; + for (int i = 0; i < calls->len; i++) { + if (agent_tool_call_requires_exclusive_context(&calls->v[i]) && + calls->len != 1) + { + return xstrdup("Tool error: context checkpoint, restore, and compact must be the only tool call in a DSML block\n"); + } + } for (int i = 0; i < calls->len; i++) { - char *res = agent_execute_tool_call(w, &calls->v[i]); + bool one_appended = false; + char *res = agent_execute_tool_call(w, &calls->v[i], &one_appended); + if (one_appended) { + if (already_appended) *already_appended = true; + free(all.ptr); + return res; + } char hdr[128]; snprintf(hdr, sizeof(hdr), "Tool result %d (%s):\n", i + 1, calls->v[i].name ? calls->v[i].name : "unknown"); @@ -6901,6 +7576,7 @@ static char *agent_compact_make_prompt(const char *reason) { "- decisions, rejected approaches, known bugs, and pending next steps\n" "- reloadable bulky data with exact paths/ranges/commands when available\n\n" "Do not invent facts. Do not include generic narration. Do not include raw file contents unless they were essential to a conclusion.\n" + "Use plain text headings or bullets. Do not output XML/HTML-like tags such as , , or .\n" "After the summary, stop. 
Do not continue the user task, do not call tools, and do not output thinking tags or DSML markup.\n" "Output only the compact summary.\n"); if (reason && reason[0]) { @@ -6911,14 +7587,41 @@ static char *agent_compact_make_prompt(const char *reason) { return agent_buf_take(&b); } +static bool agent_compact_summary_has_signal(const char *s) { + while (*s && isspace((unsigned char)*s)) s++; + if (*s == '<') return false; + if (strstr(s, "= 2) words++; + run = 0; + } + } + if (run >= 2) words++; + return alnum >= 24 && words >= 6; +} + /* Perform the full compaction exchange and rebuild the live DS4 session from * the compacted transcript. Any failure invalidates live KV because the model * may have just seen private compaction instructions that are not part of the * real conversation. */ static bool agent_worker_compact(agent_worker *w, const char *reason, - char *err, size_t err_len) { + char *err, size_t err_len, + agent_compact_stats *stats) { + if (stats) memset(stats, 0, sizeof(*stats)); const int bottom = w->transcript.len; if (bottom <= 0) return true; + if (stats) { + stats->old_tokens = bottom; + stats->new_tokens = bottom; + } ds4_tokens sys = {0}; agent_worker_build_system_tokens(w, &sys); @@ -7026,8 +7729,10 @@ static bool agent_worker_compact(agent_worker *w, const char *reason, agent_publish(w, "\x1b[0m\n", 5); ds4_tokens_free(&prompt); - if (!summary.ptr || !summary.ptr[0]) { - snprintf(err, err_len, "compaction summary was empty"); + if (!summary.ptr || !summary.ptr[0] || + !agent_compact_summary_has_signal(summary.ptr)) + { + snprintf(err, err_len, "compaction summary was empty or malformed"); ds4_session_invalidate(w->session); ds4_tokens_free(&sys); free(summary.ptr); @@ -7045,7 +7750,9 @@ static bool agent_worker_compact(agent_worker *w, const char *reason, if (summary_msg.len && summary_msg.ptr[summary_msg.len - 1] != '\n') agent_buf_puts(&summary_msg, "\n"); agent_buf_puts(&summary_msg, "[End compacted summary. 
Recent conversation continues verbatim below.]\n\n"); + int before_summary_tokens = compacted.len; ds4_chat_append_message(w->engine, &compacted, "system", summary_msg.ptr); + int summary_tokens = compacted.len - before_summary_tokens; free(summary_msg.ptr); free(summary.ptr); @@ -7081,13 +7788,23 @@ static bool agent_worker_compact(agent_worker *w, const char *reason, agent_trace(w, "compacted reason=\"%s\" old=%d new=%d tail_start=%d tail=%d", reason ? reason : "", bottom, w->transcript.len, tail_start, bottom - tail_start); + if (stats) { + stats->old_tokens = bottom; + stats->new_tokens = w->transcript.len; + stats->summary_tokens = summary_tokens; + stats->tail_tokens = bottom - tail_start; + stats->removed_tokens = bottom - w->transcript.len; + if (stats->removed_tokens < 0) stats->removed_tokens = 0; + stats->reduction_percent = bottom > 0 ? + ((double)stats->removed_tokens * 100.0) / (double)bottom : 0.0; + } return true; } static bool agent_worker_compact_if_needed(agent_worker *w, const char *reason, char *err, size_t err_len) { if (!agent_worker_should_compact(w)) return true; - return agent_worker_compact(w, reason, err, err_len); + return agent_worker_compact(w, reason, err, err_len, NULL); } static int worker_accept_generated_token(agent_worker *w, @@ -7338,6 +8055,7 @@ static int worker_run_turn(agent_worker *w, const char *user_text) { } char *tool_result; + bool tool_result_already_appended = false; if (early_tool_error) { agent_buf b = {0}; agent_buf_puts(&b, "Tool error: "); @@ -7354,15 +8072,17 @@ static int worker_run_turn(agent_worker *w, const char *user_text) { agent_buf_puts(&b, agent_dsml_syntax_reminder); tool_result = agent_buf_take(&b); } else { - tool_result = agent_execute_tool_calls(w, &dsml.calls); + tool_result = agent_execute_tool_calls(w, &dsml.calls, + &tool_result_already_appended); } int projected_tokens = 0; - if (!agent_tool_result_fits_context(w, tool_result, + if (!tool_result_already_appended && + 
!agent_tool_result_fits_context(w, tool_result, AGENT_TOOL_RESULT_RESERVE_TOKENS, &projected_tokens)) { if (!agent_worker_compact(w, "tool result would exceed context", - compact_err, sizeof(compact_err))) + compact_err, sizeof(compact_err), NULL)) { free(tool_result); agent_dsml_parser_free(&dsml); @@ -7392,7 +8112,8 @@ static int worker_run_turn(agent_worker *w, const char *user_text) { } } } - ds4_chat_append_message(w->engine, &w->transcript, "tool", tool_result); + if (!tool_result_already_appended) + ds4_chat_append_message(w->engine, &w->transcript, "tool", tool_result); free(tool_result); agent_dsml_parser_free(&dsml); @@ -7499,7 +8220,7 @@ static void worker_run_deferred_compact(agent_worker *w) { int before = w->transcript.len; char err[160] = {0}; - if (agent_worker_compact(w, "user requested compaction", err, sizeof(err))) { + if (agent_worker_compact(w, "user requested compaction", err, sizeof(err), NULL)) { if (w->transcript.len != before) { pthread_mutex_lock(&w->mu); w->session_dirty = true; @@ -8857,6 +9578,8 @@ static int agent_worker_init(agent_worker *w, ds4_engine *engine, agent_config * .log_privdata = w, }; w->web = ds4_web_create(&web_cfg); + w->context_dir = ds4_kvstore_path_join(w->cache_dir, "context"); + w->world_epoch = ds4_agent_context_max_world_epoch(w->context_dir); w->sysprompt_path = ds4_kvstore_path_join(w->cache_dir, "sysprompt.kv"); if (cfg->gen.trace_path && cfg->gen.trace_path[0]) { w->trace = fopen(cfg->gen.trace_path, "ab"); @@ -8876,10 +9599,12 @@ static void agent_worker_free(agent_worker *w) { worker_stop(w); if (w->thread) pthread_join(w->thread, NULL); agent_bash_jobs_free(w); + ds4_agent_side_effects_free(&w->side_effects); ds4_web_free(w->web); ds4_session_free(w->session); ds4_tokens_free(&w->transcript); free(w->cache_dir); + free(w->context_dir); free(w->sysprompt_path); free(w->session_title); free(w->legacy_session_path_to_delete); diff --git a/ds4_agent_context.c b/ds4_agent_context.c new file mode 100644 index 
00000000..054a5a98 --- /dev/null +++ b/ds4_agent_context.c @@ -0,0 +1,657 @@ +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#endif + +#include "ds4_agent_context.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DS4_AGENT_CONTEXT_MAX_META_BYTES (1024 * 1024) +#define DS4_AGENT_CONTEXT_MAX_SIDE_EFFECTS 64 + +typedef struct { + char *ptr; + size_t len; + size_t cap; +} ds4_agent_context_buf; + +static void ctx_set_err(char *err, size_t err_len, const char *fmt, ...) { + if (!err || err_len == 0) return; + va_list ap; + va_start(ap, fmt); + vsnprintf(err, err_len, fmt, ap); + va_end(ap); +} + +static void *ctx_xmalloc(size_t n) { + void *p = malloc(n ? n : 1); + if (!p) { + fprintf(stderr, "ds4-agent-context: out of memory\n"); + abort(); + } + return p; +} + +static void *ctx_xrealloc(void *ptr, size_t n) { + void *p = realloc(ptr, n ? n : 1); + if (!p) { + fprintf(stderr, "ds4-agent-context: out of memory\n"); + abort(); + } + return p; +} + +static char *ctx_xstrdup(const char *s) { + if (!s) s = ""; + size_t n = strlen(s); + char *out = ctx_xmalloc(n + 1); + memcpy(out, s, n + 1); + return out; +} + +static void ctx_buf_append(ds4_agent_context_buf *b, const char *s, size_t n) { + if (n == 0) return; + if (b->len + n + 1 > b->cap) { + size_t cap = b->cap ? 
b->cap : 256;
+        while (cap < b->len + n + 1) cap *= 2;
+        b->ptr = ctx_xrealloc(b->ptr, cap);
+        b->cap = cap;
+    }
+    memcpy(b->ptr + b->len, s, n);
+    b->len += n;
+    b->ptr[b->len] = '\0';
+}
+
+/* Append a NUL-terminated string; a NULL `s` appends nothing. */
+static void ctx_buf_puts(ds4_agent_context_buf *b, const char *s) {
+    if (s) ctx_buf_append(b, s, strlen(s));
+}
+
+/* Hand the accumulated string to the caller and reset the buffer. An unused
+ * buffer yields a freshly allocated empty string, never NULL. */
+static char *ctx_buf_take(ds4_agent_context_buf *b) {
+    if (!b->ptr) return ctx_xstrdup("");
+    char *out = b->ptr;
+    b->ptr = NULL;
+    b->len = 0;
+    b->cap = 0;
+    return out;
+}
+
+/* Read a whole file into a NUL-terminated heap buffer.
+ * Files larger than DS4_AGENT_CONTEXT_MAX_META_BYTES are rejected.
+ * Returns 0 on success and -1 on failure with `err` filled in. `data` and
+ * `len` may each be NULL when the caller does not need them. */
+static int ctx_read_file_bytes(const char *path, char **data, size_t *len,
+                               char *err, size_t err_len) {
+    FILE *fp = fopen(path, "rb");
+    if (!fp) {
+        ctx_set_err(err, err_len, "%s", strerror(errno));
+        return -1;
+    }
+    if (fseek(fp, 0, SEEK_END) != 0) {
+        ctx_set_err(err, err_len, "%s", strerror(errno));
+        fclose(fp);
+        return -1;
+    }
+    long sz = ftell(fp);
+    if (sz < 0) {
+        ctx_set_err(err, err_len, "%s", strerror(errno));
+        fclose(fp);
+        return -1;
+    }
+    if ((unsigned long)sz > DS4_AGENT_CONTEXT_MAX_META_BYTES) {
+        ctx_set_err(err, err_len, "metadata file too large: %s", path);
+        fclose(fp);
+        return -1;
+    }
+    rewind(fp);
+    char *buf = ctx_xmalloc((size_t)sz + 1);
+    size_t got = fread(buf, 1, (size_t)sz, fp);
+    if (got != (size_t)sz && ferror(fp)) {
+        ctx_set_err(err, err_len, "%s", strerror(errno));
+        free(buf);
+        fclose(fp);
+        return -1;
+    }
+    buf[got] = '\0';
+    fclose(fp);
+    if (data) *data = buf; else free(buf);
+    if (len) *len = got;
+    return 0;
+}
+
+/* Free the heap members of `m` and zero the struct, so calling it twice on
+ * the same struct is harmless. */
+void ds4_agent_context_meta_free(ds4_agent_context_meta *m) {
+    if (!m) return;
+    free(m->label);
+    free(m->kv_file);
+    free(m->memory_file);
+    memset(m, 0, sizeof(*m));
+}
+
+/* A checkpoint id is exactly 40 hexadecimal characters. */
+bool ds4_agent_context_id_valid(const char *id) {
+    if (!id || strlen(id) != 40) return false;
+    for (int i = 0; i < 40; i++) {
+        if (!isxdigit((unsigned char)id[i])) return false;
+    }
+    return true;
+}
+
+/* Return true when `s` can be used as a single file name inside the context
+ * directory: non-empty, no '/' or '\\' separator, and not "." or "..". */
+bool ds4_agent_context_file_component_safe(const char *s) {
+    if (!s || !s[0]) return false;
+    /* "." and ".." contain no separator, but joined to the context directory
+     * they name that directory or its parent rather than a file inside it. */
+    if (!strcmp(s, ".") || !strcmp(s, "..")) return false;
+    for (const char *p = s; *p; p++) {
+        if (*p == '/' || *p == '\\') return false;
+    }
+    return true;
+}
+
+/* Build "<id><suffix>" from the first 40 bytes of `id`. Caller frees. */
+char *ds4_agent_context_file_name(const char id[41], const char *suffix) {
+    ds4_agent_context_buf b = {0};
+    ctx_buf_append(&b, id, 40);
+    ctx_buf_puts(&b, suffix);
+    return ctx_buf_take(&b);
+}
+
+/* Join `context_dir` and `file` with exactly one '/' between them. When either
+ * part is NULL or empty, a copy of the other is returned. Caller frees. */
+char *ds4_agent_context_path_for_file(const char *context_dir, const char *file) {
+    if (!context_dir || !context_dir[0]) return ctx_xstrdup(file ? file : "");
+    if (!file || !file[0]) return ctx_xstrdup(context_dir);
+    size_t dir_len = strlen(context_dir);
+    bool need_sep = context_dir[dir_len - 1] != '/';
+    ds4_agent_context_buf b = {0};
+    ctx_buf_puts(&b, context_dir);
+    if (need_sep) ctx_buf_puts(&b, "/");
+    ctx_buf_puts(&b, file);
+    return ctx_buf_take(&b);
+}
+
+/* Duplicate at most `max` bytes of `s` (NULL yields ""). Caller frees.
+ * When the cut would land inside a multi-byte UTF-8 sequence, the copy is
+ * shortened by up to three bytes so the partial sequence is dropped. Input
+ * that is not valid UTF-8 around the cut falls back to the plain byte cut. */
+char *ds4_agent_context_limited_strdup(const char *s, size_t max) {
+    if (!s) return ctx_xstrdup("");
+    size_t n = strlen(s);
+    if (n > max) {
+        /* s[cut] is the first byte left out; a continuation byte there
+         * (10xxxxxx) means the sequence before it would be split. */
+        size_t cut = max;
+        while (cut > 0 && max - cut < 3 &&
+               ((unsigned char)s[cut] & 0xC0) == 0x80)
+            cut--;
+        n = ((unsigned char)s[cut] & 0xC0) != 0x80 ? cut : max;
+    }
+    char *out = ctx_xmalloc(n + 1);
+    memcpy(out, s, n);
+    out[n] = '\0';
+    return out;
+}
+
+/* Length-limited copy of `s` with newline, carriage return and tab replaced
+ * by spaces so it fits on one line. Caller frees. */
+char *ds4_agent_context_oneline(const char *s, size_t max) {
+    char *out = ds4_agent_context_limited_strdup(s, max);
+    for (char *p = out; *p; p++) {
+        if (*p == '\n' || *p == '\r' || *p == '\t') *p = ' ';
+    }
+    return out;
+}
+
+/* Append `s` to `b` escaped for a JSON string: backslash, quote, \n, \r and
+ * \t get short escapes, other bytes below 0x20 become \u00XX, and everything
+ * else is copied unchanged. */
+static void ctx_json_escape(ds4_agent_context_buf *b, const char *s) {
+    if (!s) return;
+    for (const unsigned char *p = (const unsigned char *)s; *p; p++) {
+        switch (*p) {
+        case '\\': ctx_buf_puts(b, "\\\\"); break;
+        case '"': ctx_buf_puts(b, "\\\""); break;
+        case '\n': ctx_buf_puts(b, "\\n"); break;
+        case '\r': ctx_buf_puts(b, "\\r"); break;
+        case '\t': ctx_buf_puts(b, "\\t"); break;
+        default:
+            if (*p < 0x20) {
+                char tmp[8];
+                snprintf(tmp, sizeof(tmp), "\\u%04x", *p);
+                ctx_buf_puts(b, tmp);
+            } else {
+                char c = (char)*p;
+                ctx_buf_append(b, &c, 1);
+            }
+            break;
+        }
+    }
+}
+
+static bool ctx_write_atomic_text(const char *path, const char *text,
+                                  char *err, size_t err_len) {
+    ds4_agent_context_buf tmpl = {0};
+    ctx_buf_puts(&tmpl, path);
+    ctx_buf_puts(&tmpl, ".tmp.XXXXXX");
+    char
*tmp = ctx_buf_take(&tmpl); + int fd = mkstemp(tmp); + if (fd < 0) { + ctx_set_err(err, err_len, "%s", strerror(errno)); + free(tmp); + return false; + } + FILE *fp = fdopen(fd, "wb"); + if (!fp) { + ctx_set_err(err, err_len, "%s", strerror(errno)); + close(fd); + unlink(tmp); + free(tmp); + return false; + } + size_t len = strlen(text ? text : ""); + errno = 0; + bool ok = fwrite(text ? text : "", 1, len, fp) == len && fflush(fp) == 0; + int saved_errno = errno; + if (fclose(fp) != 0) { + if (!saved_errno) saved_errno = errno; + ok = false; + } + if (ok && rename(tmp, path) != 0) { + saved_errno = errno; + ok = false; + } + if (!ok) { + ctx_set_err(err, err_len, "%s", + saved_errno ? strerror(saved_errno) : "write failed"); + unlink(tmp); + } + free(tmp); + return ok; +} + +bool ds4_agent_context_write_meta(const ds4_agent_context_meta *m, + const char *meta_path, + char *err, size_t err_len) { + ds4_agent_context_buf b = {0}; + char num[80]; + ctx_buf_puts(&b, "{\n"); + ctx_buf_puts(&b, " \"id\": \""); + ctx_json_escape(&b, m->id); + ctx_buf_puts(&b, "\",\n \"label\": \""); + ctx_json_escape(&b, m->label ? m->label : ""); + ctx_buf_puts(&b, "\",\n"); + snprintf(num, sizeof(num), " \"created_at\": %" PRIu64 ",\n", m->created_at); + ctx_buf_puts(&b, num); + snprintf(num, sizeof(num), " \"world_epoch\": %" PRIu64 ",\n", m->world_epoch); + ctx_buf_puts(&b, num); + snprintf(num, sizeof(num), " \"transcript_tokens\": %d,\n", m->transcript_tokens); + ctx_buf_puts(&b, num); + ctx_buf_puts(&b, " \"kv_path\": \""); + ctx_json_escape(&b, m->kv_file ? m->kv_file : ""); + ctx_buf_puts(&b, "\",\n \"memory_path\": \""); + ctx_json_escape(&b, m->memory_file ? 
m->memory_file : "");
+    ctx_buf_puts(&b, "\",\n \"memory_sha1\": null\n}\n");
+    char *text = ctx_buf_take(&b);
+    bool ok = ctx_write_atomic_text(meta_path, text, err, err_len);
+    free(text);
+    return ok;
+}
+
+/* Given `p` at an opening quote, return the position just past the closing
+ * quote, honoring backslash escapes. An unterminated string returns the
+ * position of the terminating NUL. If `p` is not at a quote it is returned
+ * unchanged. */
+static const char *ctx_json_skip_string(const char *p) {
+    if (!p || *p != '"') return p;
+    p++;
+    while (*p) {
+        if (*p == '\\' && p[1]) {
+            p += 2;
+            continue;
+        }
+        if (*p == '"') return p + 1;
+        p++;
+    }
+    return p;
+}
+
+/* Compare the raw string contents [start, end) with `key`. A backslash in the
+ * raw text makes the following character compare literally. */
+static bool ctx_json_key_matches(const char *start, const char *end,
+                                 const char *key) {
+    const char *p = start;
+    const char *k = key;
+    while (p < end) {
+        char c = *p++;
+        if (c == '\\' && p < end) c = *p++;
+        if (*k != c) return false;
+        k++;
+    }
+    return *k == '\0';
+}
+
+/* Find the first string in `json` that equals `key` and is followed by ':',
+ * and return a pointer to the first non-space character of its value.
+ * The scan is flat: it does not track object nesting. Returns NULL when the
+ * key is missing or a string is unterminated. */
+static const char *ctx_json_find_value(const char *json, const char *key) {
+    const char *p = json;
+    while (p && *p) {
+        if (*p != '"') {
+            p++;
+            continue;
+        }
+        const char *start = p + 1;
+        const char *after = ctx_json_skip_string(p);
+        if (!after || after == p || after[-1] != '"') return NULL;
+        const char *end = after - 1;
+        const char *q = after;
+        while (*q && isspace((unsigned char)*q)) q++;
+        if (*q == ':' && ctx_json_key_matches(start, end, key)) {
+            q++;
+            while (*q && isspace((unsigned char)*q)) q++;
+            return q;
+        }
+        p = after;
+    }
+    return NULL;
+}
+
+/* Read the string value stored under `key` into a new heap string in `*out`.
+ * Returns false, leaving `*out` untouched, when the key is missing, the value
+ * is not a string, or the string is unterminated.
+ *
+ * Decodes the escapes \n \r \t \b \f \\ \" and \uXXXX, so control bytes that
+ * ctx_json_escape() writes as \u00XX round-trip. \uXXXX is emitted as UTF-8;
+ * U+0000 and lone surrogates become U+FFFD because they cannot be stored in a
+ * C string or encoded on their own. Any other escaped character, including a
+ * malformed \u, is copied literally. */
+static bool ctx_json_get_string(const char *json, const char *key, char **out) {
+    const char *p = ctx_json_find_value(json, key);
+    if (!p || *p != '"') return false;
+    p++;
+    ds4_agent_context_buf b = {0};
+    while (*p && *p != '"') {
+        if (*p == '\\') {
+            p++;
+            if (!*p) break;
+            switch (*p) {
+            case 'n': ctx_buf_puts(&b, "\n"); break;
+            case 'r': ctx_buf_puts(&b, "\r"); break;
+            case 't': ctx_buf_puts(&b, "\t"); break;
+            case 'b': ctx_buf_puts(&b, "\b"); break;
+            case 'f': ctx_buf_puts(&b, "\f"); break;
+            case '\\': ctx_buf_puts(&b, "\\"); break;
+            case '"': ctx_buf_puts(&b, "\""); break;
+            case 'u': {
+                unsigned cp = 0;
+                int digits = 0;
+                /* isxdigit() is false for NUL, so this never reads past the
+                 * end of the text. */
+                while (digits < 4 && isxdigit((unsigned char)p[1 + digits])) {
+                    unsigned char h = (unsigned char)p[1 + digits];
+                    unsigned v = h >= 'a' ? (unsigned)(h - 'a') + 10 :
+                                 h >= 'A' ? (unsigned)(h - 'A') + 10 :
+                                            (unsigned)(h - '0');
+                    cp = cp * 16 + v;
+                    digits++;
+                }
+                if (digits != 4) {
+                    /* Malformed escape: copy the 'u' literally. */
+                    ctx_buf_append(&b, p, 1);
+                    break;
+                }
+                p += 4; /* the p++ below steps past the last hex digit */
+                if (cp == 0 || (cp >= 0xD800 && cp <= 0xDFFF)) cp = 0xFFFD;
+                char utf8[3];
+                size_t ulen;
+                if (cp < 0x80) {
+                    utf8[0] = (char)cp;
+                    ulen = 1;
+                } else if (cp < 0x800) {
+                    utf8[0] = (char)(0xC0 | (cp >> 6));
+                    utf8[1] = (char)(0x80 | (cp & 0x3F));
+                    ulen = 2;
+                } else {
+                    utf8[0] = (char)(0xE0 | (cp >> 12));
+                    utf8[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
+                    utf8[2] = (char)(0x80 | (cp & 0x3F));
+                    ulen = 3;
+                }
+                ctx_buf_append(&b, utf8, ulen);
+                break;
+            }
+            default: ctx_buf_append(&b, p, 1); break;
+            }
+            p++;
+        } else {
+            ctx_buf_append(&b, p, 1);
+            p++;
+        }
+    }
+    if (*p != '"') {
+        free(b.ptr);
+        return false;
+    }
+    *out = ctx_buf_take(&b);
+    return true;
+}
+
+/* Read an unsigned decimal value stored under `key`. Returns false when the
+ * key is missing, the value does not start with a digit, or the number does
+ * not fit (strtoull reports ERANGE). */
+static bool ctx_json_get_u64(const char *json, const char *key, uint64_t *out) {
+    const char *p = ctx_json_find_value(json, key);
+    if (!p || !isdigit((unsigned char)*p)) return false;
+    char *end = NULL;
+    errno = 0;
+    unsigned long long v = strtoull(p, &end, 10);
+    /* On overflow strtoull returns ULLONG_MAX, which would otherwise be
+     * accepted as a real value. */
+    if (end == p || errno == ERANGE) return false;
+    *out = (uint64_t)v;
+    return true;
+}
+
+/* Like ctx_json_get_u64(), but rejects values above INT_MAX. */
+static bool ctx_json_get_int(const char *json, const char *key, int *out) {
+    uint64_t v = 0;
+    if (!ctx_json_get_u64(json, key, &v) || v > (uint64_t)INT_MAX) return false;
+    *out = (int)v;
+    return true;
+}
+
+/* Load checkpoint metadata from the JSON file at `path` into `m`.
+ * id, created_at, world_epoch and transcript_tokens are required. A missing
+ * label becomes "", and missing kv_path/memory_path default to
+ * "<id>.kv" and "<id>.memory.md". The file names must pass
+ * ds4_agent_context_file_component_safe() (an empty memory file name is
+ * allowed). On failure `m` is freed and zeroed and `err` is set. */
+bool ds4_agent_context_read_meta_file(const char *path,
+                                      ds4_agent_context_meta *m,
+                                      char *err, size_t err_len) {
+    char *json = NULL;
+    size_t len = 0;
+    if (ctx_read_file_bytes(path, &json, &len, err, err_len) != 0) return false;
+    (void)len;
+    memset(m, 0, sizeof(*m));
+    char *id = NULL;
+    bool ok = ctx_json_get_string(json, "id", &id);
+    if (ok) {
+        if (strlen(id) < sizeof(m->id)) snprintf(m->id, sizeof(m->id), "%s", id);
+        else ok = false;
+        free(id);
+        if (ok) ok = ds4_agent_context_id_valid(m->id);
+    }
+    if (ok && !ctx_json_get_string(json, "label", &m->label))
+        m->label = ctx_xstrdup("");
+    if (ok && !ctx_json_get_u64(json, "created_at", &m->created_at))
+        ok = false;
+    if (ok && !ctx_json_get_u64(json, "world_epoch", &m->world_epoch))
+        ok = false;
+    if (ok && !ctx_json_get_int(json, "transcript_tokens", &m->transcript_tokens))
+        ok = false;
+    if (ok && !ctx_json_get_string(json, "kv_path", &m->kv_file))
+        m->kv_file = ds4_agent_context_file_name(m->id, ".kv");
+    if (ok && !ctx_json_get_string(json, "memory_path", &m->memory_file))
+        m->memory_file = ds4_agent_context_file_name(m->id, ".memory.md");
+    if (ok && !ds4_agent_context_file_component_safe(m->kv_file)) ok = false;
+    if (ok && m->memory_file && m->memory_file[0] &&
+        !ds4_agent_context_file_component_safe(m->memory_file)) ok = false;
+    if (!ok) {
+        ctx_set_err(err, err_len, "invalid context metadata: %s", path);
+        ds4_agent_context_meta_free(m);
+    }
+
free(json); + return ok; +} + +bool ds4_agent_context_meta_filename(const char *name) { + size_t n = strlen(name); + static const char suffix[] = ".meta.json"; + size_t s = sizeof(suffix) - 1; + return n > s && !strcmp(name + n - s, suffix); +} + +int ds4_agent_context_count_checkpoints(const char *context_dir) { + DIR *d = opendir(context_dir); + if (!d) return 0; + int count = 0; + struct dirent *de; + while ((de = readdir(d)) != NULL) { + if (ds4_agent_context_meta_filename(de->d_name)) count++; + } + closedir(d); + return count; +} + +uint64_t ds4_agent_context_max_world_epoch(const char *context_dir) { + DIR *d = opendir(context_dir); + if (!d) return 0; + uint64_t max_epoch = 0; + struct dirent *de; + while ((de = readdir(d)) != NULL) { + if (!ds4_agent_context_meta_filename(de->d_name)) continue; + char *meta_path = ds4_agent_context_path_for_file(context_dir, de->d_name); + ds4_agent_context_meta m = {0}; + char err[160] = {0}; + if (ds4_agent_context_read_meta_file(meta_path, &m, err, sizeof(err)) && + m.world_epoch > max_epoch) + max_epoch = m.world_epoch; + ds4_agent_context_meta_free(&m); + free(meta_path); + } + closedir(d); + return max_epoch; +} + +char *ds4_agent_context_full_kv_path(const char *context_dir, + const ds4_agent_context_meta *m) { + return ds4_agent_context_path_for_file(context_dir, m->kv_file); +} + +char *ds4_agent_context_full_memory_path(const char *context_dir, + const ds4_agent_context_meta *m) { + if (!m->memory_file || !m->memory_file[0]) return NULL; + return ds4_agent_context_path_for_file(context_dir, m->memory_file); +} + +bool ds4_agent_context_find_checkpoint(const char *context_dir, + const char *prefix, + ds4_agent_context_meta *found, + char **meta_path_out, + char **kv_path_out, + char *err, size_t err_len) { + if (!prefix || !prefix[0]) { + ctx_set_err(err, err_len, "context id is required"); + return false; + } + size_t prefix_len = strlen(prefix); + DIR *d = opendir(context_dir); + if (!d) { + ctx_set_err(err, 
err_len, "no context checkpoints found"); + return false; + } + bool matched = false; + bool ambiguous = false; + ds4_agent_context_meta best = {0}; + char *best_meta_path = NULL; + char *best_kv_path = NULL; + struct dirent *de; + while ((de = readdir(d)) != NULL) { + if (!ds4_agent_context_meta_filename(de->d_name)) continue; + char *meta_path = ds4_agent_context_path_for_file(context_dir, de->d_name); + ds4_agent_context_meta m = {0}; + char parse_err[160] = {0}; + if (!ds4_agent_context_read_meta_file(meta_path, &m, parse_err, + sizeof(parse_err))) { + free(meta_path); + continue; + } + if (!strncmp(m.id, prefix, prefix_len)) { + if (matched) { + ambiguous = true; + ds4_agent_context_meta_free(&m); + free(meta_path); + break; + } + matched = true; + best = m; + best_meta_path = meta_path; + best_kv_path = ds4_agent_context_full_kv_path(context_dir, &best); + } else { + ds4_agent_context_meta_free(&m); + free(meta_path); + } + } + closedir(d); + if (ambiguous) { + ds4_agent_context_meta_free(&best); + free(best_meta_path); + free(best_kv_path); + ctx_set_err(err, err_len, "context id prefix is ambiguous: %s", prefix); + return false; + } + if (!matched) { + ctx_set_err(err, err_len, "context checkpoint not found: %s", prefix); + return false; + } + *found = best; + if (meta_path_out) *meta_path_out = best_meta_path; else free(best_meta_path); + if (kv_path_out) *kv_path_out = best_kv_path; else free(best_kv_path); + return true; +} + +void ds4_agent_side_effects_free(ds4_agent_side_effects *effects) { + if (!effects) return; + ds4_agent_side_effect *e = effects->head; + while (e) { + ds4_agent_side_effect *next = e->next; + free(e->kind); + free(e->detail); + free(e); + e = next; + } + effects->head = NULL; + effects->count = 0; + effects->evicted_count = 0; + effects->latest_evicted_epoch = 0; +} + +uint64_t ds4_agent_side_effects_note(ds4_agent_side_effects *effects, + uint64_t current_epoch, + const char *kind, + const char *detail) { + if (!effects) return 
current_epoch; + uint64_t next_epoch = current_epoch == UINT64_MAX ? current_epoch : current_epoch + 1; + ds4_agent_side_effect *e = ctx_xmalloc(sizeof(*e)); + memset(e, 0, sizeof(*e)); + e->epoch = next_epoch; + e->kind = ctx_xstrdup(kind && kind[0] ? kind : "tool"); + e->detail = ctx_xstrdup(detail && detail[0] ? detail : ""); + e->next = effects->head; + effects->head = e; + effects->count++; + + while (effects->count > DS4_AGENT_CONTEXT_MAX_SIDE_EFFECTS) { + ds4_agent_side_effect **link = &effects->head; + while (*link && (*link)->next) link = &(*link)->next; + if (!*link) break; + ds4_agent_side_effect *old = *link; + *link = NULL; + effects->evicted_count++; + if (old->epoch > effects->latest_evicted_epoch) + effects->latest_evicted_epoch = old->epoch; + free(old->kind); + free(old->detail); + free(old); + effects->count--; + } + + return next_epoch; +} + +char *ds4_agent_side_effects_summary_since(const ds4_agent_side_effects *effects, + uint64_t epoch) { + ds4_agent_context_buf b = {0}; + if (effects && effects->latest_evicted_epoch > epoch) { + char line[256]; + snprintf(line, sizeof(line), + "Known side effects after checkpoint may be incomplete: " + "%" PRIu64 " older side effect(s) were dropped from memory " + "up to epoch=%" PRIu64 ".\n", + effects->evicted_count, effects->latest_evicted_epoch); + ctx_buf_puts(&b, line); + } + int shown = 0; + for (const ds4_agent_side_effect *e = effects ? effects->head : NULL; e; e = e->next) { + if (e->epoch <= epoch) continue; + if (shown == 0) ctx_buf_puts(&b, "Known side effects after checkpoint:\n"); + char *detail = ds4_agent_context_oneline(e->detail, 180); + char line[320]; + snprintf(line, sizeof(line), "- epoch=%" PRIu64 " %s %s\n", + e->epoch, e->kind ? e->kind : "tool", detail); + ctx_buf_puts(&b, line); + free(detail); + shown++; + if (shown >= 8) { + ctx_buf_puts(&b, "- ... 
more side effects omitted ...\n"); + break; + } + } + return ctx_buf_take(&b); +} + +bool ds4_agent_context_no_running_bash_guard(const char *action, + int running_bash_jobs, + char *err, + size_t err_len) { + if (running_bash_jobs <= 0) return true; + ctx_set_err(err, err_len, + "context %s denied because %d bash job(s) are still running; " + "use bash_status or bash_stop first", + action && action[0] ? action : "operation", + running_bash_jobs); + return false; +} + +bool ds4_agent_context_restore_epoch_guard(uint64_t current_epoch, + uint64_t checkpoint_epoch, + bool allow_side_effect_mismatch, + char *err, + size_t err_len) { + if (current_epoch == checkpoint_epoch || allow_side_effect_mismatch) + return true; + ctx_set_err(err, err_len, + "restore would rewind model context from world_epoch=%" PRIu64 + " to %" PRIu64 ", but external side effects may still exist. " + "Revert or inspect those effects, or call context restore with " + "allow_side_effect_mismatch=true.", + current_epoch, checkpoint_epoch); + return false; +} + +char *ds4_agent_context_restore_expected_metrics_line( + const ds4_agent_context_restore_metrics *metrics) { + ds4_agent_context_buf b = {0}; + char line[384]; + int checkpoint_tokens = metrics ? metrics->checkpoint_tokens : 0; + int notice_tokens = metrics ? metrics->restore_notice_tokens : 0; + int restored_tokens = metrics ? 
metrics->restored_tokens : 0; + if (checkpoint_tokens < 0) checkpoint_tokens = 0; + if (notice_tokens < 0) notice_tokens = 0; + if (restored_tokens < 0) restored_tokens = 0; + snprintf(line, sizeof(line), + "KV restore expected metrics: checkpoint_tokens=%d expected_restore_notice_tokens=%d expected_restored_tokens=%d expected_prefill_suffix_tokens=%d expected_full_prefill_tokens_without_kv=%d expected_saved_prefill_tokens=%d.\n", + checkpoint_tokens, notice_tokens, restored_tokens, + notice_tokens, restored_tokens, checkpoint_tokens); + ctx_buf_puts(&b, line); + return ctx_buf_take(&b); +} diff --git a/ds4_agent_context.h b/ds4_agent_context.h new file mode 100644 index 00000000..b36b31b1 --- /dev/null +++ b/ds4_agent_context.h @@ -0,0 +1,85 @@ +#ifndef DS4_AGENT_CONTEXT_H +#define DS4_AGENT_CONTEXT_H + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +typedef struct ds4_agent_context_meta { + char id[41]; + char *label; + char *kv_file; + char *memory_file; + uint64_t created_at; + uint64_t world_epoch; + int transcript_tokens; +} ds4_agent_context_meta; + +typedef struct ds4_agent_side_effect { + uint64_t epoch; + char *kind; + char *detail; + struct ds4_agent_side_effect *next; +} ds4_agent_side_effect; + +typedef struct ds4_agent_side_effects { + ds4_agent_side_effect *head; + int count; + uint64_t evicted_count; + uint64_t latest_evicted_epoch; +} ds4_agent_side_effects; + +typedef struct ds4_agent_context_restore_metrics { + int checkpoint_tokens; + int restore_notice_tokens; + int restored_tokens; +} ds4_agent_context_restore_metrics; + +void ds4_agent_context_meta_free(ds4_agent_context_meta *m); +bool ds4_agent_context_id_valid(const char *id); +bool ds4_agent_context_file_component_safe(const char *s); +char *ds4_agent_context_file_name(const char id[41], const char *suffix); +char *ds4_agent_context_path_for_file(const char *context_dir, const char *file); +char *ds4_agent_context_limited_strdup(const char *s, size_t max); +char *ds4_agent_context_oneline(const char *s, 
size_t max); + +bool ds4_agent_context_write_meta(const ds4_agent_context_meta *m, + const char *meta_path, + char *err, size_t err_len); +bool ds4_agent_context_read_meta_file(const char *path, + ds4_agent_context_meta *m, + char *err, size_t err_len); +bool ds4_agent_context_meta_filename(const char *name); +int ds4_agent_context_count_checkpoints(const char *context_dir); +uint64_t ds4_agent_context_max_world_epoch(const char *context_dir); +char *ds4_agent_context_full_kv_path(const char *context_dir, + const ds4_agent_context_meta *m); +char *ds4_agent_context_full_memory_path(const char *context_dir, + const ds4_agent_context_meta *m); +bool ds4_agent_context_find_checkpoint(const char *context_dir, + const char *prefix, + ds4_agent_context_meta *found, + char **meta_path_out, + char **kv_path_out, + char *err, size_t err_len); + +void ds4_agent_side_effects_free(ds4_agent_side_effects *effects); +uint64_t ds4_agent_side_effects_note(ds4_agent_side_effects *effects, + uint64_t current_epoch, + const char *kind, + const char *detail); +char *ds4_agent_side_effects_summary_since(const ds4_agent_side_effects *effects, + uint64_t epoch); +bool ds4_agent_context_no_running_bash_guard(const char *action, + int running_bash_jobs, + char *err, + size_t err_len); +bool ds4_agent_context_restore_epoch_guard(uint64_t current_epoch, + uint64_t checkpoint_epoch, + bool allow_side_effect_mismatch, + char *err, + size_t err_len); +char *ds4_agent_context_restore_expected_metrics_line( + const ds4_agent_context_restore_metrics *metrics); + +#endif diff --git a/tests/ds4_agent_context_compact_canary_e2e.sh b/tests/ds4_agent_context_compact_canary_e2e.sh new file mode 100755 index 00000000..ee2c57f7 --- /dev/null +++ b/tests/ds4_agent_context_compact_canary_e2e.sh @@ -0,0 +1,195 @@ +#!/bin/sh +set -eu + +ROOT=$(CDPATH= cd -- "$(dirname -- "$0")/.." 
&& pwd) +BASE=${TMPDIR:-/tmp} +RUN_DIR=$(mktemp -d "${BASE%/}/ds4-agent-compact-canary.XXXXXX") +HOME_DIR="$RUN_DIR/home" +WORK_DIR="$RUN_DIR/work" +OUT="$RUN_DIR/output.txt" +TRACE="$RUN_DIR/trace.txt" +LEDGER="$WORK_DIR/ds4-compact-canary-ledger.md" +PROMPT_FILE="$RUN_DIR/prompt.md" +PROMPT_TMP="$RUN_DIR/prompt.tmp" +PADDING_FILE="$RUN_DIR/padding.txt" +REPORT_DIR="$ROOT/tests/generated" +REPORT="$REPORT_DIR/ds4_agent_context_compact_canary_report.md" +REPORT_PROMPT="$REPORT_DIR/ds4_agent_context_compact_canary_prompt.md" +REPORT_OUTPUT="$REPORT_DIR/ds4_agent_context_compact_canary_output.txt" +REPORT_TRACE="$REPORT_DIR/ds4_agent_context_compact_canary_trace.txt" +REPORT_LEDGER="$REPORT_DIR/ds4_agent_context_compact_canary_ledger.md" + +print_report_file() { + if [ -f "$1" ]; then + sed 's/```/` ` `/g' "$1" + else + printf '(missing: %s)\n' "$1" + fi +} + +write_report() { + mkdir -p "$REPORT_DIR" + [ -f "$PROMPT_FILE" ] && cp "$PROMPT_FILE" "$REPORT_PROMPT" + [ -f "$OUT" ] && cp "$OUT" "$REPORT_OUTPUT" + [ -f "$TRACE" ] && cp "$TRACE" "$REPORT_TRACE" + [ -f "$LEDGER" ] && cp "$LEDGER" "$REPORT_LEDGER" + { + printf '# DS4 Agent Context Compact Canary Report\n\n' + printf 'prompt_file: `%s`\n\n' "$REPORT_PROMPT" + printf 'response_file: `%s`\n\n' "$REPORT_OUTPUT" + printf 'trace_file: `%s`\n\n' "$REPORT_TRACE" + printf 'ledger_file: `%s`\n\n' "$REPORT_LEDGER" + printf 'run_dir: `%s`\n\n' "$RUN_DIR" + printf '## Prompt\n\n```text\n' + print_report_file "$PROMPT_FILE" + printf '\n```\n\n## DS4 Response\n\n```text\n' + print_report_file "$OUT" + printf '\n```\n\n## Trace\n\n```text\n' + print_report_file "$TRACE" + printf '\n```\n\n## Ledger\n\n```text\n' + print_report_file "$LEDGER" + printf '\n```\n' + } >"$REPORT" +} + +cleanup() { + write_report >/dev/null 2>&1 || true + if [ "${DS4_KEEP_COMPACT_CANARY_TEST_DIR:-0}" != "1" ]; then + rm -rf "$RUN_DIR" + else + printf 'kept test directory: %s\n' "$RUN_DIR" >&2 + fi +} +trap cleanup EXIT + +mkdir -p "$HOME_DIR" 
"$WORK_DIR" + +escape_sed() { + printf '%s' "$1" | sed 's/[&|]/\\&/g' +} + +PADDING_LINES=${DS4_AGENT_COMPACT_CANARY_PADDING_LINES:-180} +i=1 +while [ "$i" -le "$PADDING_LINES" ]; do + printf 'Padding line %03d: irrelevant build-note-%03d contains no canary values and should not be retained.\n' "$i" "$i" >>"$PADDING_FILE" + i=$((i + 1)) +done + +ROOT_ESC=$(escape_sed "$ROOT") +LEDGER_ESC=$(escape_sed "$LEDGER") +sed \ + -e "s|__ROOT__|$ROOT_ESC|g" \ + -e "s|__LEDGER__|$LEDGER_ESC|g" \ + "$ROOT/tests/ds4_agent_context_compact_canary_prompt.md" >"$PROMPT_TMP" + +while IFS= read -r line; do + if [ "$line" = "__PADDING__" ]; then + cat "$PADDING_FILE" + else + printf '%s\n' "$line" + fi +done <"$PROMPT_TMP" >"$PROMPT_FILE" + +if ! HOME="$HOME_DIR" "$ROOT/ds4-agent" \ + --model "$ROOT/ds4flash.gguf" \ + --non-interactive \ + --chdir "$ROOT" \ + --ctx "${DS4_AGENT_COMPACT_CANARY_CTX:-8192}" \ + --tokens "${DS4_AGENT_COMPACT_CANARY_TOKENS:-3500}" \ + --temp 0 \ + --seed 7 \ + --nothink \ + --trace "$TRACE" \ + --prompt "$(cat "$PROMPT_FILE")" >"$OUT" 2>&1 +then + cat "$OUT" >&2 + echo "ds4-agent compact canary run failed" >&2 + exit 1 +fi + +if [ ! 
-f "$LEDGER" ]; then + cat "$OUT" >&2 + echo "missing generated compact canary ledger: $LEDGER" >&2 + exit 1 +fi + +COMPACT_LINE=$(grep 'compacted reason="canary-retention-test"' "$TRACE" | tail -n 1 || true) +if [ -z "$COMPACT_LINE" ]; then + cat "$TRACE" >&2 + echo "trace does not prove context compaction happened" >&2 + exit 1 +fi + +COMPACT_OLD=$(printf '%s\n' "$COMPACT_LINE" | sed -n 's/.* old=\([0-9][0-9]*\) .*/\1/p') +COMPACT_NEW=$(printf '%s\n' "$COMPACT_LINE" | sed -n 's/.* new=\([0-9][0-9]*\) .*/\1/p') +COMPACT_TAIL_START=$(printf '%s\n' "$COMPACT_LINE" | sed -n 's/.* tail_start=\([0-9][0-9]*\) .*/\1/p') + +if [ -z "$COMPACT_OLD" ] || [ -z "$COMPACT_NEW" ] || [ -z "$COMPACT_TAIL_START" ]; then + cat "$TRACE" >&2 + echo "trace compact line is missing old/new/tail_start metrics" >&2 + exit 1 +fi + +if [ "$COMPACT_NEW" -ge "$COMPACT_OLD" ]; then + cat "$TRACE" >&2 + echo "context compaction did not reduce token count" >&2 + exit 1 +fi + +if [ "$COMPACT_TAIL_START" -lt "${DS4_AGENT_COMPACT_CANARY_MIN_TAIL_START:-3000}" ]; then + cat "$TRACE" >&2 + echo "recent tail started too early to make this a useful retention canary" >&2 + exit 1 +fi + +grep -q "compaction=done" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger is missing compaction=done" >&2 + exit 1 +} + +grep -q "canary_alpha=ORCHID-47" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger lost canary_alpha" >&2 + exit 1 +} + +grep -q "canary_beta=FJORD-932" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger lost canary_beta" >&2 + exit 1 +} + +grep -q "canary_gamma=LEMMA-18" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger lost canary_gamma" >&2 + exit 1 +} + +grep -q "canary_delta=RUNE-604" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger lost canary_delta" >&2 + exit 1 +} + +grep -q "canary_epsilon=VECTOR-251" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger lost canary_epsilon" >&2 + exit 1 +} + +grep -q "final=COMPACT_CANARY_DONE" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger is missing final 
marker" >&2 + exit 1 +} + +grep -q "COMPACT_CANARY_DONE" "$OUT" || { + cat "$OUT" >&2 + echo "DS4 did not finish compact canary scenario" >&2 + exit 1 +} + +write_report + +printf 'ds4 agent context compact canary e2e: ok\n' diff --git a/tests/ds4_agent_context_compact_canary_prompt.md b/tests/ds4_agent_context_compact_canary_prompt.md new file mode 100644 index 00000000..503af7d5 --- /dev/null +++ b/tests/ds4_agent_context_compact_canary_prompt.md @@ -0,0 +1,46 @@ +You are validating whether DS4 context compaction preserves task-critical facts. + +Do not explain the plan in prose. Use DSML tools. + +Repository root: __ROOT__ +Ledger path: __LEDGER__ + +The following five facts are task-critical canaries. They are the only source +of the values. Preserve them through compaction and use them after compaction: + +canary_alpha=ORCHID-47 +canary_beta=FJORD-932 +canary_gamma=LEMMA-18 +canary_delta=RUNE-604 +canary_epsilon=VECTOR-251 + +The next block is intentionally irrelevant padding. Do not copy it to the +ledger. Its only purpose is to push the canary values out of the recent verbatim +tail, so the compacted summary must preserve them. + +__PADDING__ + +Now execute exactly this sequence: + +1. Use the context tool with action=compact and reason=canary-retention-test. + This context compact call must be the only tool call in its DSML block. + +2. After the compact tool result, use the write tool to create the ledger file + at this exact path: + +__LEDGER__ + + The ledger must contain these exact field names and the exact canary values + from above: + +# DS4 Compact Canary Ledger +compaction=done +canary_alpha= +canary_beta= +canary_gamma= +canary_delta= +canary_epsilon= +final=COMPACT_CANARY_DONE + +3. 
After the write tool result, answer exactly: +COMPACT_CANARY_DONE diff --git a/tests/ds4_agent_context_loop_e2e.sh b/tests/ds4_agent_context_loop_e2e.sh new file mode 100644 index 00000000..bf382602 --- /dev/null +++ b/tests/ds4_agent_context_loop_e2e.sh @@ -0,0 +1,176 @@ +#!/bin/sh +set -eu + +ROOT=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd) +BASE=${TMPDIR:-/tmp} +RUN_DIR=$(mktemp -d "${BASE%/}/ds4-agent-context-loop.XXXXXX") +HOME_DIR="$RUN_DIR/home" +WORK_DIR="$RUN_DIR/work" +OUT="$RUN_DIR/output.txt" +LEDGER="$WORK_DIR/ds4-generated-loop.md" +PROMPT_FILE="$RUN_DIR/prompt.md" +REPORT_DIR="$ROOT/tests/generated" +REPORT="$REPORT_DIR/ds4_agent_context_loop_report.md" +REPORT_PROMPT="$REPORT_DIR/ds4_agent_context_loop_prompt.md" +REPORT_OUTPUT="$REPORT_DIR/ds4_agent_context_loop_output.txt" +REPORT_LEDGER="$REPORT_DIR/ds4_agent_context_loop_ledger.md" + +print_report_file() { + if [ -f "$1" ]; then + sed 's/```/` ` `/g' "$1" + else + printf '(missing: %s)\n' "$1" + fi +} + +write_report() { + mkdir -p "$REPORT_DIR" + if [ -f "$PROMPT_FILE" ]; then + cp "$PROMPT_FILE" "$REPORT_PROMPT" + fi + if [ -f "$OUT" ]; then + cp "$OUT" "$REPORT_OUTPUT" + fi + if [ -f "$LEDGER" ]; then + cp "$LEDGER" "$REPORT_LEDGER" + fi + { + printf '# DS4 Agent Context Loop Report\n\n' + printf 'prompt_file: `%s`\n\n' "$REPORT_PROMPT" + printf 'response_file: `%s`\n\n' "$REPORT_OUTPUT" + printf 'ledger_file: `%s`\n\n' "$REPORT_LEDGER" + printf 'run_dir: `%s`\n\n' "$RUN_DIR" + printf '## Prompt\n\n```text\n' + print_report_file "$PROMPT_FILE" + printf '\n```\n\n## DS4 Response\n\n```text\n' + print_report_file "$OUT" + printf '\n```\n\n## Ledger\n\n```text\n' + print_report_file "$LEDGER" + printf '\n```\n' + } >"$REPORT" +} + +cleanup() { + write_report >/dev/null 2>&1 || true + if [ "${DS4_KEEP_LOOP_TEST_DIR:-0}" != "1" ]; then + rm -rf "$RUN_DIR" + else + printf 'kept test directory: %s\n' "$RUN_DIR" >&2 + fi +} +trap cleanup EXIT + +mkdir -p "$HOME_DIR" "$WORK_DIR" + +escape_sed() { 
+ printf '%s' "$1" | sed 's/[&|]/\\&/g' +} + +ROOT_ESC=$(escape_sed "$ROOT") +LEDGER_ESC=$(escape_sed "$LEDGER") +PROMPT=$(sed \ + -e "s|__ROOT__|$ROOT_ESC|g" \ + -e "s|__LEDGER__|$LEDGER_ESC|g" \ + "$ROOT/tests/ds4_agent_context_loop_prompt.md") +printf '%s\n' "$PROMPT" >"$PROMPT_FILE" + +if ! HOME="$HOME_DIR" "$ROOT/ds4-agent" \ + --model "$ROOT/ds4flash.gguf" \ + --non-interactive \ + --chdir "$ROOT" \ + --ctx "${DS4_AGENT_LOOP_CTX:-8192}" \ + --tokens "${DS4_AGENT_LOOP_TOKENS:-2500}" \ + --temp 0 \ + --seed 1 \ + --nothink \ + --prompt "$PROMPT" >"$OUT" 2>&1 +then + cat "$OUT" >&2 + echo "ds4-agent loop run failed" >&2 + exit 1 +fi + +if [ ! -f "$LEDGER" ]; then + cat "$OUT" >&2 + echo "missing generated ledger: $LEDGER" >&2 + exit 1 +fi + +grep -q "loop_limit=2" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger is missing loop_limit=2" >&2 + exit 1 +} + +grep -Fq "ds4_prompt=validate DS4's own agent context loop capability" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger is missing the compact DS4 prompt" >&2 + exit 1 +} + +grep -q "ds4_response=LOOP_DONE" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger is missing the final DS4 response" >&2 + exit 1 +} + +grep -q "attempt=1 status=pass" "$LEDGER" || { + cat "$LEDGER" >&2 + cat "$OUT" >&2 + echo "DS4 did not mark the measured attempt as pass" >&2 + exit 1 +} + +grep -q "attempt=1 metric=ds4_agent_context_test passed" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger is missing the expected metric" >&2 + exit 1 +} + +grep -q "LOOP_DONE" "$OUT" || { + cat "$OUT" >&2 + echo "DS4 did not finish the loop" >&2 + exit 1 +} + +CONTEXT_DIR="$HOME_DIR/.ds4/kvcache/context" +if [ ! -d "$CONTEXT_DIR" ]; then + cat "$OUT" >&2 + echo "missing context checkpoint directory: $CONTEXT_DIR" >&2 + exit 1 +fi + +if ! 
grep -R "ds4-generated-loop-after-pass" "$CONTEXT_DIR" >/dev/null 2>&1; then + find "$CONTEXT_DIR" -maxdepth 1 -type f -print >&2 + cat "$OUT" >&2 + echo "missing DS4-generated context checkpoint label" >&2 + exit 1 +fi + +write_report + +grep -q "^## Prompt" "$REPORT" || { + cat "$REPORT" >&2 + echo "report is missing the prompt section" >&2 + exit 1 +} + +grep -q "^## DS4 Response" "$REPORT" || { + cat "$REPORT" >&2 + echo "report is missing the DS4 response section" >&2 + exit 1 +} + +grep -q "^## Ledger" "$REPORT" || { + cat "$REPORT" >&2 + echo "report is missing the ledger section" >&2 + exit 1 +} + +grep -q "LOOP_DONE" "$REPORT" || { + cat "$REPORT" >&2 + echo "report is missing the final DS4 response" >&2 + exit 1 +} + +printf 'ds4 agent context loop e2e: ok\n' diff --git a/tests/ds4_agent_context_loop_prompt.md b/tests/ds4_agent_context_loop_prompt.md new file mode 100644 index 00000000..41dee3b1 --- /dev/null +++ b/tests/ds4_agent_context_loop_prompt.md @@ -0,0 +1,44 @@ +You are validating DS4's own agent context loop capability. + +Do not explain the plan in prose. Use DSML tools to execute exactly this loop. + +Repository root: __ROOT__ +Ledger path: __LEDGER__ + +Loop: +1. Use the write tool to create __LEDGER__ with this exact starting content: + +# DS4 Generated Context Loop +loop_limit=2 +goal=validate DS4 agent context loop against DS4's context helper test +baseline=before-ds4-agent-context-test +ds4_prompt=validate DS4's own agent context loop capability +ds4_response=pending +attempt=1 hypothesis=the context helper test passes +attempt=1 command=cd __ROOT__ && ./tests/ds4_agent_context_test +attempt=1 status=pending +attempt=1 metric=pending + +2. Use the bash tool to run exactly: +cd __ROOT__ && ./tests/ds4_agent_context_test + +3. 
If the bash result reports success, use the edit tool to replace: +ds4_response=pending +attempt=1 status=pending +attempt=1 metric=pending + +with: +ds4_response=LOOP_DONE +attempt=1 status=pass +attempt=1 metric=ds4_agent_context_test passed + +If the bash result reports failure, replace the same status and metric lines +with status=fail and the observed failure summary, and replace +ds4_response=pending with the final response you will return. + +4. If the attempt passed, use the context tool with action=checkpoint and label +ds4-generated-loop-after-pass. This context checkpoint call must be the only +tool call in its DSML block. + +5. After the checkpoint tool result, answer exactly: +LOOP_DONE diff --git a/tests/ds4_agent_context_self_improvement_e2e.sh b/tests/ds4_agent_context_self_improvement_e2e.sh new file mode 100755 index 00000000..cd99b6aa --- /dev/null +++ b/tests/ds4_agent_context_self_improvement_e2e.sh @@ -0,0 +1,241 @@ +#!/bin/sh +set -eu + +ROOT=$(CDPATH= cd -- "$(dirname -- "$0")/.." 
&& pwd) +BASE=${TMPDIR:-/tmp} +RUN_DIR=$(mktemp -d "${BASE%/}/ds4-agent-context-self-improvement.XXXXXX") +HOME_DIR="$RUN_DIR/home" +REPO="$RUN_DIR/repo" +OUT="$RUN_DIR/output.txt" +TRACE="$RUN_DIR/trace.txt" +LEDGER="$RUN_DIR/ds4-context-self-improvement-ledger.md" +PROMPT_FILE="$RUN_DIR/prompt.md" +REPORT_DIR="$ROOT/tests/generated" +REPORT="$REPORT_DIR/ds4_agent_context_self_improvement_report.md" +REPORT_PROMPT="$REPORT_DIR/ds4_agent_context_self_improvement_prompt.md" +REPORT_OUTPUT="$REPORT_DIR/ds4_agent_context_self_improvement_output.txt" +REPORT_TRACE="$REPORT_DIR/ds4_agent_context_self_improvement_trace.txt" +REPORT_LEDGER="$REPORT_DIR/ds4_agent_context_self_improvement_ledger.md" + +print_report_file() { + if [ -f "$1" ]; then + sed 's/```/` ` `/g' "$1" + else + printf '(missing: %s)\n' "$1" + fi +} + +write_report() { + mkdir -p "$REPORT_DIR" + [ -f "$PROMPT_FILE" ] && cp "$PROMPT_FILE" "$REPORT_PROMPT" + [ -f "$OUT" ] && cp "$OUT" "$REPORT_OUTPUT" + [ -f "$TRACE" ] && cp "$TRACE" "$REPORT_TRACE" + [ -f "$LEDGER" ] && cp "$LEDGER" "$REPORT_LEDGER" + { + printf '# DS4 Agent Context Self Improvement Report\n\n' + printf 'prompt_file: `%s`\n\n' "$REPORT_PROMPT" + printf 'response_file: `%s`\n\n' "$REPORT_OUTPUT" + printf 'trace_file: `%s`\n\n' "$REPORT_TRACE" + printf 'ledger_file: `%s`\n\n' "$REPORT_LEDGER" + printf 'run_dir: `%s`\n\n' "$RUN_DIR" + printf 'repo: `%s`\n\n' "$REPO" + printf '## Prompt\n\n```text\n' + print_report_file "$PROMPT_FILE" + printf '\n```\n\n## DS4 Response\n\n```text\n' + print_report_file "$OUT" + printf '\n```\n\n## Trace\n\n```text\n' + print_report_file "$TRACE" + printf '\n```\n\n## Ledger\n\n```text\n' + print_report_file "$LEDGER" + printf '\n```\n\n## Final toy_math.py\n\n```python\n' + print_report_file "$REPO/toy_math.py" + printf '\n```\n' + } >"$REPORT" +} + +cleanup() { + write_report >/dev/null 2>&1 || true + if [ "${DS4_KEEP_CONTEXT_SELF_IMPROVEMENT_TEST_DIR:-0}" != "1" ]; then + rm -rf "$RUN_DIR" + else + printf 
'kept test directory: %s\n' "$RUN_DIR" >&2 + fi +} +trap cleanup EXIT + +mkdir -p "$HOME_DIR" "$REPO" + +cat >"$REPO/toy_math.py" <<'PY' +def normalize_score(value, maximum): + """Return value as a score in the inclusive range 0.0..1.0.""" + if maximum == 0: + return 0.0 + return value / maximum +PY + +cat >"$REPO/test_toy_math.py" <<'PY' +from toy_math import normalize_score + + +def check(name, got, expected): + if got != expected: + raise SystemExit(f"{name}: got {got!r}, expected {expected!r}") + + +check("normal", normalize_score(3, 6), 0.5) +check("zero maximum", normalize_score(3, 0), 0.0) +check("negative maximum", normalize_score(3, -1), 0.0) +check("lower clamp", normalize_score(-2, 10), 0.0) +check("upper clamp", normalize_score(12, 10), 1.0) +print("toy_math tests passed") +PY + +git -C "$REPO" init -q +git -C "$REPO" config user.email ds4-agent-test@example.invalid +git -C "$REPO" config user.name "DS4 Agent Test" +git -C "$REPO" add toy_math.py test_toy_math.py +git -C "$REPO" commit -q -m "initial broken toy math" + +escape_sed() { + printf '%s' "$1" | sed 's/[&|]/\\&/g' +} + +REPO_ESC=$(escape_sed "$REPO") +LEDGER_ESC=$(escape_sed "$LEDGER") +sed \ + -e "s|__REPO__|$REPO_ESC|g" \ + -e "s|__LEDGER__|$LEDGER_ESC|g" \ + "$ROOT/tests/ds4_agent_context_self_improvement_prompt.md" >"$PROMPT_FILE" + +if ! HOME="$HOME_DIR" "$ROOT/ds4-agent" \ + --model "$ROOT/ds4flash.gguf" \ + --non-interactive \ + --chdir "$ROOT" \ + --ctx "${DS4_AGENT_CONTEXT_SELF_IMPROVEMENT_CTX:-8192}" \ + --tokens "${DS4_AGENT_CONTEXT_SELF_IMPROVEMENT_TOKENS:-4500}" \ + --temp 0 \ + --seed 11 \ + --nothink \ + --trace "$TRACE" \ + --prompt "$(cat "$PROMPT_FILE")" >"$OUT" 2>&1 +then + cat "$OUT" >&2 + echo "ds4-agent context self-improvement run failed" >&2 + exit 1 +fi + +if [ ! 
-f "$LEDGER" ]; then + cat "$OUT" >&2 + echo "missing generated context self-improvement ledger: $LEDGER" >&2 + exit 1 +fi + +python3 "$REPO/test_toy_math.py" >/dev/null + +grep -q "git_status_checked=yes" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger does not report git status check" >&2 + exit 1 +} + +grep -Eq "git_status_mode=(native|bash)" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger does not report a valid git status mode" >&2 + exit 1 +} + +grep -q "git_diff_checked=yes" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger does not report git diff check" >&2 + exit 1 +} + +grep -Eq "git_diff_mode=(native|bash)" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger does not report a valid git diff mode" >&2 + exit 1 +} + +grep -q "context_checkpoint_before=yes" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger does not report pre-fix checkpoint" >&2 + exit 1 +} + +grep -q "context_checkpoint_after=yes" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger does not report post-fix checkpoint" >&2 + exit 1 +} + +grep -q "context_restore_used=yes" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger does not report restore usage" >&2 + exit 1 +} + +grep -q "tests_before_restore=pass" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger does not report pre-restore passing tests" >&2 + exit 1 +} + +grep -q "tests_after_restore=pass" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger does not report post-restore passing tests" >&2 + exit 1 +} + +grep -q "final=CONTEXT_SELF_IMPROVEMENT_DONE" "$LEDGER" || { + cat "$LEDGER" >&2 + echo "ledger is missing final marker" >&2 + exit 1 +} + +grep -q "CONTEXT_SELF_IMPROVEMENT_DONE" "$OUT" || { + cat "$OUT" >&2 + echo "DS4 did not finish context self-improvement scenario" >&2 + exit 1 +} + +if grep -Eq "git[[:space:]]+action=status" "$OUT"; then + : +elif grep -Fq "git status --short" "$OUT"; then + : +else + cat "$OUT" >&2 + echo "output does not prove git status was checked" >&2 + exit 1 +fi + +if grep -Eq 
"git[[:space:]]+action=diff" "$OUT"; then + : +elif grep -Fq "git diff -- toy_math.py" "$OUT"; then + : +else + cat "$OUT" >&2 + echo "output does not prove git diff was checked" >&2 + exit 1 +fi + +grep -Eq "context[[:space:]]+action=checkpoint" "$OUT" || { + cat "$OUT" >&2 + echo "output does not prove context checkpoint was used" >&2 + exit 1 +} + +grep -Eq "context[[:space:]]+action=restore|Context restored from checkpoint" "$OUT" || { + cat "$OUT" >&2 + echo "output does not prove context restore was used" >&2 + exit 1 +} + +if git -C "$REPO" diff --quiet -- toy_math.py; then + git -C "$REPO" diff -- toy_math.py >&2 + echo "final patch did not modify toy_math.py" >&2 + exit 1 +fi + +write_report + +printf 'ds4 agent context self-improvement e2e: ok\n' diff --git a/tests/ds4_agent_context_self_improvement_prompt.md b/tests/ds4_agent_context_self_improvement_prompt.md new file mode 100644 index 00000000..f5476d83 --- /dev/null +++ b/tests/ds4_agent_context_self_improvement_prompt.md @@ -0,0 +1,93 @@ +You are validating DS4's own KV-backed self-improvement loop. + +Do not explain the plan in prose. Use DSML tools. + +Repository root: __REPO__ +Ledger path: __LEDGER__ + +Task: +Fix the small Python project in the repository so its test suite passes. The +bug is intentionally simple and local to the repository. + +Use absolute file paths under the repository root for read, edit, write, and +bash commands. + +Repository inspection mode: + +- If the available tool schemas include a native git DSML tool, prefer that tool + for repository status and diff inspection. +- If the native git DSML tool is not available, use the bash tool to run git + commands in the temporary repository. + +Required sequence: + +1. Use the context tool with action=checkpoint and label + context-self-improvement-before. This context checkpoint call must be the + only tool call in its DSML block. + +2. Inspect repository status. 
+ +If native git DSML is available, use the git tool with action=status and repo +set to the repository root. + +Otherwise, use the bash tool to run exactly: + +cd __REPO__ && git status --short + +3. Use read/edit/write/bash tools as needed to inspect, fix, and test the + project. Run exactly this test command with the bash tool: + +cd __REPO__ && python3 test_toy_math.py + +4. Inspect the produced repository diff. + +If native git DSML is available, use the git tool with action=diff, repo set to +the repository root, and path set to toy_math.py. + +Otherwise, use the bash tool to run exactly: + +cd __REPO__ && git diff -- toy_math.py + +5. If the test passes, use the context tool with action=checkpoint and label + context-self-improvement-after-pass. This context checkpoint call must be the + only tool call in its DSML block. Save the returned checkpoint id for step 6. + +6. Use the context tool with action=restore, id set to the checkpoint id from + step 5, reason=context-self-improvement-restore-check, and + allow_side_effect_mismatch=true. This context restore call must be the only + tool call in its DSML block. + +7. After restore, inspect repository status again. + +If native git DSML is available, use the git tool with action=status and repo +set to the repository root. + +Otherwise, use the bash tool to run exactly: + +cd __REPO__ && git status --short + +Then run exactly this test command again with the bash tool: + +cd __REPO__ && python3 test_toy_math.py + +After this restore, do not create any more context checkpoints and do not call +context restore again. Proceed directly to the ledger. + +8. Use the write tool to create the ledger file at the ledger path. 
The ledger + must contain these exact field names: + +# DS4 Context Self Improvement Ledger +git_status_checked=yes +git_status_mode= +git_diff_checked=yes +git_diff_mode= +context_checkpoint_before=yes +context_checkpoint_after=yes +context_restore_used=yes +tests_before_restore=pass +tests_after_restore=pass +fixed_file=toy_math.py +final=CONTEXT_SELF_IMPROVEMENT_DONE + +9. After the write tool result, answer exactly: +CONTEXT_SELF_IMPROVEMENT_DONE diff --git a/tests/ds4_agent_context_test.c b/tests/ds4_agent_context_test.c new file mode 100644 index 00000000..7f80d2f0 --- /dev/null +++ b/tests/ds4_agent_context_test.c @@ -0,0 +1,243 @@ +#include "../ds4_agent_context.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void test_fail(const char *msg) { + fprintf(stderr, "ds4_agent_context_test: %s\n", msg); + exit(1); +} + +#define CHECK(cond, msg) do { if (!(cond)) test_fail(msg); } while (0) + +static char *test_strdup(const char *s) { + size_t n = strlen(s); + char *out = malloc(n + 1); + CHECK(out != NULL, "malloc failed"); + memcpy(out, s, n + 1); + return out; +} + +static char *make_temp_dir(void) { + const char *base = getenv("TMPDIR"); + if (!base || !base[0]) base = "/tmp"; + for (int i = 0; i < 100; i++) { + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/ds4-agent-context-test-%ld-%ld-%d", + base, (long)getpid(), (long)time(NULL), i); + if (mkdir(path, 0700) == 0) return test_strdup(path); + if (errno != EEXIST) break; + } + test_fail("failed to create temp dir"); + return NULL; +} + +static void write_raw_file(const char *path, const char *id) { + FILE *fp = fopen(path, "wb"); + CHECK(fp != NULL, "failed to open raw metadata"); + fprintf(fp, + "{\n" + " \"id\": \"%s\",\n" + " \"label\": \"unsafe\",\n" + " \"created_at\": 1,\n" + " \"world_epoch\": 1,\n" + " \"transcript_tokens\": 10,\n" + " \"kv_path\": \"../escape.kv\",\n" + " \"memory_path\": \"safe.memory.md\"\n" + "}\n", + id); + 
CHECK(fclose(fp) == 0, "failed to write raw metadata"); +} + +static void write_key_collision_file(const char *path, const char *id) { + FILE *fp = fopen(path, "wb"); + CHECK(fp != NULL, "failed to open key collision metadata"); + fprintf(fp, + "{\n" + " \"id\": \"%s\",\n" + " \"label\": \"label mentions \\\"world_epoch\\\": 99 before the real key\",\n" + " \"created_at\": 7,\n" + " \"world_epoch\": 42,\n" + " \"transcript_tokens\": 77,\n" + " \"kv_path\": \"%s.kv\",\n" + " \"memory_path\": \"%s.memory.md\"\n" + "}\n", + id, id, id); + CHECK(fclose(fp) == 0, "failed to write key collision metadata"); +} + +static void fill_meta(ds4_agent_context_meta *m, const char *id, + const char *label, uint64_t epoch, int tokens) { + snprintf(m->id, sizeof(m->id), "%s", id); + m->label = test_strdup(label); + m->kv_file = ds4_agent_context_file_name(m->id, ".kv"); + m->memory_file = ds4_agent_context_file_name(m->id, ".memory.md"); + m->created_at = 1234; + m->world_epoch = epoch; + m->transcript_tokens = tokens; +} + +int main(void) { + static const char id1[] = "1111111111111111111111111111111111111111"; + static const char id2[] = "2222222222222222222222222222222222222222"; + static const char id3[] = "3333333333333333333333333333333333333333"; + static const char id4[] = "4444444444444444444444444444444444444444"; + char err[256] = {0}; + char *dir = make_temp_dir(); + + char *meta1_name = ds4_agent_context_file_name(id1, ".meta.json"); + char *meta2_name = ds4_agent_context_file_name(id2, ".meta.json"); + char *meta1_path = ds4_agent_context_path_for_file(dir, meta1_name); + char *meta2_path = ds4_agent_context_path_for_file(dir, meta2_name); + + ds4_agent_context_meta m1 = {0}; + fill_meta(&m1, id1, "first \"checkpoint\"\\line\nnext", 4, 101); + CHECK(ds4_agent_context_write_meta(&m1, meta1_path, err, sizeof(err)), + "failed to write first metadata"); + ds4_agent_context_meta_free(&m1); + + ds4_agent_context_meta m2 = {0}; + fill_meta(&m2, id2, "second", 12, 202); + 
CHECK(ds4_agent_context_write_meta(&m2, meta2_path, err, sizeof(err)), + "failed to write second metadata"); + ds4_agent_context_meta_free(&m2); + + ds4_agent_context_meta read_back = {0}; + CHECK(ds4_agent_context_read_meta_file(meta1_path, &read_back, err, sizeof(err)), + "failed to read metadata roundtrip"); + CHECK(strcmp(read_back.id, id1) == 0, "roundtrip id mismatch"); + CHECK(strcmp(read_back.label, "first \"checkpoint\"\\line\nnext") == 0, + "roundtrip label mismatch"); + CHECK(read_back.world_epoch == 4, "roundtrip epoch mismatch"); + CHECK(read_back.transcript_tokens == 101, "roundtrip tokens mismatch"); + ds4_agent_context_meta_free(&read_back); + + CHECK(ds4_agent_context_count_checkpoints(dir) == 2, "checkpoint count mismatch"); + CHECK(ds4_agent_context_max_world_epoch(dir) == 12, "max world epoch mismatch"); + + ds4_agent_context_meta found = {0}; + char *found_meta_path = NULL; + char *found_kv_path = NULL; + CHECK(ds4_agent_context_find_checkpoint(dir, "2222", &found, + &found_meta_path, &found_kv_path, + err, sizeof(err)), + "failed to find checkpoint by prefix"); + CHECK(strcmp(found.id, id2) == 0, "found checkpoint id mismatch"); + CHECK(strstr(found_kv_path, "2222222222222222222222222222222222222222.kv") != NULL, + "found kv path mismatch"); + ds4_agent_context_meta_free(&found); + free(found_meta_path); + free(found_kv_path); + + char *unsafe_name = ds4_agent_context_file_name(id3, ".meta.json"); + char *unsafe_path = ds4_agent_context_path_for_file(dir, unsafe_name); + write_raw_file(unsafe_path, id3); + ds4_agent_context_meta unsafe = {0}; + CHECK(!ds4_agent_context_read_meta_file(unsafe_path, &unsafe, err, sizeof(err)), + "unsafe metadata path should be rejected"); + ds4_agent_context_meta_free(&unsafe); + + char *collision_name = ds4_agent_context_file_name(id4, ".meta.json"); + char *collision_path = ds4_agent_context_path_for_file(dir, collision_name); + write_key_collision_file(collision_path, id4); + ds4_agent_context_meta collision = 
{0}; + CHECK(ds4_agent_context_read_meta_file(collision_path, &collision, + err, sizeof(err)), + "key collision metadata should parse"); + CHECK(collision.world_epoch == 42, + "parser matched key text inside a string value"); + CHECK(collision.transcript_tokens == 77, + "key collision transcript tokens mismatch"); + ds4_agent_context_meta_free(&collision); + + CHECK(ds4_agent_context_restore_epoch_guard(12, 12, false, err, sizeof(err)), + "equal epoch restore should be allowed"); + CHECK(ds4_agent_context_restore_epoch_guard(13, 12, true, err, sizeof(err)), + "explicit side-effect override should be allowed"); + CHECK(!ds4_agent_context_restore_epoch_guard(13, 12, false, err, sizeof(err)), + "epoch mismatch restore should be rejected"); + CHECK(strstr(err, "world_epoch=13 to 12") != NULL, + "epoch guard error missing epoch details"); + CHECK(ds4_agent_context_no_running_bash_guard("restore", 0, err, sizeof(err)), + "restore should allow no running bash jobs"); + CHECK(!ds4_agent_context_no_running_bash_guard("restore", 2, err, sizeof(err)), + "restore should reject running bash jobs"); + CHECK(strstr(err, "2 bash job(s)") != NULL, + "bash guard error missing job count"); + + ds4_agent_context_restore_metrics metrics = { + .checkpoint_tokens = 101, + .restore_notice_tokens = 13, + .restored_tokens = 114, + }; + char *metrics_line = ds4_agent_context_restore_expected_metrics_line(&metrics); + CHECK(strstr(metrics_line, "KV restore expected metrics:") != NULL, + "restore metrics line missing expected label"); + CHECK(strstr(metrics_line, "checkpoint_tokens=101") != NULL, + "restore metrics line missing checkpoint tokens"); + CHECK(strstr(metrics_line, "expected_restore_notice_tokens=13") != NULL, + "restore metrics line missing notice tokens"); + CHECK(strstr(metrics_line, "expected_prefill_suffix_tokens=13") != NULL, + "restore metrics line missing expected prefill suffix"); + CHECK(strstr(metrics_line, "expected_saved_prefill_tokens=101") != NULL, + "restore metrics 
line missing expected saved prefill"); + CHECK(strstr(metrics_line, " saved_prefill_tokens=") == NULL, + "restore metrics line should not present expected values as actual"); + free(metrics_line); + + ds4_agent_side_effects effects = {0}; + uint64_t epoch = 3; + epoch = ds4_agent_side_effects_note(&effects, epoch, + "write", "experiment.md\nsecond line"); + CHECK(epoch == 4, "side effect epoch mismatch"); + char *summary = ds4_agent_side_effects_summary_since(&effects, 3); + CHECK(strstr(summary, "Known side effects after checkpoint:") != NULL, + "side effect summary header missing"); + CHECK(strstr(summary, "epoch=4 write experiment.md second line") != NULL, + "side effect summary content missing"); + free(summary); + summary = ds4_agent_side_effects_summary_since(&effects, 4); + CHECK(strcmp(summary, "") == 0, "empty side effect summary mismatch"); + free(summary); + ds4_agent_side_effects_free(&effects); + + for (int i = 0; i < 70; i++) { + char detail[32]; + snprintf(detail, sizeof(detail), "effect-%d", i + 1); + epoch = ds4_agent_side_effects_note(&effects, epoch, "bash", detail); + } + CHECK(effects.count == 64, "side effect retained count mismatch"); + CHECK(effects.evicted_count == 6, "side effect evicted count mismatch"); + summary = ds4_agent_side_effects_summary_since(&effects, 4); + CHECK(strstr(summary, "may be incomplete") != NULL, + "truncated side effect warning missing"); + CHECK(strstr(summary, "6 older side effect") != NULL, + "truncated side effect count missing"); + CHECK(strstr(summary, "... 
more side effects omitted ...") != NULL, + "retained side effect omission marker missing"); + free(summary); + ds4_agent_side_effects_free(&effects); + + unlink(meta1_path); + unlink(meta2_path); + unlink(unsafe_path); + unlink(collision_path); + rmdir(dir); + free(meta1_name); + free(meta2_name); + free(unsafe_name); + free(collision_name); + free(meta1_path); + free(meta2_path); + free(unsafe_path); + free(collision_path); + free(dir); + return 0; +} diff --git a/tests/ds4_kv_cache_benefit_test.c b/tests/ds4_kv_cache_benefit_test.c new file mode 100644 index 00000000..fd617ff8 --- /dev/null +++ b/tests/ds4_kv_cache_benefit_test.c @@ -0,0 +1,355 @@ +#include "../ds4.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +typedef struct { + char *ptr; + size_t len; + size_t cap; +} test_buf; + +typedef struct { + int top1_a; + int top1_b; + int nonfinite; + double rms; + float max_abs; + bool same_top1; +} logit_cmp; + +static void fail(const char *msg) { + fprintf(stderr, "ds4_kv_cache_benefit_test: %s\n", msg); + exit(1); +} + +#define CHECK(cond, msg) do { if (!(cond)) fail(msg); } while (0) + +static double now_sec(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9; +} + +static void *xmalloc(size_t n) { + void *p = malloc(n ? n : 1); + CHECK(p != NULL, "malloc failed"); + return p; +} + +static char *xstrdup(const char *s) { + size_t n = strlen(s); + char *out = xmalloc(n + 1); + memcpy(out, s, n + 1); + return out; +} + +static void buf_reserve(test_buf *b, size_t add) { + if (b->len + add + 1 <= b->cap) return; + size_t cap = b->cap ? 
b->cap * 2 : 4096; + while (cap < b->len + add + 1) cap *= 2; + char *p = realloc(b->ptr, cap); + CHECK(p != NULL, "realloc failed"); + b->ptr = p; + b->cap = cap; +} + +static void buf_append(test_buf *b, const char *s, size_t n) { + buf_reserve(b, n); + memcpy(b->ptr + b->len, s, n); + b->len += n; + b->ptr[b->len] = '\0'; +} + +static void buf_puts(test_buf *b, const char *s) { + buf_append(b, s, strlen(s)); +} + +static int env_int(const char *name, int def, int min, int max) { + const char *s = getenv(name); + if (!s || !s[0]) return def; + char *end = NULL; + long v = strtol(s, &end, 10); + if (!end || *end || v < min || v > max) { + fprintf(stderr, "ds4_kv_cache_benefit_test: ignoring invalid %s=%s\n", + name, s); + return def; + } + return (int)v; +} + +static const char *model_path(void) { + const char *path = getenv("DS4_TEST_MODEL"); + return path && path[0] ? path : "ds4flash.gguf"; +} + +static char *make_prompt_text(int lines) { + test_buf b = {0}; + buf_puts(&b, + "KV cache benchmark corpus. Every line below is deterministic and " + "contains canary facts that should survive exact model-state restore.\n"); + for (int i = 0; i < lines; i++) { + char line[256]; + snprintf(line, sizeof(line), + "Fact %04d: project=DS4 cache_test=enabled " + "canary=CANARY-BENCH-%04d checksum=%08x " + "instruction=preserve-prefix-state-without-refill.\n", + i, i, (unsigned)(i * 2654435761u)); + buf_puts(&b, line); + } + return b.ptr ? 
b.ptr : xstrdup(""); +} + +static void build_prompt(ds4_engine *engine, int target_tokens, int ctx, + ds4_tokens *prompt, int *lines_out) { + int lines = 32; + for (int attempt = 0; attempt < 18; attempt++) { + char *text = make_prompt_text(lines); + ds4_tokens_free(prompt); + memset(prompt, 0, sizeof(*prompt)); + ds4_encode_chat_prompt(engine, "", text, DS4_THINK_NONE, prompt); + free(text); + + if (prompt->len >= target_tokens && prompt->len + 256 < ctx) break; + if (prompt->len + 256 >= ctx && lines > 8) { + lines = (lines * 3) / 4; + if (lines < 8) lines = 8; + } else if (prompt->len < target_tokens) { + lines *= 2; + } else { + break; + } + } + CHECK(prompt->len > 256, "benchmark prompt too small"); + CHECK(prompt->len + 256 < ctx, "benchmark prompt does not fit context"); + if (lines_out) *lines_out = lines; +} + +static char *temp_payload_path(void) { + const char *base = getenv("TMPDIR"); + if (!base || !base[0]) base = "/tmp"; + char tmpl[PATH_MAX]; + snprintf(tmpl, sizeof(tmpl), "%s/ds4-kv-cache-benefit-%ld-XXXXXX", + base, (long)getpid()); + int fd = mkstemp(tmpl); + CHECK(fd >= 0, strerror(errno)); + close(fd); + return xstrdup(tmpl); +} + +static void progress_cb(void *ud, const char *event, int current, int total) { + const char *label = ud ? 
(const char *)ud : "sync"; + if (strcmp(event, "prefill_chunk")) return; + if (current == 0 || current == total || current % 512 == 0) { + fprintf(stderr, "ds4-kv-benefit: %s prefill %d/%d\n", + label, current, total); + } +} + +static int logit_argmax(const float *x, int n) { + int best = -1; + float best_v = -FLT_MAX; + for (int i = 0; i < n; i++) { + if (!isfinite(x[i])) continue; + if (best < 0 || x[i] > best_v) { + best = i; + best_v = x[i]; + } + } + return best; +} + +static logit_cmp compare_logits(const float *a, const float *b, int n) { + logit_cmp c = {0}; + c.top1_a = logit_argmax(a, n); + c.top1_b = logit_argmax(b, n); + c.same_top1 = c.top1_a >= 0 && c.top1_a == c.top1_b; + double sumsq = 0.0; + for (int i = 0; i < n; i++) { + if (!isfinite(a[i]) || !isfinite(b[i])) { + c.nonfinite++; + continue; + } + float d = b[i] - a[i]; + float ad = fabsf(d); + if (ad > c.max_abs) c.max_abs = ad; + sumsq += (double)d * (double)d; + } + c.rms = sqrt(sumsq / (double)n); + return c; +} + +static uint64_t file_size_or_die(const char *path) { + struct stat st; + CHECK(stat(path, &st) == 0, strerror(errno)); + CHECK(st.st_size >= 0, "negative file size"); + return (uint64_t)st.st_size; +} + +int main(void) { + const int ctx = env_int("DS4_KV_BENCH_CTX", 4096, 1024, 262144); + const int target_tokens = env_int("DS4_KV_BENCH_TARGET_TOKENS", + ctx / 2, 256, ctx - 512); + + ds4_engine *engine = NULL; + ds4_engine_options opt = { + .model_path = model_path(), +#ifdef __APPLE__ + .backend = DS4_BACKEND_METAL, +#else + .backend = DS4_BACKEND_CUDA, +#endif + .quality = false, + }; + CHECK(ds4_engine_open(&engine, &opt) == 0, "failed to open DS4 engine"); + const int vocab = ds4_engine_vocab_size(engine); + CHECK(vocab > 0, "invalid vocab size"); + + ds4_tokens prompt = {0}; + int prompt_lines = 0; + build_prompt(engine, target_tokens, ctx, &prompt, &prompt_lines); + + ds4_session *base = NULL; + CHECK(ds4_session_create(&base, engine, ctx) == 0, "failed to create base 
session"); + char err[256] = {0}; + ds4_session_set_progress(base, progress_cb, "base"); + double t0 = now_sec(); + CHECK(ds4_session_sync(base, &prompt, err, sizeof(err)) == 0, + err[0] ? err : "base prefill failed"); + double base_sync_sec = now_sec() - t0; + ds4_session_set_progress(base, NULL, NULL); + CHECK(ds4_session_pos(base) == prompt.len, "base session token count mismatch"); + + float *base_logits = xmalloc((size_t)vocab * sizeof(*base_logits)); + CHECK(ds4_session_copy_logits(base, base_logits, vocab) == vocab, + "failed to copy base logits"); + + uint64_t payload_bytes = ds4_session_payload_bytes(base); + CHECK(payload_bytes > 0, "base session has no KV payload"); + char *payload_path = temp_payload_path(); + FILE *fp = fopen(payload_path, "wb"); + CHECK(fp != NULL, strerror(errno)); + t0 = now_sec(); + CHECK(ds4_session_save_payload(base, fp, err, sizeof(err)) == 0, + err[0] ? err : "failed to save KV payload"); + CHECK(fclose(fp) == 0, "failed to close KV payload"); + double save_sec = now_sec() - t0; + CHECK(file_size_or_die(payload_path) == payload_bytes, + "payload byte count mismatch"); + + ds4_session *restored = NULL; + CHECK(ds4_session_create(&restored, engine, ctx) == 0, + "failed to create restored session"); + fp = fopen(payload_path, "rb"); + CHECK(fp != NULL, strerror(errno)); + t0 = now_sec(); + CHECK(ds4_session_load_payload(restored, fp, payload_bytes, + err, sizeof(err)) == 0, + err[0] ? 
err : "failed to load KV payload"); + double load_sec = now_sec() - t0; + fclose(fp); + CHECK(ds4_session_pos(restored) == prompt.len, + "restored session token count mismatch"); + + float *loaded_logits = xmalloc((size_t)vocab * sizeof(*loaded_logits)); + CHECK(ds4_session_copy_logits(restored, loaded_logits, vocab) == vocab, + "failed to copy loaded logits"); + logit_cmp base_cmp = compare_logits(base_logits, loaded_logits, vocab); + CHECK(base_cmp.nonfinite == 0, "non-finite logits after KV load"); + CHECK(base_cmp.same_top1, "KV load changed top-1 token"); + CHECK(base_cmp.max_abs <= 1.0e-4f, "KV load changed base logits"); + + ds4_tokens suffix = {0}; + ds4_tokenize_text(engine, + "\n\nKV cache continuation probe: report CANARY-BENCH-0042 exactly once.", + &suffix); + CHECK(suffix.len > 0 && suffix.len < 128, "unexpected suffix token count"); + ds4_tokens extended = {0}; + ds4_tokens_copy(&extended, &prompt); + for (int i = 0; i < suffix.len; i++) ds4_tokens_push(&extended, suffix.v[i]); + CHECK(extended.len + 64 < ctx, "extended prompt does not fit context"); + + int common = ds4_session_common_prefix(restored, &extended); + int cached = common == ds4_session_pos(restored) && + extended.len >= ds4_session_pos(restored) ? common : 0; + int restored_prefill_tokens = extended.len - cached; + CHECK(cached == prompt.len, "restored session did not retain prompt prefix"); + CHECK(restored_prefill_tokens == suffix.len, + "restored session would prefill more than the suffix"); + + ds4_session_set_progress(restored, progress_cb, "restored-suffix"); + t0 = now_sec(); + CHECK(ds4_session_sync(restored, &extended, err, sizeof(err)) == 0, + err[0] ? 
err : "suffix sync failed"); + double suffix_sync_sec = now_sec() - t0; + ds4_session_set_progress(restored, NULL, NULL); + CHECK(ds4_session_pos(restored) == extended.len, + "restored suffix token count mismatch"); + + float *restored_suffix_logits = xmalloc((size_t)vocab * sizeof(*restored_suffix_logits)); + CHECK(ds4_session_copy_logits(restored, restored_suffix_logits, vocab) == vocab, + "failed to copy restored suffix logits"); + + ds4_session *full = NULL; + CHECK(ds4_session_create(&full, engine, ctx) == 0, + "failed to create full-prefill session"); + ds4_session_set_progress(full, progress_cb, "full"); + t0 = now_sec(); + CHECK(ds4_session_sync(full, &extended, err, sizeof(err)) == 0, + err[0] ? err : "full prefill failed"); + double full_sync_sec = now_sec() - t0; + ds4_session_set_progress(full, NULL, NULL); + + float *full_logits = xmalloc((size_t)vocab * sizeof(*full_logits)); + CHECK(ds4_session_copy_logits(full, full_logits, vocab) == vocab, + "failed to copy full logits"); + logit_cmp extended_cmp = compare_logits(full_logits, restored_suffix_logits, vocab); + CHECK(extended_cmp.nonfinite == 0, "non-finite logits after suffix sync"); + CHECK(extended_cmp.same_top1, + "KV restore plus suffix changed top-1 versus full prefill"); + + printf("kv-cache-benefit: prompt_lines=%d base_tokens=%d suffix_tokens=%d " + "full_prefill_tokens=%d restored_prefill_tokens=%d saved_prefill_tokens=%d " + "payload_bytes=%" PRIu64 " base_sync_sec=%.3f save_sec=%.3f " + "load_sec=%.3f suffix_sync_sec=%.3f full_extended_sync_sec=%.3f " + "base_top1_equal=%s base_max_abs=%g extended_top1_equal=%s " + "extended_max_abs=%g extended_rms=%g quality_guard=logits_equivalence\n", + prompt_lines, prompt.len, suffix.len, + extended.len, restored_prefill_tokens, + extended.len - restored_prefill_tokens, + payload_bytes, base_sync_sec, save_sec, + load_sec, suffix_sync_sec, full_sync_sec, + base_cmp.same_top1 ? "true" : "false", base_cmp.max_abs, + extended_cmp.same_top1 ? 
"true" : "false", + extended_cmp.max_abs, extended_cmp.rms); + + unlink(payload_path); + free(payload_path); + free(full_logits); + free(restored_suffix_logits); + free(loaded_logits); + free(base_logits); + ds4_tokens_free(&extended); + ds4_tokens_free(&suffix); + ds4_tokens_free(&prompt); + ds4_session_free(full); + ds4_session_free(restored); + ds4_session_free(base); + ds4_engine_close(engine); + return 0; +}