diff --git a/.gitignore b/.gitignore
index 22860799..535aad4d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,10 @@
 /ds4_native
 /ds4_server_test
 /ds4_test
+/tests/ds4_agent_context_test
+/tests/ds4_agent_git_test
+/tests/ds4_kv_cache_benefit_test
+/tests/generated/
 /ds4flash.gguf
 /TODO.md
 /gguf/
diff --git a/Makefile b/Makefile
index 27283ba0..9acffe3b 100644
--- a/Makefile
+++ b/Makefile
@@ -33,7 +33,7 @@ CPU_CORE_OBJS = ds4_cpu.o
 METAL_LDLIBS := $(LDLIBS)
 endif
 
-.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression
+.PHONY: all help clean test test-agent-context-loop test-agent-context-compact-canary test-agent-context-self-improvement test-kv-cache-benefit cpu cuda cuda-spark cuda-generic cuda-regression
 
 ifeq ($(UNAME_S),Darwin)
 all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
@@ -43,6 +43,14 @@ help:
 	@echo "  make              Build Metal ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
 	@echo "  make cpu          Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
 	@echo "  make test         Build and run tests"
+	@echo "  make test-agent-context-loop"
+	@echo "                    Run slow DS4-generated agent context loop e2e"
+	@echo "  make test-agent-context-compact-canary"
+	@echo "                    Run slow DS4-generated compaction canary e2e"
+	@echo "  make test-agent-context-self-improvement"
+	@echo "                    Run slow DS4-generated KV self-improvement e2e"
+	@echo "  make test-kv-cache-benefit"
+	@echo "                    Run optional KV restore benefit benchmark"
 	@echo "  make clean        Remove build outputs"
 
 ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
@@ -57,15 +65,15 @@ ds4-bench: ds4_bench.o $(CORE_OBJS)
 ds4-eval: ds4_eval.o $(CORE_OBJS)
 	$(CC) $(CFLAGS) -o $@ ds4_eval.o $(CORE_OBJS) $(METAL_LDLIBS)
 
-ds4-agent: ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS)
-	$(CC) $(CFLAGS) -o $@ ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS)
+ds4-agent: ds4_agent.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS)
+	$(CC) $(CFLAGS) -o $@ ds4_agent.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS)
 
-cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS)
+cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS)
 	$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_kvstore.o rax.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-eval ds4_eval_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
-	$(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
+	$(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
 
 cuda-regression:
 	@echo "cuda-regression requires a CUDA build"
@@ -79,6 +87,14 @@ help:
 	@echo "  make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
 	@echo "  make cpu                 Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
 	@echo "  make test                Build and run tests"
+	@echo "  make test-agent-context-loop"
+	@echo "                           Run slow DS4-generated agent context loop e2e"
+	@echo "  make test-agent-context-compact-canary"
+	@echo "                           Run slow DS4-generated compaction canary e2e"
+	@echo "  make test-agent-context-self-improvement"
+	@echo "                           Run slow DS4-generated KV self-improvement e2e"
+	@echo "  make test-kv-cache-benefit"
+	@echo "                           Run optional KV restore benefit benchmark"
 	@echo "  make clean               Remove build outputs"
 
 cuda-spark:
@@ -107,15 +123,15 @@ ds4-bench: ds4_bench.o $(CORE_OBJS)
 ds4-eval: ds4_eval.o $(CORE_OBJS)
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
-ds4-agent: ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS)
+ds4-agent: ds4_agent.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS)
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
-cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS)
+cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS)
 	$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_kvstore.o rax.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-eval ds4_eval_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
-	$(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
+	$(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_agent_context.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
 
 cuda-regression: tests/cuda_long_context_smoke
 	./tests/cuda_long_context_smoke
@@ -136,9 +152,12 @@ ds4_bench.o: ds4_bench.c ds4.h
 ds4_eval.o: ds4_eval.c ds4.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_eval.c
 
-ds4_agent.o: ds4_agent.c ds4.h ds4_kvstore.h ds4_web.h linenoise.h
+ds4_agent.o: ds4_agent.c ds4.h ds4_agent_context.h ds4_kvstore.h ds4_web.h linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_agent.c
 
+ds4_agent_context.o: ds4_agent_context.c ds4_agent_context.h
+	$(CC) $(CFLAGS) -c -o $@ ds4_agent_context.c
+
 ds4_web.o: ds4_web.c ds4_web.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_web.c
 
@@ -151,6 +170,12 @@ ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_kvstore.h rax.h
 tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
 	$(CC) $(CFLAGS) -I. -c -o $@ tests/cuda_long_context_smoke.c
 
+tests/ds4_agent_context_test.o: tests/ds4_agent_context_test.c ds4_agent_context.h
+	$(CC) $(CFLAGS) -I. -c -o $@ tests/ds4_agent_context_test.c
+
+tests/ds4_kv_cache_benefit_test.o: tests/ds4_kv_cache_benefit_test.c ds4.h
+	$(CC) $(CFLAGS) -I. -c -o $@ tests/ds4_kv_cache_benefit_test.c
+
 rax.o: rax.c rax.h rax_malloc.h
 	$(CC) $(CFLAGS) -c -o $@ rax.c
 
@@ -172,7 +197,7 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h
 ds4_eval_cpu.o: ds4_eval.c ds4.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_eval.c
 
-ds4_agent_cpu.o: ds4_agent.c ds4.h ds4_kvstore.h ds4_web.h linenoise.h
+ds4_agent_cpu.o: ds4_agent.c ds4.h ds4_agent_context.h ds4_kvstore.h ds4_web.h linenoise.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_agent.c
 
 ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS)
@@ -184,6 +209,16 @@ ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc
 tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
+tests/ds4_agent_context_test: tests/ds4_agent_context_test.o ds4_agent_context.o
+	$(CC) $(CFLAGS) -o $@ tests/ds4_agent_context_test.o ds4_agent_context.o $(LDLIBS)
+
+tests/ds4_kv_cache_benefit_test: tests/ds4_kv_cache_benefit_test.o $(CORE_OBJS)
+ifeq ($(UNAME_S),Darwin)
+	$(CC) $(CFLAGS) -o $@ tests/ds4_kv_cache_benefit_test.o $(CORE_OBJS) $(METAL_LDLIBS)
+else
+	$(NVCC) $(NVCCFLAGS) -o $@ tests/ds4_kv_cache_benefit_test.o $(CORE_OBJS) $(CUDA_LDLIBS)
+endif
+
 ds4_test: ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS)
 ifeq ($(UNAME_S),Darwin)
 	$(CC) $(CFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
@@ -191,9 +226,22 @@ else
 	$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS)
 endif
 
-test: ds4_test ds4-eval
+test: ds4_test ds4-eval tests/ds4_agent_context_test
 	./ds4-eval --self-test-extractors
+	./tests/ds4_agent_context_test
 	./ds4_test
 
+test-agent-context-loop: ds4-agent tests/ds4_agent_context_test
+	sh tests/ds4_agent_context_loop_e2e.sh
+
+test-agent-context-compact-canary: ds4-agent
+	sh tests/ds4_agent_context_compact_canary_e2e.sh
+
+test-agent-context-self-improvement: ds4-agent
+	sh tests/ds4_agent_context_self_improvement_e2e.sh
+
+test-kv-cache-benefit: tests/ds4_kv_cache_benefit_test
+	./tests/ds4_kv_cache_benefit_test
+
 clean:
-	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o tests/ds4_agent_context_test tests/ds4_agent_context_test.o tests/ds4_kv_cache_benefit_test tests/ds4_kv_cache_benefit_test.o
diff --git a/docs/agent-kv-context-fixes.md b/docs/agent-kv-context-fixes.md
new file mode 100644
index 00000000..a6095324
--- /dev/null
+++ b/docs/agent-kv-context-fixes.md
@@ -0,0 +1,291 @@
+# Agent KV Context Hardening Fixes
+
+This branch isolates a small set of hardening fixes for the agent context and
+KV restore path. The goal is not to add more agent features here. The goal is
+to make the existing context checkpoint, restore, and compaction behavior fail
+more safely, report more honestly, and be easier for the model to call
+correctly.
+
+The fixes were driven by tests. In particular, the compaction canary e2e exposed
+that a malformed compaction summary could be accepted as if it were useful
+state. The runtime now rejects that case instead of rebuilding the live context
+from bad memory.
+
+## 1. Compaction Rejects Broken Summaries
+
+This does not mean DS4 stops compacting.
+
+It means:
+
+- DS4 tries to compact normally.
+- To compact, it asks the model for a useful summary of the current task state.
+- If the summary is useful, DS4 uses it.
+- If the summary is almost empty or clearly broken, DS4 rejects it.
+
+Before:
+
+```text
+broken summary -> DS4 uses it anyway -> possible corrupted task memory
+```
+
+Now:
+
+```text
+broken summary -> DS4 aborts compaction -> corrupted memory is not used
+```
+
+How it is implemented:
+
+- `agent_compact_summary_has_signal()` checks that the generated summary has a
+  minimal amount of real text, not just a few tag-like words.
+- The internal compaction prompt asks for plain text headings or bullets and
+  explicitly rejects XML/HTML-like tool markup.
+- If the check fails, compaction returns an error and invalidates the live KV
+  session before any rebuilt context is accepted.
+
+Verification:
+
+- `make test-agent-context-compact-canary`
+- The test requires DS4 to compact, then write five canary facts only after
+  compaction.
+- The harness checks that compaction really happened, reduced the token count,
+  the recent tail late enough, and preserved all canary values.
+
+## 2. KV Restore Does Not Leave Half-Restored State
+
+This does not make KV restore more limited.
+
+It means:
+
+- DS4 tries to load a saved KV cache.
+- Then it checks that the loaded cache matches the context metadata.
+- If the cache and metadata match, restore succeeds.
+- If they do not match, DS4 avoids leaving the live session in an ambiguous
+  half-restored state.
+
+Before:
+
+```text
+partial restore -> mismatch detected later -> possible session/transcript drift
+```
+
+Now:
+
+```text
+invalid restore -> live KV is invalidated or resynced -> no half-restored state
+```
+
+How it is implemented:
+
+- The restore path validates loaded token counts against checkpoint metadata.
+- If validation fails after a KV load, the live session is invalidated so the
+  next operation cannot accidentally continue from the bad KV state.
+
+Verification:
+
+- `make test`
+- Context unit tests cover checkpoint metadata loading and incompatible restore
+  handling.
+
+## 3. Restore Metrics Are Explicit About Expected Versus Actual Savings
+
+This does not change how the cache itself works.
+
+It means:
+
+- DS4 can estimate how many prefill tokens should be avoided by restoring KV.
+- After restore, DS4 can also observe what actually happened during sync.
+- If expected and actual behavior differ, the model-visible notice should not
+  hide that difference.
+
+Before:
+
+```text
+expected savings -> shown as if they definitely happened
+```
+
+Now:
+
+```text
+expected/actual savings -> reported more clearly -> less misleading feedback
+```
+
+How it is implemented:
+
+- Restore bookkeeping tracks expected saved prefill tokens separately from
+  actual cached-token observations after sync.
+- The restore notice and trace avoid presenting estimates as stronger proof
+  than they are.
+
+Verification:
+
+- `make test`
+- `make test-kv-cache-benefit`
+- The benchmark compares a full prefill against a restored-prefix run and
+  reports `full_prefill_tokens`, `restored_prefill_tokens`, and
+  `saved_prefill_tokens`.
+
+## 4. Tool Schemas List The Allowed Actions
+
+This does not reduce the tool's capability.
+
+It means:
+
+- The context tool accepts a fixed set of action names.
+- The schema now tells the model exactly which actions are valid.
+- The model has less room to invent plausible but wrong action names.
+
+Before:
+
+```text
+action = any string -> model may invent an invalid action -> runtime error
+```
+
+Now:
+
+```text
+action = one of the allowed names -> fewer avoidable tool-call errors
+```
+
+How it is implemented:
+
+- The context tool schema uses a JSON `enum` for action values such as
+  `status`, `checkpoint`, `list`, `restore`, `compact`, and `drop`.
+
+Verification:
+
+- `make test`
+- The schema is model-visible and the dispatch path still rejects unknown
+  actions at runtime.
+
+## 5. Metadata Parsing Is Key-Aware
+
+This does not change the checkpoint metadata format.
+
+It means:
+
+- DS4 still reads the same JSON metadata files.
+- It no longer finds fields by blindly searching for a word anywhere in the
+  file.
+- It distinguishes a real key from the same text appearing inside a value.
+
+Before:
+
+```text
+search raw text -> possible confusion between key and value
+```
+
+Now:
+
+```text
+read the actual key -> metadata is interpreted more reliably
+```
+
+How it is implemented:
+
+- The metadata reader now scans for JSON object keys instead of using a plain
+  substring search.
+- Tests cover pathological values that contain text resembling other keys.
+
+Verification:
+
+- `make test`
+- The context unit test covers metadata roundtrip and key-aware lookup.
+
+## 6. Benchmark Build Artifact Is Ignored
+
+This is only repository hygiene.
+
+It means:
+
+- `make test-kv-cache-benefit` may build
+  `tests/ds4_kv_cache_benefit_test`.
+- That generated binary should not make `git status` look dirty after the test.
+
+Before:
+
+```text
+run benchmark -> generated binary appears as untracked file
+```
+
+Now:
+
+```text
+run benchmark -> generated binary is ignored -> working tree remains clean
+```
+
+How it is implemented:
+
+- `.gitignore` includes `/tests/ds4_kv_cache_benefit_test`.
+
+Verification:
+
+- `make test-kv-cache-benefit`
+- `git status --short`
+
+## 7. Adaptive Self-Improvement E2E Scope
+
+This test demonstrates the agent loop, not a real DS4 code optimization.
+
+It means:
+
+- DS4 is given a temporary repository with a small failing Python project.
+- DS4 must inspect repository state, fix the bug, run the tests, inspect the
+  diff, checkpoint the context, restore it, and prove the tests still pass.
+- If the native Git tool is available, the prompt asks DS4 to use it for
+  `status` and `diff`.
+- If the native Git tool is not available, the same test falls back to the
+  existing `bash` path with `git status --short` and `git diff`.
+
+Before:
+
+```text
+context tools work in isolated calls -> less proof of agent-level usefulness
+```
+
+Now:
+
+```text
+agent fixes a controlled project -> checkpoints -> restores -> verifies state
+```
+
+The limitation is intentional. This test does not claim that DS4 found and
+optimized DS4's own C code. A stronger follow-up test should run against DS4
+itself: ask the agent to inspect the repository, choose one small measurable
+optimization, implement it, run the relevant benchmark or e2e check, inspect
+the diff, checkpoint, restore, and record whether the metric improved.
+
+That DS4-on-DS4 loop is the ideal product demonstration, but it is a slower and
+less deterministic test than this PR should require by default. The controlled
+temporary repository keeps this PR's regression signal clear while preserving a
+direct path to the stronger self-optimization loop.
+
+Verification:
+
+- `make test-agent-context-self-improvement`
+- The generated ledger records `git_status_mode`, `git_diff_mode`,
+  `context_checkpoint_before`, `context_checkpoint_after`,
+  `context_restore_used`, `tests_before_restore`, and `tests_after_restore`.
+
+## Test Plan
+
+Run:
+
+```sh
+make test
+make test-agent-context-compact-canary
+make test-kv-cache-benefit
+make test-agent-context-self-improvement
+git status --short
+```
+
+Expected result:
+
+- default C tests pass,
+- compaction canary e2e passes,
+- context self-improvement e2e passes; it uses native Git tooling when that
+  tool is present, and falls back to `bash`-run `git status` / `git diff` when
+  this branch is tested without the Git-tool PR,
+- KV benefit benchmark reports a large `saved_prefill_tokens` value,
+- `git status --short` shows only intentional source changes before commit, and
+  is clean after commit.
diff --git a/docs/agent-kv-context-tools.md b/docs/agent-kv-context-tools.md
new file mode 100644
index 00000000..d31a75cb
--- /dev/null
+++ b/docs/agent-kv-context-tools.md
@@ -0,0 +1,894 @@
+# Agent KV Context Tools: Analysis And Implementation Plan
+
+## Goal
+
+Give `ds4-agent` a native tool for controlling its own context state without
+exposing raw KV internals to the model.
+
+The important distinction is that the agent should not read or write arbitrary
+KV bytes. The useful feature is semantic control over checkpoints, restore
+points, and context compaction. The tool should operate on transcript and
+session checkpoints together, preserving the invariant that the visible
+conversation and live `ds4_session` state describe the same timeline.
+
+## Existing System Constraints
+
+`ds4_session` is one mutable inference timeline. It owns the live KV cache and
+logits, while callers provide full token prefixes to `ds4_session_sync()` so the
+session can reuse, extend, or rebuild graph state.
+
+`ds4-agent` already has user-facing slash commands for related operations:
+
+- `/save` persists the current session under `~/.ds4/kvcache`.
+- `/switch` loads a saved session and restores transcript plus KV payload.
+- `/compact` asks the model for a durable summary and rebuilds the transcript.
+- `/new` resets to the system/tool prompt.
+- `/del` deletes a saved session.
+- `/strip` removes a persisted KV payload while preserving rendered text.
+
+Those commands are controlled by the user. The proposed feature gives the model
+a narrower tool-level API so it can manage expensive context deliberately during
+long autonomous work.
+
+The relevant invariants are:
+
+- A checkpoint is valid only if `ds4_session_tokens(w->session)` matches
+  `w->transcript`.
+- Restore must replace transcript and KV state as one operation.
+- A context restore does not revert filesystem, process, network, or browser
+  side effects.
+- Active bash jobs are external state and must either block restore or be
+  explicitly surfaced in the restored transcript.
+- Compaction can temporarily put private compaction prompts into live KV; any
+  failed compaction must invalidate live session state before continuing.
+- For server/API usage, exact DSML replay must remain byte-for-byte compatible
+  with the rendered history. For `ds4-agent`, sampled DSML is already preserved
+  directly in the transcript, but the same principle applies: never rewrite a
+  tool-call turn into a semantically similar but token-different form.
+
+## Proposed Tool Surface
+
+Use one DSML tool named `context` with an `action` parameter instead of many
+separate tool names. This keeps the system prompt smaller and makes future
+actions easier to add without teaching the model a large new catalog.
+
+```json
+{
+  "type": "function",
+  "function": {
+    "name": "context",
+    "description": "Inspect, checkpoint, restore, or compact the agent context.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "action": {
+          "type": "string",
+          "enum": ["status", "checkpoint", "list", "restore", "compact", "drop"]
+        },
+        "id": {"type": "string"},
+        "label": {"type": "string"},
+        "reason": {"type": "string"},
+        "allow_side_effect_mismatch": {"type": "boolean"},
+        "dry_run": {"type": "boolean"}
+      },
+      "required": ["action"]
+    }
+  }
+}
+```
+
+Initial actions:
+
+- `status`: report transcript length, session position, context size, free
+  tokens, dirty session state, side-effect epoch, active bash jobs, and known
+  checkpoints.
+- `checkpoint`: save a named restore point at the current stable transcript.
+- `list`: list known checkpoints.
+- `restore`: restore a checkpoint if side-effect rules allow it.
+- `compact`: request the existing compaction path with an explicit reason.
+- `drop`: delete checkpoint metadata and its associated payload when safe.
+  In the first implementation, "safe" means the checkpoint id resolves
+  unambiguously, paths remain inside the context directory, and no bash job is
+  running. The tool does not understand semantic roles such as "best baseline";
+  callers should use `dry_run=true` before deleting important checkpoints.
+
+Phase 1 should be disk-backed and reuse the existing agent KV save/load path.
+That avoids holding multiple huge KV payloads in RAM and keeps the first
+experiment close to the existing `/save` and `/switch` implementation.
+
+## Concrete Use Cases
+
+1. Deep codebase exploration checkpoint.
+
+An agent reads architecture files, traces call graphs, and builds a high-value
+mental model. Before trying an implementation, it calls
+`context action=checkpoint label="repo-map-before-fix"`. If the first
+implementation path fails, restore avoids re-prefilling the whole exploration
+history.
+
+2. Alternative patch strategies.
+
+Before changing a shared subsystem, the agent creates a checkpoint, implements
+approach A, runs tests, then restores and tries approach B. This is useful when
+both alternatives require long reasoning from the same inspected context.
+Filesystem changes still need explicit version-control or file rollback, so
+restore must warn when side effects happened after the checkpoint.
+
+3. Compaction quality recovery.
+
+The agent checkpoints before forced compaction. If the compacted summary loses
+critical details, the agent can restore the pre-compaction checkpoint and retry
+with a better compaction reason, smaller tool output, or a manual summary.
+
+4. Long web research reuse.
+
+The agent searches and visits several pages, creating large rendered Markdown
+observations. A checkpoint lets it try different conclusions or implementation
+plans without paying the same browser and prefill cost again.
+
+5. Risky tool-call loop guard.
+
+Before a sequence of generated `edit` and `bash` calls, the agent checkpoints.
+If it starts following a wrong path, the user or model can restore the reasoning
+state while separately deciding whether to keep, revert, or inspect filesystem
+effects.
+
+6. Parser and prompt experiments.
+
+Developers working on DSML parsing, forced syntax, or tool visualization can
+restart from the same prompt frontier and compare generated tool calls under
+different prompt wording or sampling knobs.
+
+7. Large session navigation.
+
+The agent can preserve named frontiers such as "after reading tests",
+"after reproducing bug", and "before final refactor". This gives long local
+sessions an internal navigation model instead of relying only on `/save` and
+manual `/switch`.
+
+8. Bounded experimental loops.
+
+The agent can run a disciplined optimization loop from one baseline context:
+checkpoint the baseline, write an experiment ledger, propose a hypothesis,
+materialize it in code, measure it, and either save the improved state or record
+the failed attempt and restore to the baseline. The ledger survives restore, so
+failed attempts do not need to remain in the model transcript to remain useful.
+This is especially useful for prompt, parser, quality, and performance
+experiments where many attempts share the same expensive codebase understanding.
+
+Example flow:
+
+```text
+context checkpoint label=baseline-before-tool-parser-loop
+write experiment.md with goal, metric, max_attempts, current_attempt=0
+attempt 1: record hypothesis in experiment.md, edit code, run tests
+if tests improve: record success, checkpoint label=best-attempt-1
+if tests regress: record failure in experiment.md, restore baseline with reason
+restore notice tells the model to reread experiment.md before attempt 2
+stop when metric passes or current_attempt reaches max_attempts
+```
+
+## Safety Model
+
+The implementation should add a monotonically increasing `world_epoch` owned by
+the agent worker. Increment it for successful operations that may change
+external state:
+
+- `write`
+- `edit`
+- `bash`
+- `bash_stop`
+- future filesystem mutation tools
+
+Read-only tools such as `read`, `search`, `list`, `google_search`, and
+`visit_page` do not increment `world_epoch`.
+
+Every context checkpoint stores the current `world_epoch`. A restore where the
+current epoch differs from the checkpoint epoch should fail by default with a
+clear message:
+
+```text
+Tool error: restore would rewind model context from world_epoch=7 to 4, but
+external side effects may still exist. Revert or inspect those effects, or call
+context restore with allow_side_effect_mismatch=true.
+```
+
+Even with `allow_side_effect_mismatch=true`, the tool result must say that only
+model context was restored. It must not claim that files, commands, browser
+state, or network side effects were reverted.
+
+On agent startup, initialize `world_epoch` from the maximum epoch found in
+existing checkpoint metadata. That keeps persisted checkpoints usable after a
+restart while ensuring new side effects in the current process advance beyond
+the restored baseline.
+
+Restores should also fail while a bash job is running. A running process is a
+live external dependency whose output may still arrive after the restored
+transcript.
+
+### Restore Notice
+
+A model-initiated restore must not be silent. A raw restore of transcript plus
+KV would move the model back to the checkpoint and erase the very reason it
+decided to restore. The default tool behavior should therefore be:
+
+```text
+load checkpoint transcript + KV
+append synthetic restore notice
+continue from restored transcript plus notice
+```
+
+The restore notice becomes the first event after the restored checkpoint. It
+should be inserted as a tool result or equivalent user-visible control message
+after the restored transcript has been loaded. It must include:
+
+- checkpoint id and label,
+- restore reason supplied by the model or user,
+- restored transcript token count,
+- checkpoint `world_epoch` and current `world_epoch`,
+- whether side-effect mismatch was allowed,
+- a warning that files, subprocesses, browser state, network effects, and other
+  external state were not reverted,
+- a compact summary of known post-checkpoint side effects when available,
+- an explicit warning when the in-memory side-effect history has been truncated
+  and older post-checkpoint side effects may have been dropped.
+
+Example:
+
+```text
+Context restored from checkpoint 7e1c2b1a label=after-repo-map.
+Reason: approach A failed because parser regression test X still failed.
+Restored model context to 18420 tokens. world_epoch restored=3 current=7.
+External side effects were not reverted; inspect or revert files/processes
+separately before assuming the workspace matches this checkpoint.
+```
+
+This means restore creates a coherent continuation, not a perfect time machine.
+The agent retains the expensive pre-checkpoint context and receives a short
+explanation of why the failed attempt was discarded.
+
+## Critical Assessment: Agent And Server
+
+The opportunity is real, but it is not the same feature in `ds4-agent` and
+`ds4-server`.
+
+In `ds4-agent`, the process has one user, one live worker, one transcript, and
+one obvious owner of side effects. A context tool can be powerful because it
+lets the model preserve expensive frontiers, checkpoint before risky work,
+recover from weak compaction, and write durable notes for later reuse.
+
+In `ds4-server`, the same surface is harder to support, for two reasons. First,
+API requests are stateless, may come from multiple clients, and are serialized
+through one live backend session. Second, the server currently returns tool calls
+to clients; it does not run native server-side tools in the way `ds4-agent` runs
+`read`, `edit`, `bash`, or `context`. The server can reuse KV prefixes safely,
+but a model-generated `restore` would be a mutation of the single global live
+timeline. Without an explicit session owner, a checkpoint is just a global object
+in a shared cache.
+
+The feature therefore has two layers:
+
+- Computational continuity: checkpoint/restore of transcript plus KV.
+- Semantic continuity: structured memory files that record what the agent
+  learned.
+
+The first layer saves prefill. The second saves reasoning. Both are needed for
+the tool to be genuinely useful.
+
+### Opportunities
+
+- Long local coding sessions can avoid repeated high-cost prefill after a repo
+  exploration or web research phase.
+- The agent can create named frontiers before risky edits, prompt experiments,
+  tool loops, or compaction.
+- Structured memory can preserve architecture facts, invariants, decisions, and
+  open questions even after compaction or restart.
+
+### Difficulties
+
+- A KV checkpoint is not semantic memory. It preserves state, but not a compact
+  object the model can inspect cheaply.
+- Restore does not revert the world. Files, subprocesses, browser state,
+  network effects, and external APIs remain changed.
+- The current server has no authenticated tenant, owner, or session namespace.
+  Adding one is a prerequisite for writable multi-user context controls.
+- Stateless clients may resend a history that disagrees with a server-side
+  restore. The server must prefer explicit session-control semantics over
+  implicit tool behavior.
+- Exact DSML replay remains fragile if checkpoint movement loses the sampled
+  tool-call bytes or maps them to the wrong request/session.
+- Future concurrent or multi-slot serving can race on checkpoint metadata and
+  memory files unless writes are serialized per namespace.
+
+## Structured Memory Storage
+
+Do not store structured memory inside the KV payload. Store it next to the
+checkpoint as a separate, readable artifact:
+
+```text
+~/.ds4/kvcache/context/
+  <checkpoint-id>.kv
+  <checkpoint-id>.meta.json
+  <checkpoint-id>.memory.md
+```
+
+`<checkpoint-id>.kv` stores transcript plus DS4 session payload.
+
+`<checkpoint-id>.meta.json` stores machine-readable metadata:
+
+```json
+{
+  "id": "7e1c2b1a...",
+  "label": "repo-map-before-fix",
+  "created_at": 1780000000,
+  "world_epoch": 3,
+  "transcript_tokens": 18420,
+  "kv_path": "7e1c2b1a.kv",
+  "memory_path": "7e1c2b1a.memory.md",
+  "memory_sha1": "..."
+}
+```
+
+`<checkpoint-id>.memory.md` stores model-readable semantic memory:
+
+```md
+# Context Memory
+
+## Goal
+## Files Inspected
+## Architecture Facts
+## Invariants
+## Decisions
+## Commands And Results
+## Risks
+## Open Questions
+## Next Steps
+```
+
+This separation matters because memory can be regenerated, diffed, inspected,
+loaded selectively, or retained after a KV payload is stripped.
+
+For the first experiment, memory files should be created by the normal file
+tools or by the existing compaction-style paths. They are useful artifacts, but
+they are not required for checkpoint and restore to work.
+
+## Experiment Ledgers
+
+Long autonomous improvement loops need a durable record that is not rewound by
+context restore. Store that record as a Markdown ledger outside the checkpoint
+payload, either as a memory artifact associated with a checkpoint or as a named
+experiment file referenced by checkpoint metadata.
+
+The ledger should be append-oriented and machine-readable enough for the agent
+to enforce its own budget:
+
+```md
+# Experiment Loop
+
+## Goal
+Reduce DSML tool-call failures without regressing server behavior.
+
+## Success Metric
+- `./ds4_test --tool-call-quality` improves or stays stable
+- `./ds4_test --server` has no regressions
+
+## Budget
+max_attempts: 5
+current_attempt: 2
+
+## Baseline
+checkpoint: 7e1c2b1a
+score: ...
+
+## Attempts
+
+### Attempt 1
+Prompt: ...
+Hypothesis: ...
+Patch: ...
+Tests: ...
+DS4 response: ...
+Result: failed
+Reason: parser regression X
+Decision: discard
+
+### Attempt 2
+Hypothesis: ...
+Status: in_progress
+```
+
+For DS4-generated loop tests, keep both levels of evidence:
+
+- a compact model-written ledger with `ds4_prompt` and `ds4_response` fields;
+- a harness-written report that preserves the exact prompt sent to DS4, the raw
+  DS4 output, and the generated ledger.
+
+The budget is not just a suggestion in prose. For the first experiment, the
+agent should reread the ledger after each restore and stop when
+`current_attempt >= max_attempts`, when the success metric is met, or when
+restore safety checks fail. A later loop controller can enforce the same rule
+programmatically. After a failed attempt, the expected flow is:
+
+```text
+append failure result to ledger
+restore baseline checkpoint
+append restore notice that points to the ledger
+start the next hypothesis
+```
+
+After a successful attempt, the agent should update the ledger, save a new
+checkpoint, and mark it as the new best state. This turns context checkpoints
+into clean restart points and the Markdown ledger into the durable memory of the
+search process.
+
+## Verified Server Session Model
+
+The current `ds4-server` does not have an explicit remote session or owner
+concept. This was verified against the server implementation:
+
+- `server` owns one `ds4_session *session`, one disk KV cache handle, one tool
+  memory map, and a small set of live continuation bindings.
+- HTTP client threads parse requests and enqueue stack-owned jobs. A single
+  `worker_main()` dequeues jobs and mutates the one live session.
+- `http_request` stores only method, path, body, and body length. Header parsing
+  reads `Content-Length`; it does not keep `Authorization`, API key, tenant,
+  session, organization, or user headers.
+- `/v1/chat/completions`, `/v1/messages`, and `/v1/completions` parse protocol
+  payload fields into rendered prompts and skip unknown JSON fields. OpenAI
+  `user`, metadata, or similar caller fields are not retained as identity.
+- `/v1/responses` explicitly rejects non-null `previous_response_id` and
+  `conversation` because DS4 does not implement the durable Responses store.
+- Disk KV cache lookup is keyed by rendered byte prefix plus compatibility
+  checks such as quantization and context size. It is not keyed by user, owner,
+  tenant, or application session.
+- The live Responses, Anthropic, and thinking continuation structures bind
+  recent tool call ids or visible transcript bytes to the current live token
+  frontier. They are process-local accelerators, not durable session ownership.
+
+Therefore, the server currently has stateless API semantics with one mutable
+worker-owned timeline. The right server default is prefix-cache reuse, not
+server-side conversation ownership.
+
+## MVP Boundary
+
+The first experiment is agent-only. `ds4-agent` is the only current runtime with
+all required semantics in one place:
+
+- one transcript owner,
+- one worker-owned `ds4_session`,
+- slash-command save/switch/compact precedents,
+- side-effect visibility for `edit`, `write`, `bash`, and browser tools,
+- active bash job tracking,
+- a natural place to report restore warnings to the user.
+
+The implementation should stay close to the existing local KV save/load path.
+It may factor small helper functions for metadata, atomic writes, and
+compatibility checks, but a general storage abstraction is not required before
+the first working tool.
+
+The first implementation should add only the state needed by the agent worker:
+checkpoint metadata, a `world_epoch` counter, and enough recent side-effect
+summary text to make restore notices useful. It should not introduce a new
+global context subsystem before the native tool proves useful.
+
+For `ds4-server`, the verified model above is enough guidance for the MVP: keep
+automatic prefix-cache reuse as the server behavior, and do not add
+model-visible restore semantics to stateless API traffic.
+
+## Branch Boundary
+
+This branch intentionally owns only agent context and KV checkpoint support.
+Native Git support lives in `feature/agent-git-tools` and can be merged through
+`feature/agent-kv-git-integration` when both feature lines need to work
+together.
+
+The context branch must not include or link `ds4_agent_git.*`. When an
+integration branch combines both features, mutating Git actions should be
+recorded as ordinary side effects in `world_epoch`, just like `write`, `edit`,
+and `bash`.
+
+## Implementation Plan
+
+### Phase 1: Disk-backed context checkpoints
+
+Add worker-owned checkpoint state and disk metadata:
+
+```c
+typedef struct agent_context_checkpoint {
+    char id[41];
+    char *label;
+    char *path;
+    uint64_t created_at;
+    uint64_t world_epoch;
+    int transcript_tokens;
+    struct agent_context_checkpoint *next;
+} agent_context_checkpoint;
+```
+
+Store checkpoint files below:
+
+```text
+~/.ds4/kvcache/context/<id>.kv
+~/.ds4/kvcache/context/<id>.meta.json
+```
+
+Use existing save/load helpers where possible:
+
+- Save with `agent_kv_save_path()`.
+- Load with `agent_kv_load_path()`.
+- Reuse `agent_worker_sync_tokens()` for stripped or text-only rebuild paths.
+- Keep the worker thread as the only owner of `w->session` mutation.
+
+`id` should be generated independently from the display label, for example from
+random bytes plus checkpoint metadata. The label is user/model-facing display
+text, not the stable identity.
+
+### Phase 2: Tool dispatch
+
+Add `context` to the tool schema prompt and dispatch it in
+`agent_execute_tool_call()`.
+
+The handler should parse:
+
+- `action`
+- `id`
+- `label`
+- `reason`
+- `allow_side_effect_mismatch`
+- `dry_run`
+
+The action handler should return compact machine-readable text. Example:
+
+```text
+context action=checkpoint id=7e1c2b1a label=before-parser-refactor tokens=18420 world_epoch=3
+context action=compact status=ok old_tokens=28500 new_tokens=9400 removed_tokens=19100 reduction_percent=67.0 summary_tokens=2100 tail_tokens=7000
+```
+
+Restore appends a model-visible notice that includes KV reuse accounting:
+
+```text
+KV restore expected metrics: checkpoint_tokens=18420 expected_restore_notice_tokens=140 expected_restored_tokens=18560 expected_prefill_suffix_tokens=140 expected_full_prefill_tokens_without_kv=18560 expected_saved_prefill_tokens=18420.
+```
+
+This makes the benefit concrete for both the implementation and the model:
+restoring the checkpoint loads the old prefix from KV, then only the synthetic
+restore notice is expected to be prefilled. The word `expected` is intentional:
+the notice is built before the final sync that appends it, while trace output
+records the actual cached/suffix counts observed by `ds4_session_sync()`.
+
+## Correctness Verification Measures
+
+1. Transcript and session equality.
+
+After every checkpoint and restore:
+
+```text
+agent_tokens_equal(ds4_session_tokens(w->session), &w->transcript) == true
+ds4_session_pos(w->session) == w->transcript.len
+```
+
+2. Prefix reuse measurement.
+
+For a restore from a disk KV payload, the next sync to the same transcript
+should report zero prefill suffix. For stripped checkpoints, the suffix may be
+non-zero, and the tool result must say it rebuilt from rendered text.
+
+For model-initiated restores, the actual post-restore transcript should be the
+checkpoint transcript plus the synthetic restore notice. Verification should
+measure both values separately: zero prefill for loading the checkpoint payload,
+then a small append for the notice.
+
+If payload tokens and metadata tokens disagree after a KV load, restore must not
+leave the live session at the loaded payload while the transcript still points
+to the previous conversation. It should resynchronize the live session to the
+current transcript or invalidate the session before returning the error.
+
+The `context status` output should expose the live-cache view as
+`cached_tokens` and `prefill_suffix_tokens`, so the agent can tell whether the
+current transcript will reuse KV or force a rebuild.
+
+3. Next-token equivalence.
+
+Before checkpoint, copy logits with `ds4_session_copy_logits()`. After restore,
+copy logits again and compare:
+
+- exact token position equality,
+- same argmax token,
+- top-k ids match,
+- float deltas are zero or within a backend-specific tolerance.
+
+4. Side-effect epoch enforcement.
+
+Create a checkpoint, run `edit` or `bash`, then attempt restore. Expected:
+
+- restore fails without `allow_side_effect_mismatch=true`,
+- restore succeeds with the override,
+- restore notice explicitly warns that external effects were not reverted and
+  names the epoch mismatch.
+
+5. Active bash job guard.
+
+Start a long-running bash job, checkpoint or restore depending on policy, and
+verify that restore is denied while the job is running. After `bash_stop`, the
+same restore should follow normal side-effect rules.
+
+6. Compaction interaction.
+
+Checkpoint before compaction, compact, then restore. Expected:
+
+- transcript returns to the checkpoint token count,
+- model-initiated restore appends a restore notice after that checkpoint,
+- private compaction prompt text is absent,
+- live session is synchronized to restored transcript,
+- no stale compaction summary remains unless it was part of the checkpoint.
+
+7. Corrupt or incompatible checkpoint handling.
+
+Corrupt a checkpoint file or change quant/context metadata. Expected:
+
+- restore fails,
+- live session is invalidated only if load already touched it,
+- transcript is not replaced with partial data,
+- error text identifies the reason.
+
+8. Persistence across restart.
+
+Save a context checkpoint, exit `ds4-agent`, restart, list checkpoints, restore
+the checkpoint, and verify token count plus next-token equivalence where the
+same model/backend are available.
+
+9. DS4-generated experiment loop.
+
+Run the slow e2e target:
+
+```sh
+make test-agent-context-loop
+```
+
+This test is intentionally not part of default `make test`: it requires a real
+model, a usable backend, and enough time for a short agent turn. The prompt in
+`tests/ds4_agent_context_loop_prompt.md` requires DS4 itself to:
+
+- create an experiment ledger with `write`,
+- record the compact prompt and final DS4 response in that ledger,
+- measure a DS4-owned helper test with `bash`,
+- update the ledger with `edit`,
+- create a model-visible `context checkpoint`,
+- finish with `LOOP_DONE`.
+
+The shell harness verifies the generated ledger, the prompt/response report,
+and the checkpoint metadata. It also writes
+`tests/generated/ds4_agent_context_loop_report.md` plus separate persisted
+prompt, response, and ledger files with the exact expanded prompt, the raw DS4
+output, and the generated ledger. It does not synthesize the loop in C; the
+point is to test whether the model can operate the new tool surface in the
+intended loop shape.
+
+10. KV cache benefit benchmark.
+
+Run the optional benchmark target:
+
+```sh
+make test-kv-cache-benefit
+```
+
+This target is intentionally separate from default `make test` because it opens
+the real model and backend. It verifies:
+
+- a saved KV payload reloads to the same token position,
+- restored logits have the same argmax and near-zero delta versus the original
+  checkpoint state,
+- extending the restored session requires prefill only for the suffix,
+- a fresh full prefill to the same extended transcript has the same top-1 next
+  token as KV-restore-plus-suffix,
+- the report prints `full_prefill_tokens`, `restored_prefill_tokens`,
+  `saved_prefill_tokens`, payload bytes, and wall-clock timings.
+
+The hallucination claim should be phrased conservatively: the deterministic
+guard is model-state equivalence. If logits/argmax match after restore, the KV
+path has not introduced state drift. Compaction can reduce context pressure,
+but factual quality after compaction still depends on the summary and must be
+tested with task-specific e2e prompts.
+
+11. Compaction canary retention e2e.
+
+Run the optional compaction-quality target:
+
+```sh
+make test-agent-context-compact-canary
+```
+
+This target is intentionally separate from default `make test`: it asks DS4 to
+operate the `context compact` tool, places five canary facts before a long
+irrelevant padding block, and then requires DS4 to write the canaries into a
+ledger only after compaction. The harness verifies:
+
+- the trace contains `compacted reason="canary-retention-test"`,
+- the compaction trace reports a reduced token count and a late enough recent
+  tail start,
+- the post-compaction ledger exists,
+- all five canary values survived,
+- the final response marker is present.
+
+This is still not a general hallucination benchmark. It is a focused task-level
+guard that checks whether compaction preserves facts explicitly marked as
+critical for the next action while those facts are pushed out of the recent
+verbatim tail.
+
+12. Adaptive context self-improvement e2e.
+
+Run the optional self-improvement target:
+
+```sh
+make test-agent-context-self-improvement
+```
+
+This target is intentionally separate from default `make test`: it requires a
+real model/backend and asks DS4 to operate a complete agent loop. The harness
+creates a temporary repository with a small failing Python project, then the
+prompt requires DS4 to:
+
+- create a context checkpoint before changing the project,
+- inspect repository status,
+- read and fix the failing code,
+- run the project test,
+- inspect the resulting diff,
+- create a second context checkpoint after the test passes,
+- restore from that checkpoint,
+- inspect repository status again,
+- run the test again,
+- write a ledger with the exact actions and final result.
+
+The Git inspection step is adaptive. If the model-visible schemas include the
+native Git tool, the prompt asks DS4 to use it for `status` and `diff`. If that
+tool is absent, the same prompt requires the existing `bash` path with
+`git status --short` and `git diff`. This keeps the context/KV branch
+independent from the Git-tool branch while still letting the same test exercise
+the native Git path after integration.
+
+This test is not a claim that DS4 has optimized DS4 itself. It is a controlled
+regression test for the agent loop shape: inspect, edit, test, diff,
+checkpoint, restore, retest, and record evidence.
+
+The stronger product demonstration should be a DS4-on-DS4 optimization loop:
+ask DS4 to inspect this repository, select one small measurable optimization,
+implement it, run the relevant benchmark or e2e check, inspect the source diff,
+checkpoint and restore the successful state, and record whether the metric
+improved. That is the right next target, but it should remain an optional slow
+evaluation because it is more expensive and less deterministic than a controlled
+temporary-repository regression.
+
+### Resume Point: 2026-05-25
+
+The DS4-generated context loop was run successfully with:
+
+```sh
+make test-agent-context-loop
+```
+
+The first sandboxed attempt failed because the sandbox could not access Metal.
+The successful run was executed outside the sandbox and produced:
+
+- `tests/generated/ds4_agent_context_loop_report.md`
+- `tests/generated/ds4_agent_context_loop_prompt.md`
+- `tests/generated/ds4_agent_context_loop_output.txt`
+- `tests/generated/ds4_agent_context_loop_ledger.md`
+
+The generated ledger recorded:
+
+```text
+ds4_prompt=validate DS4's own agent context loop capability
+ds4_response=LOOP_DONE
+attempt=1 status=pass
+attempt=1 metric=ds4_agent_context_test passed
+```
+
+Useful result: the loop proved that DS4 can operate the intended tool sequence:
+
+```text
+write -> bash -> edit -> context checkpoint -> final response
+```
+
+It also proved that the harness now captures the full evidence chain: expanded
+prompt, raw model response, generated ledger, and checkpoint metadata.
+
+Observed weakness: the prompt explicitly said `Do not explain the plan in
+prose`, but DS4 still emitted conversational text such as:
+
+```text
+I'll execute the loop step by step.
+The test succeeded (exit_status=0). Now I'll edit the file to mark success.
+The attempt passed. Now I'll checkpoint the context.
+```
+
+This did not break the current harness because the final ledger and checkpoint
+were correct, but it gives the next self-improvement loop a concrete target:
+improve DS4's adherence to tool-only execution when the prompt requests no
+prose.
+
+Next loop to run from here:
+
+1. Inspect baseline state with external version-control commands or with the
+   independent Git branch after integration.
+2. Ask DS4 to propose a small DS4-owned improvement for tool-only adherence.
+3. Materialize the hypothesis in a Markdown experiment ledger.
+4. Implement one minimal change.
+5. Measure with a focused e2e check that fails when raw DS4 output contains
+   unexpected prose before/between required tool calls.
+6. If the metric improves, checkpoint and record the source diff in the ledger.
+7. If it does not improve, record the failure and restore/retry from the saved
+   context frontier.
+
+## Exploration And Implementation Loop
+
+Use this loop for each `context` action before merging its implementation:
+
+1. Define a concrete agent scenario.
+
+Write the starting transcript shape, tool calls involved, expected checkpoint
+state, and external side effects.
+
+2. Run the scenario against the current implementation.
+
+Capture transcript token count, session position, world epoch, active bash job
+state, and checkpoint id.
+
+3. Assert invariants.
+
+Check token equality, session position, side-effect policy, and tool result
+clarity.
+
+4. Measure cost.
+
+Record save latency, restore latency, prefill suffix tokens, checkpoint payload
+bytes, and whether restore avoided a cold rebuild.
+
+5. Break it intentionally.
+
+Try stale ids, corrupt files, active jobs, side-effect mismatch, stripped
+payloads, and interrupted compaction.
+
+6. Tighten the implementation.
+
+Add the missing guard, simplify the API, or improve the tool result before
+moving to the next action.
+
+## First Loop Batch
+
+The first implementation pass should cover these scenarios in order:
+
+| Scenario | Purpose | Expected result |
+| --- | --- | --- |
+| `status` on fresh sysprompt | establish baseline | reports ctx, pos, transcript tokens, no checkpoints |
+| `checkpoint` after one user turn | prove save path | checkpoint id returned, token/session equality holds |
+| `restore` with no side effects | prove load path | checkpoint is loaded with zero prefill suffix, then restore notice is appended |
+| `restore` after `edit` | prove guard | denied unless override is set |
+| `compact` then `restore` | prove compaction safety | restored state has no leaked private summary |
+| running `bash` then `restore` | prove live process guard | restore denied until job is stopped |
+| failed-attempt retry after restore | prove model usability | model uses restore notice to abandon the failed attempt and try a different strategy |
+| DS4-generated loop e2e | prove model tool use | DS4 writes a ledger, records prompt/response, runs a DS4 helper test, records pass/fail, checkpoints passing state |
+
+## Open Design Decisions
+
+- Whether `checkpoint` should be allowed while the session is dirty but idle.
+  The likely answer is yes, after forcing `agent_worker_sync_tokens()`.
+- Whether model-initiated `restore` should require user confirmation in
+  interactive mode. For Phase 1, deny side-effect mismatch by default and do not
+  prompt from inside the tool.
+- Whether to expose an explicit `hard_restore` action for tests and manual
+  debugging. The default model-visible `restore` should append a restore notice;
+  hard restore should not be the autonomous path.
+- Whether these controls should also be exposed through slash commands. The
+  initial implementation can keep `/save` and `/switch` unchanged and expose
+  only the DSML `context` tool.
+- How experiment loops coordinate context restore with workspace rollback. The
+  MVP can warn through `world_epoch` and require explicit cleanup before
+  override.
+
+## Non-goals
+
+- No arbitrary KV byte editing.
+- No filesystem rollback.
+- No promise that browser state or network side effects are restored.
+- No multiple live KV sessions in RAM in Phase 1.
+- No prompt rewriting that changes sampled DSML history.
diff --git a/ds4_agent.c b/ds4_agent.c
index 10b76ea3..fdb65703 100644
--- a/ds4_agent.c
+++ b/ds4_agent.c
@@ -1,4 +1,5 @@
 #include "ds4.h"
+#include "ds4_agent_context.h"
 #include "ds4_kvstore.h"
 #include "ds4_web.h"
 #include "linenoise.h"
@@ -8,6 +9,7 @@
 #include <dirent.h>
 #include <fnmatch.h>
 #include <fcntl.h>
+#include <inttypes.h>
 #include <limits.h>
 #include <math.h>
 #include <poll.h>
@@ -96,6 +98,7 @@ typedef struct {
     ds4_session *session;
     ds4_tokens transcript;
     char *cache_dir;
+    char *context_dir;
     char *sysprompt_path;
     char session_sha[41];
     char *session_title;
@@ -139,10 +142,35 @@ typedef struct {
     bool more_valid;
     agent_bash_job *bash_jobs;
     int next_bash_job_id;
+    uint64_t world_epoch;
+    ds4_agent_side_effects side_effects;
 } agent_worker;
 
 static unsigned agent_next_prefill_label(void);
 
+typedef struct { /* Accounting for one agent_worker_sync_tokens_ex() call. */
+    int old_pos;         /* ds4_session_pos() sampled before the sync */
+    int prompt_tokens;   /* length of the requested token list (tokens->len) */
+    int common_prefix;   /* ds4_session_common_prefix() against the live session */
+    int cached_tokens;   /* common_prefix when the request extends the live session, else 0 */
+    int prefill_tokens;  /* prompt_tokens - cached_tokens: the suffix left to prefill */
+    double elapsed_sec;  /* wall time of the ds4_session_sync() call, measured with now_sec() */
+    int rc;              /* return value of ds4_session_sync() */
+} agent_sync_metrics;
+
+typedef struct { /* Passed by pointer to agent_worker_compact(); field meanings are inferred from names - TODO confirm. */
+    int old_tokens;            /* presumably transcript tokens before compaction */
+    int new_tokens;            /* presumably transcript tokens after compaction */
+    int summary_tokens;        /* presumably tokens taken by the generated summary */
+    int tail_tokens;           /* presumably tokens kept verbatim from the recent tail */
+    int removed_tokens;        /* presumably old_tokens - new_tokens */
+    double reduction_percent;  /* presumably removed_tokens as a percentage of old_tokens */
+} agent_compact_stats;
+
+static bool agent_worker_compact(agent_worker *w, const char *reason,
+                                 char *err, size_t err_len,
+                                 agent_compact_stats *stats);
+
 typedef struct agent_tail_capture {
     char *buf;
     size_t cap;
@@ -803,6 +831,25 @@ static const char agent_tools_prompt_after_edit[] =
     "{\n"
     "  \"type\": \"function\",\n"
     "  \"function\": {\n"
+    "    \"name\": \"context\",\n"
+    "    \"description\": \"Inspect, checkpoint, restore, compact, list, or drop the agent context state. Restore never reverts files, processes, browser state, or network effects.\",\n"
+    "    \"parameters\": {\n"
+    "      \"type\": \"object\",\n"
+    "      \"properties\": {\n"
+    "        \"action\": {\"type\": \"string\", \"enum\": [\"status\", \"checkpoint\", \"list\", \"restore\", \"compact\", \"drop\"]},\n"
+    "        \"id\": {\"type\": \"string\"},\n"
+    "        \"label\": {\"type\": \"string\"},\n"
+    "        \"reason\": {\"type\": \"string\"},\n"
+    "        \"allow_side_effect_mismatch\": {\"type\": \"boolean\"},\n"
+    "        \"dry_run\": {\"type\": \"boolean\"}\n"
+    "      },\n"
+    "      \"required\": [\"action\"]\n"
+    "    }\n"
+    "  }\n"
+    "}\n\n"
+    "{\n"
+    "  \"type\": \"function\",\n"
+    "  \"function\": {\n"
     "    \"name\": \"read\",\n"
     "    \"description\": \"Read a text file or a range of lines.\",\n"
     "    \"parameters\": {\n"
@@ -3927,14 +3974,24 @@ static void worker_answer_queued_user_drain(agent_worker *w, char *text) {
  * cache-saving operation: if the requested transcript extends the live session,
  * only the suffix is prefetched; otherwise the DS4 session rebuilds from the
  * longest common prefix it can retain. */
-static int agent_worker_sync_tokens(agent_worker *w, const ds4_tokens *tokens,
-                                    bool publish_progress,
-                                    char *err, size_t err_len) {
+static int agent_worker_sync_tokens_ex(agent_worker *w, const ds4_tokens *tokens,
+                                       bool publish_progress,
+                                       char *err, size_t err_len,
+                                       agent_sync_metrics *metrics) {
     int old_pos = ds4_session_pos(w->session);
     int common = ds4_session_common_prefix(w->session, tokens);
     int cached = common == old_pos && tokens->len >= old_pos ? common : 0;
     int suffix = tokens->len - cached;
     if (suffix < 0) suffix = tokens->len;
+    if (metrics) {
+        metrics->old_pos = old_pos;
+        metrics->prompt_tokens = tokens->len;
+        metrics->common_prefix = common;
+        metrics->cached_tokens = cached;
+        metrics->prefill_tokens = suffix;
+        metrics->elapsed_sec = 0.0;
+        metrics->rc = 0;
+    }
 
     if (publish_progress) {
         pthread_mutex_lock(&w->mu);
@@ -3956,12 +4013,25 @@ static int agent_worker_sync_tokens(agent_worker *w, const ds4_tokens *tokens,
     ds4_session_set_display_progress(w->session,
                                      publish_progress ? worker_progress_cb : NULL,
                                      publish_progress ? w : NULL);
+    double t0 = now_sec();
     int rc = ds4_session_sync(w->session, tokens, err, err_len);
+    double elapsed = now_sec() - t0;
     ds4_session_set_progress(w->session, NULL, NULL);
     ds4_session_set_display_progress(w->session, NULL, NULL);
+    if (metrics) {
+        metrics->elapsed_sec = elapsed;
+        metrics->rc = rc;
+    }
     return rc;
 }
 
+static int agent_worker_sync_tokens(agent_worker *w, const ds4_tokens *tokens,
+                                    bool publish_progress,
+                                    char *err, size_t err_len) { /* Metrics-free entry point for existing callers. */
+    return agent_worker_sync_tokens_ex(w, tokens, publish_progress,
+                                       err, err_len, NULL); /* NULL metrics: nothing is recorded */
+}
+
 /* Start a new session at the system/tool prompt.  A fixed sysprompt.kv
  * checkpoint avoids paying this prefill cost repeatedly, but only when the
  * rendered prompt text still matches the file.  The same fixed path is shared
@@ -5343,6 +5413,14 @@ static bool agent_tool_result_fits_context(agent_worker *w, const char *result,
     return tokens + reserve_tokens < w->cfg->gen.ctx_size;
 }
 
+static void agent_context_note_side_effect(agent_worker *w, const char *kind,
+                                           const char *detail) { /* Record one tool side effect; write/edit pass the tool name as kind and the path as detail. */
+    if (!w) return; /* no worker, nothing to record */
+    w->world_epoch = ds4_agent_side_effects_note(&w->side_effects, /* the helper's return value becomes the new epoch */
+                                                 w->world_epoch,
+                                                 kind, detail);
+}
+
 /* Read file text for the model.  Normal mode shows plain line numbers.  Raw
  * mode is reserved for cases where line decoration would corrupt the payload
  * being inspected. */
@@ -5440,7 +5518,6 @@ static char *agent_tool_more(agent_worker *w, const agent_tool_call *call) {
 }
 
 static char *agent_tool_write(agent_worker *w, const agent_tool_call *call) {
-    (void)w;
     const char *path = agent_tool_arg_value(call, "path");
     const char *content = agent_tool_arg_value(call, "content");
     if (!path || !path[0]) return xstrdup("Tool error: write requires path\n");
@@ -5463,6 +5540,7 @@ static char *agent_tool_write(agent_worker *w, const agent_tool_call *call) {
         agent_buf_puts(&b, "\n");
         return agent_buf_take(&b);
     }
+    agent_context_note_side_effect(w, "write", path);
     char msg[PATH_MAX + 160];
     snprintf(msg, sizeof(msg), "Wrote %zu bytes to %s\n", len, path);
     return xstrdup(msg);
@@ -5920,6 +5998,8 @@ static char *agent_tool_edit(agent_worker *w, const agent_tool_call *call) {
                                            new_text,
                                            anchored ? "anchored old/new replacement"
                                                     : "old/new replacement");
+    if (strncmp(result, "Tool error:", strlen("Tool error:")) != 0)
+        agent_context_note_side_effect(w, "edit", path);
     free(data);
     return result;
 }
@@ -6721,6 +6801,575 @@ static pid_t agent_tool_pid(const agent_tool_call *call) {
     return (pid_t)agent_parse_int_default(agent_tool_arg_value(call, "pid"), 0, 0, INT_MAX);
 }
 
+/* ============================================================================
+ * Native Context Tool
+ * ============================================================================
+ */
+
+static int agent_context_running_bash_jobs(agent_worker *w) {
+    int live = 0;  /* jobs whose running flag is set once they have been polled */
+    for (agent_bash_job *cur = w->bash_jobs; cur != NULL; cur = cur->next) {
+        agent_bash_poll(cur);
+        live += cur->running ? 1 : 0;
+    }
+    return live;
+}
+
+/* out = SHA-1 hex of 64 /dev/urandom bytes, or of clock/pid data on failure. */
+static void agent_context_generate_id(char out[41]) {
+    uint8_t seed[64] = {0};
+    ssize_t nread = -1;
+    int fd = open("/dev/urandom", O_RDONLY);
+    if (fd >= 0) {
+        nread = read(fd, seed, sizeof(seed));
+        close(fd);
+    }
+    if (nread != (ssize_t)sizeof(seed)) {
+        uint64_t mix[6] = {
+            (uint64_t)time(NULL), (uint64_t)clock(), (uint64_t)getpid(),
+            (uint64_t)(uintptr_t)&seed, (uint64_t)random(), (uint64_t)now_sec(),
+        };
+        memcpy(seed, mix, sizeof(mix));
+    }
+    ds4_kvstore_sha1_bytes_hex(seed, sizeof(seed), out);
+}
+
+/* Render the checkpoint result line: action, id, one-line label, tokens, epoch. */
+static char *agent_context_checkpoint_result(const char id[41], const char *label,
+                                             int tokens, uint64_t epoch,
+                                             bool dry_run) {
+    const char *action = dry_run ? "checkpoint-dry-run" : "checkpoint";
+    char head[256], tail[256];
+    snprintf(head, sizeof(head), "context action=%s id=%.40s label=\"", action, id);
+    snprintf(tail, sizeof(tail), "\" tokens=%d world_epoch=%" PRIu64 "\n",
+             tokens, epoch);
+    char *flat = ds4_agent_context_oneline(label, 160);
+    agent_buf out = {0};
+    agent_buf_puts(&out, head);
+    agent_buf_puts(&out, flat);
+    agent_buf_puts(&out, tail);
+    free(flat);
+    return agent_buf_take(&out);
+}
+
+/* Rebuild *projected as the live transcript plus result as a tool message. */
+static bool agent_context_project_tool_result(agent_worker *w, const char *result,
+                                              ds4_tokens *projected, int *tokens_out) {
+    ds4_tokens_free(projected);
+    ds4_tokens_copy(projected, &w->transcript);
+    ds4_chat_append_message(w->engine, projected, "tool", result != NULL ? result : "");
+    if (tokens_out != NULL) *tokens_out = projected->len;
+    return w->cfg->gen.ctx_size > projected->len + 16;  /* 16 tokens of headroom */
+}
+
+/* context action=checkpoint: save the live KV state, with the checkpoint tool
+ * result already appended, under a fresh id in w->context_dir.  The result text
+ * prints the token count of the transcript including itself, so it is
+ * re-rendered (up to four times) until that count stops changing.  dry_run
+ * returns the result without writing or syncing.  On success the projected
+ * tokens become w->transcript and *already_appended is set. */
+static char *agent_context_checkpoint(agent_worker *w, const agent_tool_call *call,
+                                      bool *already_appended) {
+    const char *label_arg = agent_tool_arg_value(call, "label");
+    bool dry_run = agent_parse_bool_default(agent_tool_arg_value(call, "dry_run"), false);
+    char *label = ds4_agent_context_limited_strdup(label_arg && label_arg[0] ?
+                                               label_arg : "checkpoint", 240);
+    char id[41];
+    char *kv_file = NULL, *meta_file = NULL, *mem_file = NULL;
+    char *kv_path = NULL, *meta_path = NULL;
+    char *result = NULL, *ret = NULL;
+    char err[256] = {0};
+    char ignored_sha[41];
+    ds4_tokens projected = {0};
+    ds4_agent_context_meta meta = {0};
+
+    /* Draw ids until neither the metadata nor the KV file exists (16 tries). */
+    for (int i = 0; i < 16; i++) {
+        agent_context_generate_id(id);
+        free(kv_file);
+        free(meta_file);
+        free(mem_file);
+        free(kv_path);
+        free(meta_path);
+        kv_file = ds4_agent_context_file_name(id, ".kv");
+        meta_file = ds4_agent_context_file_name(id, ".meta.json");
+        mem_file = ds4_agent_context_file_name(id, ".memory.md");
+        kv_path = ds4_agent_context_path_for_file(w->context_dir, kv_file);
+        meta_path = ds4_agent_context_path_for_file(w->context_dir, meta_file);
+        if (access(meta_path, F_OK) != 0 && access(kv_path, F_OK) != 0) break;
+    }
+
+    int tokens = w->transcript.len;
+    for (int i = 0; i < 4; i++) {
+        free(result);
+        result = agent_context_checkpoint_result(id, label, tokens, w->world_epoch, dry_run);
+        int new_tokens = 0;
+        if (!agent_context_project_tool_result(w, result, &projected, &new_tokens)) {
+            /* The rendered result is not returned on this path; free it here. */
+            free(result);
+            ret = xstrdup("Tool error: checkpoint result would exceed context\n");
+            goto cleanup;
+        }
+        if (new_tokens == tokens) break;
+        tokens = new_tokens;
+    }
+
+    if (dry_run) {
+        ret = result;
+        goto cleanup;
+    }
+
+    if (!agent_mkdir_p(w->context_dir)) {
+        snprintf(err, sizeof(err), "failed to create %s", w->context_dir);
+        goto fail;
+    }
+    if (agent_worker_sync_tokens(w, &projected, false, err, sizeof(err)) != 0)
+        goto fail;
+
+    if (!agent_kv_save_path(w, kv_path, &projected, "agent-context",
+                            ignored_sha, NULL, 0, err, sizeof(err)))
+        goto rollback;
+
+    snprintf(meta.id, sizeof(meta.id), "%s", id);
+    meta.label = label;
+    meta.kv_file = kv_file;
+    meta.memory_file = mem_file;
+    meta.created_at = (uint64_t)time(NULL);
+    meta.world_epoch = w->world_epoch;
+    meta.transcript_tokens = projected.len;
+    if (!ds4_agent_context_write_meta(&meta, meta_path, err, sizeof(err)))
+        goto rollback;
+
+    ds4_tokens_free(&w->transcript);
+    w->transcript = projected;
+    memset(&projected, 0, sizeof(projected));
+    pthread_mutex_lock(&w->mu);
+    /* session_dirty tracks the durable /save state, not live KV sync.  The
+     * session was synced to projected above; this marks the visible transcript
+     * as changed because the checkpoint tool result is now part of it. */
+    w->session_dirty = true;
+    w->user_activity = true;
+    w->status.ctx_used = w->transcript.len;
+    agent_wake_locked(w);
+    pthread_mutex_unlock(&w->mu);
+    if (already_appended) *already_appended = true;
+
+    free(meta_path);
+    free(kv_path);
+    free(meta_file);
+    ds4_agent_context_meta_free(&meta);
+    return result;
+
+rollback:
+    /* Drop the KV file and re-sync to the live transcript, else invalidate. */
+    unlink(kv_path);
+    {
+        char sync_err[160] = {0};
+        if (agent_worker_sync_tokens(w, &w->transcript, false,
+                                     sync_err, sizeof(sync_err)) != 0)
+            ds4_session_invalidate(w->session);
+    }
+fail:
+    free(result);
+    {
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: context checkpoint failed: ");
+        agent_buf_puts(&b, err[0] ? err : "unknown error");
+        agent_buf_puts(&b, "\n");
+        ret = agent_buf_take(&b);
+    }
+cleanup:
+    ds4_tokens_free(&projected);
+    free(label);
+    free(kv_file);
+    free(meta_file);
+    free(mem_file);
+    free(kv_path);
+    free(meta_path);
+    return ret;
+}
+
+static void agent_context_append_side_effects_since(agent_worker *w, uint64_t epoch,
+                                                    agent_buf *b) {
+    char *text = ds4_agent_side_effects_summary_since(&w->side_effects, epoch);
+    agent_buf_puts(b, text);
+    free(text);  /* the summary string is released once it has been appended to b */
+}
+
+/* Compose the restore notice: checkpoint id/label, reason, restored token and
+ * epoch line, expected-metrics line, mismatch flag, a warning that external side
+ * effects were not reverted, and the side-effect summary since meta->world_epoch. */
+static char *agent_context_restore_notice(agent_worker *w,
+                                          const ds4_agent_context_meta *meta,
+                                          const char *reason, bool allowed_mismatch,
+                                          int checkpoint_tokens, int restored_tokens,
+                                          int restore_notice_tokens) {
+    const char *why = (reason && reason[0]) ? reason : "not specified";
+    char *flat_label = ds4_agent_context_oneline(meta->label, 160);
+    char *flat_why = ds4_agent_context_oneline(why, 240);
+    agent_buf out = {0};
+    char text[320];
+    snprintf(text, sizeof(text),
+             "Context restored from checkpoint %.40s label=\"%s\".\n",
+             meta->id, flat_label);
+    agent_buf_puts(&out, text);
+    agent_buf_puts(&out, "Reason: ");
+    agent_buf_puts(&out, flat_why);
+    agent_buf_puts(&out, "\n");
+    snprintf(text, sizeof(text),
+             "Restored model context to %d tokens. world_epoch restored=%" PRIu64 " current=%" PRIu64 ".\n",
+             meta->transcript_tokens, meta->world_epoch, w->world_epoch);
+    agent_buf_puts(&out, text);
+
+    ds4_agent_context_restore_metrics expected = {
+        .checkpoint_tokens = checkpoint_tokens,
+        .restore_notice_tokens = restore_notice_tokens,
+        .restored_tokens = restored_tokens,
+    };
+    char *expected_line = ds4_agent_context_restore_expected_metrics_line(&expected);
+    agent_buf_puts(&out, expected_line);
+    free(expected_line);
+    agent_buf_puts(&out, "side_effect_mismatch_allowed=");
+    agent_buf_puts(&out, allowed_mismatch ? "true\n" : "false\n");
+    agent_buf_puts(&out,
+        "External side effects were not reverted; inspect or revert files, processes, browser state, and network effects separately before assuming the workspace matches this checkpoint.\n");
+    agent_context_append_side_effects_since(w, meta->world_epoch, &out);
+    free(flat_why);
+    free(flat_label);
+    return agent_buf_take(&out);
+}
+
+static bool agent_context_resync_live_transcript(agent_worker *w,
+                                                 char *err, size_t err_len) {
+    /* Re-sync the session to w->transcript; if that fails, invalidate it. */
+    int rc = agent_worker_sync_tokens(w, &w->transcript, false, err, err_len);
+    if (rc == 0) return true;
+    ds4_session_invalidate(w->session);
+    return false;
+}
+
+/* context action=restore: load a checkpoint into the live session and append a
+ * restore notice as the tool result.  Guard or lookup failures, and dry_run,
+ * return before anything is loaded.  After the KV file has loaded, each failure
+ * path re-syncs the session to w->transcript or invalidates it.  On success
+ * the restored tokens become w->transcript and *already_appended is set. */
+static char *agent_context_restore(agent_worker *w, const agent_tool_call *call,
+                                   bool *already_appended) {
+    const char *id = agent_tool_arg_value(call, "id");
+    const char *reason = agent_tool_arg_value(call, "reason");
+    bool allow = agent_parse_bool_default(
+        agent_tool_arg_value(call, "allow_side_effect_mismatch"), false);
+    bool dry_run = agent_parse_bool_default(agent_tool_arg_value(call, "dry_run"), false);
+
+    int running = agent_context_running_bash_jobs(w);
+    char guard_err[256] = {0};
+    if (!ds4_agent_context_no_running_bash_guard("restore", running,
+                                                 guard_err, sizeof(guard_err))) {
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: ");
+        agent_buf_puts(&b, guard_err);
+        agent_buf_puts(&b, "\n");
+        return agent_buf_take(&b);
+    }
+
+    ds4_agent_context_meta meta = {0};
+    char *meta_path = NULL, *kv_path = NULL;
+    char err[256] = {0};
+    if (!ds4_agent_context_find_checkpoint(w->context_dir, id, &meta, &meta_path,
+                                           &kv_path, err, sizeof(err))) {
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: ");
+        agent_buf_puts(&b, err);
+        agent_buf_puts(&b, "\n");
+        return agent_buf_take(&b);
+    }
+
+    if (!ds4_agent_context_restore_epoch_guard(w->world_epoch, meta.world_epoch,
+                                               allow, guard_err, sizeof(guard_err))) {
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: ");
+        agent_buf_puts(&b, guard_err);
+        agent_buf_puts(&b, "\n");
+        agent_context_append_side_effects_since(w, meta.world_epoch, &b);
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        return agent_buf_take(&b);
+    }
+
+    if (dry_run) {
+        char *safe = ds4_agent_context_oneline(meta.label, 160);
+        agent_buf b = {0};
+        /* 512: a ~160-byte label plus id, count and both epochs overflowed 256. */
+        char line[512];
+        snprintf(line, sizeof(line),
+                 "context action=restore-dry-run id=%.40s label=\"%s\" tokens=%d world_epoch=%" PRIu64 " current_world_epoch=%" PRIu64 "\n",
+                 meta.id, safe, meta.transcript_tokens, meta.world_epoch, w->world_epoch);
+        agent_buf_puts(&b, line);
+        free(safe);
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        return agent_buf_take(&b);
+    }
+
+    ds4_tokens loaded = {0};
+    if (!agent_kv_load_path(w, kv_path, NULL, NULL, 0, &loaded, NULL,
+                            err, sizeof(err))) {
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: context restore failed: ");
+        agent_buf_puts(&b, err[0] ? err : "failed to load checkpoint");
+        agent_buf_puts(&b, "\n");
+        return agent_buf_take(&b);
+    }
+    if (loaded.len != meta.transcript_tokens) {
+        int meta_tokens = meta.transcript_tokens;
+        int kv_tokens = loaded.len;
+        ds4_tokens_free(&loaded);
+        char sync_err[160] = {0};
+        bool live_resynced = agent_context_resync_live_transcript(w, sync_err,
+                                                                  sizeof(sync_err));
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        agent_buf b = {0};
+        char line[320];
+        snprintf(line, sizeof(line),
+                 "Tool error: context restore failed: metadata tokens=%d but KV tokens=%d; live_session=%s%s%s\n",
+                 meta_tokens, kv_tokens, live_resynced ? "resynced" : "invalidated",
+                 !live_resynced && sync_err[0] ? " error=" : "",
+                 !live_resynced && sync_err[0] ? sync_err : "");
+        agent_buf_puts(&b, line);
+        return agent_buf_take(&b);
+    }
+
+    /* The notice reports its own token cost; re-render until the counts settle. */
+    char *notice = NULL;
+    ds4_tokens restored = {0};
+    int checkpoint_tokens = loaded.len;
+    int notice_tokens = 0, restored_tokens = checkpoint_tokens;
+    for (int i = 0; i < 4; i++) {
+        free(notice);
+        notice = agent_context_restore_notice(w, &meta, reason, allow,
+                                              checkpoint_tokens, restored_tokens,
+                                              notice_tokens);
+        ds4_tokens_free(&restored);
+        ds4_tokens_copy(&restored, &loaded);
+        ds4_chat_append_message(w->engine, &restored, "tool", notice);
+        int new_notice_tokens = restored.len - checkpoint_tokens;
+        int new_restored_tokens = restored.len;
+        if (new_notice_tokens == notice_tokens && new_restored_tokens == restored_tokens)
+            break;
+        notice_tokens = new_notice_tokens;
+        restored_tokens = new_restored_tokens;
+    }
+    ds4_tokens_free(&loaded);
+    if (restored.len + 16 >= w->cfg->gen.ctx_size) {
+        /* Checkpoint KV is loaded: re-sync to the live transcript or invalidate. */
+        char sync_err[160] = {0};
+        (void)agent_context_resync_live_transcript(w, sync_err, sizeof(sync_err));
+        ds4_tokens_free(&restored);
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        free(notice);
+        return xstrdup("Tool error: restore notice would exceed context\n");
+    }
+    agent_sync_metrics sync_metrics = {0};
+    if (agent_worker_sync_tokens_ex(w, &restored, false, err, sizeof(err),
+                                    &sync_metrics) != 0) {
+        ds4_session_invalidate(w->session);
+        ds4_tokens_free(&restored);
+        ds4_agent_context_meta_free(&meta);
+        free(meta_path);
+        free(kv_path);
+        free(notice);
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: context restore failed after load: ");
+        agent_buf_puts(&b, err[0] ? err : "failed to append restore notice");
+        agent_buf_puts(&b, "\n");
+        return agent_buf_take(&b);
+    }
+    agent_trace(w,
+                "context restore id=%.40s checkpoint=%d restored=%d cached=%d suffix=%d elapsed=%.3f",
+                meta.id, checkpoint_tokens, restored.len, sync_metrics.cached_tokens,
+                sync_metrics.prefill_tokens, sync_metrics.elapsed_sec);
+    if (sync_metrics.cached_tokens < checkpoint_tokens ||
+        sync_metrics.prefill_tokens != restored.len - checkpoint_tokens) {
+        agent_trace(w,
+                    "context restore prefill mismatch expected_cached=%d expected_suffix=%d common=%d old_pos=%d",
+                    checkpoint_tokens, restored.len - checkpoint_tokens,
+                    sync_metrics.common_prefix, sync_metrics.old_pos);
+    }
+
+    ds4_tokens_free(&w->transcript);
+    w->transcript = restored;
+    pthread_mutex_lock(&w->mu);
+    w->user_activity = true;
+    /* session_dirty tracks the durable /save state, not live KV sync.  The
+     * session was synced to restored above; this marks the visible transcript
+     * as changed because the restore notice is now part of it. */
+    w->session_dirty = true;
+    w->status.ctx_used = w->transcript.len;
+    agent_wake_locked(w);
+    pthread_mutex_unlock(&w->mu);
+    if (already_appended) *already_appended = true;
+
+    ds4_agent_context_meta_free(&meta);
+    free(meta_path);
+    free(kv_path);
+    return notice;
+}
+
+/* List checkpoints in w->context_dir, one line per readable metadata file
+ * (8-char id prefix, tokens, epoch, creation time, label), or report "none". */
+static char *agent_context_list(agent_worker *w) {
+    DIR *dir = opendir(w->context_dir);
+    if (dir == NULL) return xstrdup("context checkpoints: none\n");
+
+    agent_buf listing = {0};
+    int shown = 0;
+    agent_buf_puts(&listing, "context checkpoints:\n");
+    for (struct dirent *ent = readdir(dir); ent != NULL; ent = readdir(dir)) {
+        if (!ds4_agent_context_meta_filename(ent->d_name)) continue;
+        char *path = ds4_agent_context_path_for_file(w->context_dir, ent->d_name);
+        ds4_agent_context_meta info = {0};
+        char err[160] = {0};
+        if (ds4_agent_context_read_meta_file(path, &info, err, sizeof(err))) {
+            char row[320];
+            char *flat = ds4_agent_context_oneline(info.label, 120);
+            snprintf(row, sizeof(row),
+                     "- id=%.8s tokens=%d world_epoch=%" PRIu64 " created_at=%" PRIu64 " label=\"%s\"\n",
+                     info.id, info.transcript_tokens, info.world_epoch, info.created_at, flat);
+            agent_buf_puts(&listing, row);
+            free(flat);
+            shown++;
+        }
+        ds4_agent_context_meta_free(&info);
+        free(path);
+    }
+    closedir(dir);
+    if (shown > 0) return agent_buf_take(&listing);
+    free(listing.ptr);
+    return xstrdup("context checkpoints: none\n");
+}
+
+/* Report transcript/session token counts, dirty flag, epoch, jobs, checkpoints. */
+static char *agent_context_status(agent_worker *w) {
+    const int total = w->transcript.len;
+    const int pos = ds4_session_pos(w->session);
+    const int ctx = ds4_session_ctx(w->session);
+    const int common = ds4_session_common_prefix(w->session, &w->transcript);
+    /* Nonzero only if the common prefix equals pos and the transcript reaches it. */
+    const int cached = (common == pos && total >= pos) ? common : 0;
+    const int suffix = total - cached < 0 ? total : total - cached;
+    const int jobs = agent_context_running_bash_jobs(w);
+    const int saved = ds4_agent_context_count_checkpoints(w->context_dir);
+    pthread_mutex_lock(&w->mu);
+    const bool dirty = w->session_dirty;
+    pthread_mutex_unlock(&w->mu);
+    char msg[640];
+    snprintf(msg, sizeof(msg),
+             "context status transcript_tokens=%d session_pos=%d cached_tokens=%d prefill_suffix_tokens=%d ctx_size=%d free_tokens=%d dirty=%s world_epoch=%" PRIu64 " active_bash_jobs=%d checkpoints=%d\n",
+             total, pos, cached, suffix, ctx, ctx - pos,
+             dirty ? "true" : "false", w->world_epoch, jobs, saved);
+    return xstrdup(msg);
+}
+
+/* context action=drop: unlink a checkpoint's KV, memory and metadata files.
+ * An error from the running-bash guard or the checkpoint lookup ends the call;
+ * dry_run only echoes the resolved id. */
+static char *agent_context_drop(agent_worker *w, const agent_tool_call *call) {
+    const char *id = agent_tool_arg_value(call, "id");
+    bool dry_run = agent_parse_bool_default(agent_tool_arg_value(call, "dry_run"), false);
+    char guard_err[256] = {0};
+    char err[256] = {0};
+    const char *failure = NULL;
+    int running = agent_context_running_bash_jobs(w);
+    ds4_agent_context_meta meta = {0};
+    char *meta_path = NULL, *kv_path = NULL;
+
+    /* The guard runs first; the lookup is attempted only when it passes. */
+    if (!ds4_agent_context_no_running_bash_guard("drop", running,
+                                                 guard_err, sizeof(guard_err)))
+        failure = guard_err;
+    else if (!ds4_agent_context_find_checkpoint(w->context_dir, id, &meta,
+                                                &meta_path, &kv_path,
+                                                err, sizeof(err)))
+        failure = err;
+    if (failure != NULL) {
+        agent_buf b = {0};
+        agent_buf_puts(&b, "Tool error: ");
+        agent_buf_puts(&b, failure);
+        agent_buf_puts(&b, "\n");
+        return agent_buf_take(&b);
+    }
+
+    char msg[256];
+    char *memory_path = ds4_agent_context_full_memory_path(w->context_dir, &meta);
+    if (dry_run) {
+        snprintf(msg, sizeof(msg), "context action=drop-dry-run id=%.40s\n", meta.id);
+    } else {
+        /* Every file is attempted; an already-missing file (ENOENT) is not a failure. */
+        bool ok = true;
+        if (unlink(kv_path) != 0 && errno != ENOENT) ok = false;
+        if (memory_path && unlink(memory_path) != 0 && errno != ENOENT) ok = false;
+        if (unlink(meta_path) != 0 && errno != ENOENT) ok = false;
+        snprintf(msg, sizeof(msg), "%scontext action=drop id=%.40s\n",
+                 ok ? "" : "Tool error: partial drop failure; ", meta.id);
+    }
+
+    ds4_agent_context_meta_free(&meta);
+    free(meta_path);
+    free(kv_path);
+    free(memory_path);
+    return xstrdup(msg);
+}
+
+/* Dispatch the context tool on its action argument; compact is handled inline. */
+static char *agent_tool_context(agent_worker *w, const agent_tool_call *call,
+                                bool *already_appended) {
+    const char *action = agent_tool_arg_value(call, "action");
+    if (action == NULL || action[0] == '\0')
+        return xstrdup("Tool error: context requires action\n");
+    if (strcmp(action, "status") == 0) return agent_context_status(w);
+    if (strcmp(action, "list") == 0) return agent_context_list(w);
+    if (strcmp(action, "checkpoint") == 0) return agent_context_checkpoint(w, call, already_appended);
+    if (strcmp(action, "restore") == 0) return agent_context_restore(w, call, already_appended);
+    if (strcmp(action, "drop") == 0) return agent_context_drop(w, call);
+    if (strcmp(action, "compact") != 0)
+        return xstrdup("Tool error: unknown context action\n");
+    char err[256] = {0};
+    agent_compact_stats stats = {0};
+    const char *reason = agent_tool_arg_value(call, "reason");
+    if (!reason || !reason[0]) reason = "context tool";
+    if (agent_worker_compact(w, reason, err, sizeof(err), &stats)) {
+        char msg[256];
+        snprintf(msg, sizeof(msg),
+                 "context action=compact status=ok old_tokens=%d new_tokens=%d removed_tokens=%d reduction_percent=%.1f summary_tokens=%d tail_tokens=%d\n",
+                 stats.old_tokens, stats.new_tokens, stats.removed_tokens,
+                 stats.reduction_percent, stats.summary_tokens, stats.tail_tokens);
+        return xstrdup(msg);
+    }
+    agent_buf b = {0};
+    agent_buf_puts(&b, "Tool error: context compact failed: ");
+    agent_buf_puts(&b, err[0] ? err : "unknown error");
+    agent_buf_puts(&b, "\n");
+    return agent_buf_take(&b);
+}
+
+static bool agent_tool_call_requires_exclusive_context(const agent_tool_call *call) {
+    /* True only for context calls whose action is checkpoint, restore or compact. */
+    if (call == NULL || call->name == NULL) return false;
+    if (strcmp(call->name, "context") != 0) return false;
+    const char *act = agent_tool_arg_value(call, "action");
+    if (act == NULL) return false;
+    return !strcmp(act, "checkpoint") || !strcmp(act, "restore") || !strcmp(act, "compact");
+}
+
 /* ============================================================================
  * Tool Dispatch
  * ============================================================================
@@ -6729,8 +7378,10 @@ static pid_t agent_tool_pid(const agent_tool_call *call) {
 /* Execute one parsed DSML tool call and return the text that will be appended as
  * the tool-role result.  UI visualization already happened while streaming; this
  * function is only about side effects and the model-visible observation. */
-static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *call) {
+static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *call,
+                                     bool *already_appended) {
     agent_buf result = {0};
+    if (already_appended) *already_appended = false;
     if (!call->name) return xstrdup("Tool error: missing tool name\n");
 
     if (!strcmp(call->name, "read")) return agent_tool_read(w, call);
@@ -6741,6 +7392,7 @@ static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *cal
     if (!strcmp(call->name, "search")) return agent_tool_search(w, call);
     if (!strcmp(call->name, "google_search")) return agent_tool_google_search(w, call);
     if (!strcmp(call->name, "visit_page")) return agent_tool_visit_page(w, call);
+    if (!strcmp(call->name, "context")) return agent_tool_context(w, call, already_appended);
 
     if (!strcmp(call->name, "bash")) {
         const char *cmd = agent_tool_arg_value(call, "command");
@@ -6756,6 +7408,7 @@ static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *cal
             agent_buf_puts(&result, "\n");
             return agent_buf_take(&result);
         }
+        agent_context_note_side_effect(w, "bash", cmd);
         return agent_bash_job_tool_result(w, job, true, refresh, false, true);
     }
 
@@ -6775,7 +7428,14 @@ static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *cal
                                               60, 1, 3600);
         bool stop = !strcmp(call->name, "bash_stop");
         bool wait = stop;
-        return agent_bash_job_tool_result(w, job, wait, refresh, stop, true);
+        char *res = agent_bash_job_tool_result(w, job, wait, refresh, stop, true);
+        if (stop) {
+            char detail[160];
+            snprintf(detail, sizeof(detail), "job=%d pid=%ld",
+                     job_id, (long)pid);
+            agent_context_note_side_effect(w, "bash_stop", detail);
+        }
+        return res;
     }
 
     {
@@ -6791,10 +7451,25 @@ static char *agent_execute_tool_call(agent_worker *w, const agent_tool_call *cal
 
 /* Execute all tool calls from one DSML block, preserving per-call labels in the
  * combined result so the model can associate observations with calls. */
-static char *agent_execute_tool_calls(agent_worker *w, const agent_tool_calls *calls) {
+static char *agent_execute_tool_calls(agent_worker *w, const agent_tool_calls *calls,
+                                      bool *already_appended) {
     agent_buf all = {0};
+    if (already_appended) *already_appended = false;
+    for (int i = 0; i < calls->len; i++) {
+        if (agent_tool_call_requires_exclusive_context(&calls->v[i]) &&
+            calls->len != 1)
+        {
+            return xstrdup("Tool error: context checkpoint, restore, and compact must be the only tool call in a DSML block\n");
+        }
+    }
     for (int i = 0; i < calls->len; i++) {
-        char *res = agent_execute_tool_call(w, &calls->v[i]);
+        bool one_appended = false;
+        char *res = agent_execute_tool_call(w, &calls->v[i], &one_appended);
+        if (one_appended) {
+            if (already_appended) *already_appended = true;
+            free(all.ptr);
+            return res;
+        }
         char hdr[128];
         snprintf(hdr, sizeof(hdr), "Tool result %d (%s):\n", i + 1,
                  calls->v[i].name ? calls->v[i].name : "unknown");
@@ -6901,6 +7576,7 @@ static char *agent_compact_make_prompt(const char *reason) {
         "- decisions, rejected approaches, known bugs, and pending next steps\n"
         "- reloadable bulky data with exact paths/ranges/commands when available\n\n"
         "Do not invent facts. Do not include generic narration. Do not include raw file contents unless they were essential to a conclusion.\n"
+        "Use plain text headings or bullets. Do not output XML/HTML-like tags such as <context>, <tool_calls>, or <tool_call>.\n"
         "After the summary, stop. Do not continue the user task, do not call tools, and do not output thinking tags or DSML markup.\n"
         "Output only the compact summary.\n");
     if (reason && reason[0]) {
@@ -6911,14 +7587,41 @@ static char *agent_compact_make_prompt(const char *reason) {
     return agent_buf_take(&b);
 }
 
+/* Heuristic gate for compaction summaries: no leading '<', no tool/context/DSML
+ * markup, at least 24 alphanumeric bytes and 6+ alphanumeric runs of length >= 2. */
+static bool agent_compact_summary_has_signal(const char *s) {
+    static const char *const banned[] = {
+        "<tool_calls", "<tool_call", "<context", "｜DSML｜",
+    };
+    while (*s != '\0' && isspace((unsigned char)*s)) s++;
+    if (s[0] == '<') return false;
+    for (size_t i = 0; i < sizeof(banned) / sizeof(banned[0]); i++)
+        if (strstr(s, banned[i]) != NULL) return false;
+    int letters = 0, words = 0, run = 0;
+    for (; *s != '\0'; s++) {
+        bool in_word = isalnum((unsigned char)*s) != 0;
+        if (in_word) { letters++; run++; continue; }
+        words += run >= 2;
+        run = 0;
+    }
+    words += run >= 2;
+    return letters >= 24 && words >= 6;
+}
+
 /* Perform the full compaction exchange and rebuild the live DS4 session from
  * the compacted transcript.  Any failure invalidates live KV because the model
  * may have just seen private compaction instructions that are not part of the
  * real conversation. */
 static bool agent_worker_compact(agent_worker *w, const char *reason,
-                                 char *err, size_t err_len) {
+                                 char *err, size_t err_len,
+                                 agent_compact_stats *stats) {
+    if (stats) memset(stats, 0, sizeof(*stats));
     const int bottom = w->transcript.len;
     if (bottom <= 0) return true;
+    if (stats) {
+        stats->old_tokens = bottom;
+        stats->new_tokens = bottom;
+    }
 
     ds4_tokens sys = {0};
     agent_worker_build_system_tokens(w, &sys);
@@ -7026,8 +7729,10 @@ static bool agent_worker_compact(agent_worker *w, const char *reason,
     agent_publish(w, "\x1b[0m\n", 5);
     ds4_tokens_free(&prompt);
 
-    if (!summary.ptr || !summary.ptr[0]) {
-        snprintf(err, err_len, "compaction summary was empty");
+    if (!summary.ptr || !summary.ptr[0] ||
+        !agent_compact_summary_has_signal(summary.ptr))
+    {
+        snprintf(err, err_len, "compaction summary was empty or malformed");
         ds4_session_invalidate(w->session);
         ds4_tokens_free(&sys);
         free(summary.ptr);
@@ -7045,7 +7750,9 @@ static bool agent_worker_compact(agent_worker *w, const char *reason,
     if (summary_msg.len && summary_msg.ptr[summary_msg.len - 1] != '\n')
         agent_buf_puts(&summary_msg, "\n");
     agent_buf_puts(&summary_msg, "[End compacted summary. Recent conversation continues verbatim below.]\n\n");
+    int before_summary_tokens = compacted.len;
     ds4_chat_append_message(w->engine, &compacted, "system", summary_msg.ptr);
+    int summary_tokens = compacted.len - before_summary_tokens;
     free(summary_msg.ptr);
     free(summary.ptr);
 
@@ -7081,13 +7788,23 @@ static bool agent_worker_compact(agent_worker *w, const char *reason,
     agent_trace(w, "compacted reason=\"%s\" old=%d new=%d tail_start=%d tail=%d",
                 reason ? reason : "", bottom, w->transcript.len,
                 tail_start, bottom - tail_start);
+    if (stats) {
+        stats->old_tokens = bottom;
+        stats->new_tokens = w->transcript.len;
+        stats->summary_tokens = summary_tokens;
+        stats->tail_tokens = bottom - tail_start;
+        stats->removed_tokens = bottom - w->transcript.len;
+        if (stats->removed_tokens < 0) stats->removed_tokens = 0;
+        stats->reduction_percent = bottom > 0 ?
+            ((double)stats->removed_tokens * 100.0) / (double)bottom : 0.0;
+    }
     return true;
 }
 
 static bool agent_worker_compact_if_needed(agent_worker *w, const char *reason,
                                            char *err, size_t err_len) {
     if (!agent_worker_should_compact(w)) return true;
-    return agent_worker_compact(w, reason, err, err_len);
+    return agent_worker_compact(w, reason, err, err_len, NULL);
 }
 
 static int worker_accept_generated_token(agent_worker *w,
@@ -7338,6 +8055,7 @@ static int worker_run_turn(agent_worker *w, const char *user_text) {
         }
 
         char *tool_result;
+        bool tool_result_already_appended = false;
         if (early_tool_error) {
             agent_buf b = {0};
             agent_buf_puts(&b, "Tool error: ");
@@ -7354,15 +8072,17 @@ static int worker_run_turn(agent_worker *w, const char *user_text) {
             agent_buf_puts(&b, agent_dsml_syntax_reminder);
             tool_result = agent_buf_take(&b);
         } else {
-            tool_result = agent_execute_tool_calls(w, &dsml.calls);
+            tool_result = agent_execute_tool_calls(w, &dsml.calls,
+                                                   &tool_result_already_appended);
         }
         int projected_tokens = 0;
-        if (!agent_tool_result_fits_context(w, tool_result,
+        if (!tool_result_already_appended &&
+            !agent_tool_result_fits_context(w, tool_result,
                                             AGENT_TOOL_RESULT_RESERVE_TOKENS,
                                             &projected_tokens))
         {
             if (!agent_worker_compact(w, "tool result would exceed context",
-                                      compact_err, sizeof(compact_err)))
+                                      compact_err, sizeof(compact_err), NULL))
             {
                 free(tool_result);
                 agent_dsml_parser_free(&dsml);
@@ -7392,7 +8112,8 @@ static int worker_run_turn(agent_worker *w, const char *user_text) {
                 }
             }
         }
-        ds4_chat_append_message(w->engine, &w->transcript, "tool", tool_result);
+        if (!tool_result_already_appended)
+            ds4_chat_append_message(w->engine, &w->transcript, "tool", tool_result);
         free(tool_result);
         agent_dsml_parser_free(&dsml);
 
@@ -7499,7 +8220,7 @@ static void worker_run_deferred_compact(agent_worker *w) {
 
     int before = w->transcript.len;
     char err[160] = {0};
-    if (agent_worker_compact(w, "user requested compaction", err, sizeof(err))) {
+    if (agent_worker_compact(w, "user requested compaction", err, sizeof(err), NULL)) {
         if (w->transcript.len != before) {
             pthread_mutex_lock(&w->mu);
             w->session_dirty = true;
@@ -8857,6 +9578,8 @@ static int agent_worker_init(agent_worker *w, ds4_engine *engine, agent_config *
         .log_privdata = w,
     };
     w->web = ds4_web_create(&web_cfg);
+    w->context_dir = ds4_kvstore_path_join(w->cache_dir, "context");
+    w->world_epoch = ds4_agent_context_max_world_epoch(w->context_dir);
     w->sysprompt_path = ds4_kvstore_path_join(w->cache_dir, "sysprompt.kv");
     if (cfg->gen.trace_path && cfg->gen.trace_path[0]) {
         w->trace = fopen(cfg->gen.trace_path, "ab");
@@ -8876,10 +9599,12 @@ static void agent_worker_free(agent_worker *w) {
     worker_stop(w);
     if (w->thread) pthread_join(w->thread, NULL);
     agent_bash_jobs_free(w);
+    ds4_agent_side_effects_free(&w->side_effects);
     ds4_web_free(w->web);
     ds4_session_free(w->session);
     ds4_tokens_free(&w->transcript);
     free(w->cache_dir);
+    free(w->context_dir);
     free(w->sysprompt_path);
     free(w->session_title);
     free(w->legacy_session_path_to_delete);
diff --git a/ds4_agent_context.c b/ds4_agent_context.c
new file mode 100644
index 00000000..054a5a98
--- /dev/null
+++ b/ds4_agent_context.c
@@ -0,0 +1,657 @@
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif
+
+#include "ds4_agent_context.h"
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define DS4_AGENT_CONTEXT_MAX_META_BYTES (1024 * 1024)
+#define DS4_AGENT_CONTEXT_MAX_SIDE_EFFECTS 64
+
+typedef struct {
+    char *ptr;
+    size_t len;
+    size_t cap;
+} ds4_agent_context_buf;
+
+static void ctx_set_err(char *err, size_t err_len, const char *fmt, ...) {
+    if (!err || err_len == 0) return;
+    va_list ap;
+    va_start(ap, fmt);
+    vsnprintf(err, err_len, fmt, ap);
+    va_end(ap);
+}
+
+static void *ctx_xmalloc(size_t n) {
+    void *p = malloc(n ? n : 1);
+    if (!p) {
+        fprintf(stderr, "ds4-agent-context: out of memory\n");
+        abort();
+    }
+    return p;
+}
+
+static void *ctx_xrealloc(void *ptr, size_t n) {
+    void *p = realloc(ptr, n ? n : 1);
+    if (!p) {
+        fprintf(stderr, "ds4-agent-context: out of memory\n");
+        abort();
+    }
+    return p;
+}
+
+/* Heap copy of s made through ctx_xmalloc; a NULL s is duplicated as the
+ * empty string. */
+static char *ctx_xstrdup(const char *s) {
+    const char *src = s ? s : "";
+    size_t bytes = strlen(src) + 1;  /* copy the terminator too */
+    return memcpy(ctx_xmalloc(bytes), src, bytes);
+}
+
+static void ctx_buf_append(ds4_agent_context_buf *b, const char *s, size_t n) {
+    if (n == 0) return;
+    if (b->len + n + 1 > b->cap) {
+        size_t cap = b->cap ? b->cap : 256;
+        while (cap < b->len + n + 1) cap *= 2;
+        b->ptr = ctx_xrealloc(b->ptr, cap);
+        b->cap = cap;
+    }
+    memcpy(b->ptr + b->len, s, n);
+    b->len += n;
+    b->ptr[b->len] = '\0';
+}
+
+static void ctx_buf_puts(ds4_agent_context_buf *b, const char *s) {
+    if (s) ctx_buf_append(b, s, strlen(s));
+}
+
+/* Detach and return the accumulated string, leaving b empty. When nothing
+ * was ever appended (b->ptr is NULL) a newly allocated "" is returned. */
+static char *ctx_buf_take(ds4_agent_context_buf *b) {
+    char *taken = b->ptr;
+    if (taken == NULL) return ctx_xstrdup("");
+    *b = (ds4_agent_context_buf){0};
+    return taken;
+}
+
+static int ctx_read_file_bytes(const char *path, char **data, size_t *len,
+                               char *err, size_t err_len) {
+    FILE *fp = fopen(path, "rb");
+    if (!fp) {
+        ctx_set_err(err, err_len, "%s", strerror(errno));
+        return -1;
+    }
+    if (fseek(fp, 0, SEEK_END) != 0) {
+        ctx_set_err(err, err_len, "%s", strerror(errno));
+        fclose(fp);
+        return -1;
+    }
+    long sz = ftell(fp);
+    if (sz < 0) {
+        ctx_set_err(err, err_len, "%s", strerror(errno));
+        fclose(fp);
+        return -1;
+    }
+    if ((unsigned long)sz > DS4_AGENT_CONTEXT_MAX_META_BYTES) {
+        ctx_set_err(err, err_len, "metadata file too large: %s", path);
+        fclose(fp);
+        return -1;
+    }
+    rewind(fp);
+    char *buf = ctx_xmalloc((size_t)sz + 1);
+    size_t got = fread(buf, 1, (size_t)sz, fp);
+    if (got != (size_t)sz && ferror(fp)) {
+        ctx_set_err(err, err_len, "%s", strerror(errno));
+        free(buf);
+        fclose(fp);
+        return -1;
+    }
+    buf[got] = '\0';
+    fclose(fp);
+    if (data) *data = buf; else free(buf);
+    if (len) *len = got;
+    return 0;
+}
+
+void ds4_agent_context_meta_free(ds4_agent_context_meta *m) {
+    if (!m) return;
+    free(m->label);
+    free(m->kv_file);
+    free(m->memory_file);
+    memset(m, 0, sizeof(*m));
+}
+
+/* A context id is exactly 40 hexadecimal characters (either case). */
+bool ds4_agent_context_id_valid(const char *id) {
+    if (id == NULL) return false;
+    size_t n = 0;
+    while (n < 40 && isxdigit((unsigned char)id[n])) n++;
+    return n == 40 && id[40] == '\0';
+}
+
+/* True when s is one safe file name: non-empty, free of '/' and '\\', and
+ * not "." or "..", which name directories rather than files. */
+bool ds4_agent_context_file_component_safe(const char *s) {
+    if (!s || !s[0]) return false;
+    if (!strcmp(s, ".") || !strcmp(s, "..")) return false;
+    return strpbrk(s, "/\\") == NULL;
+}
+
+char *ds4_agent_context_file_name(const char id[41], const char *suffix) {
+    ds4_agent_context_buf b = {0};
+    ctx_buf_append(&b, id, 40);
+    ctx_buf_puts(&b, suffix);
+    return ctx_buf_take(&b);
+}
+
+/* Join context_dir and file, inserting a '/' unless context_dir already
+ * ends with one; an empty or NULL side yields a copy of the other side. */
+char *ds4_agent_context_path_for_file(const char *context_dir, const char *file) {
+    if (!context_dir || !*context_dir) return ctx_xstrdup(file ? file : "");
+    if (!file || !*file) return ctx_xstrdup(context_dir);
+    ds4_agent_context_buf joined = {0};
+    ctx_buf_puts(&joined, context_dir);
+    if (joined.ptr[joined.len - 1] != '/') ctx_buf_puts(&joined, "/");
+    ctx_buf_puts(&joined, file);
+    return ctx_buf_take(&joined);
+}
+
+/* Duplicate at most max bytes of s (NULL yields ""); always NUL-terminated. */
+char *ds4_agent_context_limited_strdup(const char *s, size_t max) {
+    if (s == NULL) return ctx_xstrdup("");
+    size_t keep = strlen(s);
+    if (keep > max) keep = max;
+    char *copy = memcpy(ctx_xmalloc(keep + 1), s, keep);
+    copy[keep] = '\0';
+    return copy;
+}
+
+/* Copy at most max bytes of s, then turn newline, CR and tab into spaces. */
+char *ds4_agent_context_oneline(const char *s, size_t max) {
+    char *flat = ds4_agent_context_limited_strdup(s, max);
+    for (size_t i = 0; flat[i] != '\0'; i++)
+        if (strchr("\n\r\t", flat[i])) flat[i] = ' ';
+    return flat;
+}
+
+static void ctx_json_escape(ds4_agent_context_buf *b, const char *s) {
+    if (!s) return;
+    for (const unsigned char *p = (const unsigned char *)s; *p; p++) {
+        switch (*p) {
+        case '\\': ctx_buf_puts(b, "\\\\"); break;
+        case '"': ctx_buf_puts(b, "\\\""); break;
+        case '\n': ctx_buf_puts(b, "\\n"); break;
+        case '\r': ctx_buf_puts(b, "\\r"); break;
+        case '\t': ctx_buf_puts(b, "\\t"); break;
+        default:
+            if (*p < 0x20) {
+                char tmp[8];
+                snprintf(tmp, sizeof(tmp), "\\u%04x", *p);
+                ctx_buf_puts(b, tmp);
+            } else {
+                char c = (char)*p;
+                ctx_buf_append(b, &c, 1);
+            }
+            break;
+        }
+    }
+}
+
+static bool ctx_write_atomic_text(const char *path, const char *text,
+                                  char *err, size_t err_len) {  /* write text to "<path>.tmp.XXXXXX", then rename it onto path */
+    ds4_agent_context_buf tmpl = {0};
+    ctx_buf_puts(&tmpl, path);
+    ctx_buf_puts(&tmpl, ".tmp.XXXXXX");  /* template suffix that mkstemp rewrites */
+    char *tmp = ctx_buf_take(&tmpl);
+    int fd = mkstemp(tmp);
+    if (fd < 0) {
+        ctx_set_err(err, err_len, "%s", strerror(errno));
+        free(tmp);
+        return false;
+    }
+    FILE *fp = fdopen(fd, "wb");
+    if (!fp) {
+        ctx_set_err(err, err_len, "%s", strerror(errno));
+        close(fd);  /* fdopen failed, so the descriptor is still ours to close */
+        unlink(tmp);
+        free(tmp);
+        return false;
+    }
+    size_t len = strlen(text ? text : "");  /* NULL text is written as an empty file */
+    errno = 0;  /* cleared so saved_errno is zero unless a call below sets it */
+    bool ok = fwrite(text ? text : "", 1, len, fp) == len && fflush(fp) == 0;
+    int saved_errno = errno;  /* captured before fclose can overwrite it */
+    if (fclose(fp) != 0) {
+        if (!saved_errno) saved_errno = errno;  /* keep the earliest error */
+        ok = false;
+    }
+    if (ok && rename(tmp, path) != 0) {  /* only a fully written file replaces path */
+        saved_errno = errno;
+        ok = false;
+    }
+    if (!ok) {
+        ctx_set_err(err, err_len, "%s",
+                    saved_errno ? strerror(saved_errno) : "write failed");
+        unlink(tmp);  /* drop the temp file on any failure */
+    }
+    free(tmp);
+    return ok;
+}
+
+bool ds4_agent_context_write_meta(const ds4_agent_context_meta *m,
+                                  const char *meta_path,
+                                  char *err, size_t err_len) {
+    ds4_agent_context_buf b = {0};
+    char num[80];
+    ctx_buf_puts(&b, "{\n");
+    ctx_buf_puts(&b, "  \"id\": \"");
+    ctx_json_escape(&b, m->id);
+    ctx_buf_puts(&b, "\",\n  \"label\": \"");
+    ctx_json_escape(&b, m->label ? m->label : "");
+    ctx_buf_puts(&b, "\",\n");
+    snprintf(num, sizeof(num), "  \"created_at\": %" PRIu64 ",\n", m->created_at);
+    ctx_buf_puts(&b, num);
+    snprintf(num, sizeof(num), "  \"world_epoch\": %" PRIu64 ",\n", m->world_epoch);
+    ctx_buf_puts(&b, num);
+    snprintf(num, sizeof(num), "  \"transcript_tokens\": %d,\n", m->transcript_tokens);
+    ctx_buf_puts(&b, num);
+    ctx_buf_puts(&b, "  \"kv_path\": \"");
+    ctx_json_escape(&b, m->kv_file ? m->kv_file : "");
+    ctx_buf_puts(&b, "\",\n  \"memory_path\": \"");
+    ctx_json_escape(&b, m->memory_file ? m->memory_file : "");
+    ctx_buf_puts(&b, "\",\n  \"memory_sha1\": null\n}\n");
+    char *text = ctx_buf_take(&b);
+    bool ok = ctx_write_atomic_text(meta_path, text, err, err_len);
+    free(text);
+    return ok;
+}
+
+/* With p on an opening quote, return the position just past the matching
+ * closing quote, or the terminating NUL when the string never closes.
+ * A backslash makes the following byte part of the string, so an escaped
+ * quote does not end it. Input not starting with '"' is returned unchanged. */
+static const char *ctx_json_skip_string(const char *p) {
+    if (p == NULL || *p != '"') return p;
+    for (p++; *p != '\0'; p++) {
+        if (*p == '"') return p + 1;
+        /* Land on the escaped byte; the loop increment then moves past it. */
+        if (*p == '\\' && p[1] != '\0') p++;
+    }
+    return p;
+}
+
+static bool ctx_json_key_matches(const char *start, const char *end,
+                                 const char *key) {
+    const char *p = start;
+    const char *k = key;
+    while (p < end) {
+        char c = *p++;
+        if (c == '\\' && p < end) c = *p++;
+        if (*k != c) return false;
+        k++;
+    }
+    return *k == '\0';
+}
+
+static const char *ctx_json_find_value(const char *json, const char *key) {
+    const char *p = json;
+    while (p && *p) {
+        if (*p != '"') {
+            p++;
+            continue;
+        }
+        const char *start = p + 1;
+        const char *after = ctx_json_skip_string(p);
+        if (!after || after == p || after[-1] != '"') return NULL;
+        const char *end = after - 1;
+        const char *q = after;
+        while (*q && isspace((unsigned char)*q)) q++;
+        if (*q == ':' && ctx_json_key_matches(start, end, key)) {
+            q++;
+            while (*q && isspace((unsigned char)*q)) q++;
+            return q;
+        }
+        p = after;
+    }
+    return NULL;
+}
+
+/* Copy the JSON string stored under key into a new heap string in *out.
+ * Decodes \n \r \t \b \f and the \u0001..\u007f escapes (the writer emits
+ * \u00XX for control bytes); any other escaped byte is kept literally.
+ * False if the key is absent or the value is not a terminated string. */
+static bool ctx_json_get_string(const char *json, const char *key, char **out) {
+    const char *p = ctx_json_find_value(json, key);
+    if (!p || *p != '"') return false;
+    ds4_agent_context_buf b = {0};
+    for (p++; *p && *p != '"'; p++) {
+        char c = *p;
+        if (c == '\\') {
+            if (!(c = *++p)) break;  /* dangling backslash at end of input */
+            if (c == 'n') c = '\n';
+            else if (c == 'r') c = '\r';
+            else if (c == 't') c = '\t';
+            else if (c == 'b') c = '\b';
+            else if (c == 'f') c = '\f';
+            else if (c == 'u' && !strncmp(p + 1, "00", 2) &&
+                     isxdigit((unsigned char)p[3]) && isxdigit((unsigned char)p[4])) {
+                const char hex[3] = {p[3], p[4], '\0'};
+                unsigned long cp = strtoul(hex, NULL, 16);
+                if (cp >= 1 && cp <= 0x7f) { c = (char)cp; p += 4; }
+            }
+        }
+        ctx_buf_append(&b, &c, 1);
+    }
+    if (*p != '"') { free(b.ptr); return false; }
+    *out = ctx_buf_take(&b);
+    return true;
+}
+
+static bool ctx_json_get_u64(const char *json, const char *key, uint64_t *out) {
+    const char *p = ctx_json_find_value(json, key);
+    if (!p || !isdigit((unsigned char)*p)) return false;  /* absent or not an unsigned number */
+    errno = 0;  /* strtoull reports overflow only by setting ERANGE */
+    unsigned long long v = strtoull(p, NULL, 10);  /* *p is a digit, so a value is always parsed */
+    if (errno == ERANGE) return false;  /* clamped to ULLONG_MAX: reject, leave *out alone */
+    *out = (uint64_t)v;
+    return true;
+}
+
+static bool ctx_json_get_int(const char *json, const char *key, int *out) {
+    uint64_t v = 0;
+    if (!ctx_json_get_u64(json, key, &v) || v > (uint64_t)INT_MAX) return false;
+    *out = (int)v;
+    return true;
+}
+
+bool ds4_agent_context_read_meta_file(const char *path,
+                                      ds4_agent_context_meta *m,
+                                      char *err, size_t err_len) {
+    char *json = NULL;
+    size_t len = 0;
+    if (ctx_read_file_bytes(path, &json, &len, err, err_len) != 0) return false;
+    (void)len;
+    memset(m, 0, sizeof(*m));
+    char *id = NULL;
+    bool ok = ctx_json_get_string(json, "id", &id);
+    if (ok) {
+        if (strlen(id) < sizeof(m->id)) snprintf(m->id, sizeof(m->id), "%s", id);
+        else ok = false;
+        free(id);
+        if (ok) ok = ds4_agent_context_id_valid(m->id);
+    }
+    if (ok && !ctx_json_get_string(json, "label", &m->label))
+        m->label = ctx_xstrdup("");
+    if (ok && !ctx_json_get_u64(json, "created_at", &m->created_at))
+        ok = false;
+    if (ok && !ctx_json_get_u64(json, "world_epoch", &m->world_epoch))
+        ok = false;
+    if (ok && !ctx_json_get_int(json, "transcript_tokens", &m->transcript_tokens))
+        ok = false;
+    if (ok && !ctx_json_get_string(json, "kv_path", &m->kv_file))
+        m->kv_file = ds4_agent_context_file_name(m->id, ".kv");
+    if (ok && !ctx_json_get_string(json, "memory_path", &m->memory_file))
+        m->memory_file = ds4_agent_context_file_name(m->id, ".memory.md");
+    if (ok && !ds4_agent_context_file_component_safe(m->kv_file)) ok = false;
+    if (ok && m->memory_file && m->memory_file[0] &&
+        !ds4_agent_context_file_component_safe(m->memory_file)) ok = false;
+    if (!ok) {
+        ctx_set_err(err, err_len, "invalid context metadata: %s", path);
+        ds4_agent_context_meta_free(m);
+    }
+    free(json);
+    return ok;
+}
+
+/* True when name is longer than, and ends with, ".meta.json". */
+bool ds4_agent_context_meta_filename(const char *name) {
+    static const char ext[] = ".meta.json";
+    const size_t ext_len = sizeof(ext) - 1, name_len = strlen(name);
+    return name_len > ext_len && strcmp(name + (name_len - ext_len), ext) == 0;
+}
+
+/* Count directory entries named like checkpoint metadata; 0 when the
+ * directory cannot be opened. */
+int ds4_agent_context_count_checkpoints(const char *context_dir) {
+    DIR *dir = opendir(context_dir);
+    if (dir == NULL) return 0;
+    int total = 0;
+    for (struct dirent *ent = readdir(dir); ent; ent = readdir(dir))
+        total += ds4_agent_context_meta_filename(ent->d_name) ? 1 : 0;
+    closedir(dir);
+    return total;
+}
+
+uint64_t ds4_agent_context_max_world_epoch(const char *context_dir) {
+    DIR *d = opendir(context_dir);
+    if (!d) return 0;
+    uint64_t max_epoch = 0;
+    struct dirent *de;
+    while ((de = readdir(d)) != NULL) {
+        if (!ds4_agent_context_meta_filename(de->d_name)) continue;
+        char *meta_path = ds4_agent_context_path_for_file(context_dir, de->d_name);
+        ds4_agent_context_meta m = {0};
+        char err[160] = {0};
+        if (ds4_agent_context_read_meta_file(meta_path, &m, err, sizeof(err)) &&
+            m.world_epoch > max_epoch)
+            max_epoch = m.world_epoch;
+        ds4_agent_context_meta_free(&m);
+        free(meta_path);
+    }
+    closedir(d);
+    return max_epoch;
+}
+
+char *ds4_agent_context_full_kv_path(const char *context_dir,
+                                     const ds4_agent_context_meta *m) {
+    return ds4_agent_context_path_for_file(context_dir, m->kv_file);
+}
+
+char *ds4_agent_context_full_memory_path(const char *context_dir,
+                                         const ds4_agent_context_meta *m) {
+    if (!m->memory_file || !m->memory_file[0]) return NULL;
+    return ds4_agent_context_path_for_file(context_dir, m->memory_file);
+}
+
+bool ds4_agent_context_find_checkpoint(const char *context_dir,
+                                       const char *prefix,
+                                       ds4_agent_context_meta *found,
+                                       char **meta_path_out,
+                                       char **kv_path_out,
+                                       char *err, size_t err_len) {
+    if (!prefix || !prefix[0]) {
+        ctx_set_err(err, err_len, "context id is required");
+        return false;
+    }
+    size_t prefix_len = strlen(prefix);
+    DIR *d = opendir(context_dir);
+    if (!d) {
+        ctx_set_err(err, err_len, "no context checkpoints found");
+        return false;
+    }
+    bool matched = false;
+    bool ambiguous = false;
+    ds4_agent_context_meta best = {0};
+    char *best_meta_path = NULL;
+    char *best_kv_path = NULL;
+    struct dirent *de;
+    while ((de = readdir(d)) != NULL) {
+        if (!ds4_agent_context_meta_filename(de->d_name)) continue;
+        char *meta_path = ds4_agent_context_path_for_file(context_dir, de->d_name);
+        ds4_agent_context_meta m = {0};
+        char parse_err[160] = {0};
+        if (!ds4_agent_context_read_meta_file(meta_path, &m, parse_err,
+                                              sizeof(parse_err))) {
+            free(meta_path);
+            continue;
+        }
+        if (!strncmp(m.id, prefix, prefix_len)) {
+            if (matched) {
+                ambiguous = true;
+                ds4_agent_context_meta_free(&m);
+                free(meta_path);
+                break;
+            }
+            matched = true;
+            best = m;
+            best_meta_path = meta_path;
+            best_kv_path = ds4_agent_context_full_kv_path(context_dir, &best);
+        } else {
+            ds4_agent_context_meta_free(&m);
+            free(meta_path);
+        }
+    }
+    closedir(d);
+    if (ambiguous) {
+        ds4_agent_context_meta_free(&best);
+        free(best_meta_path);
+        free(best_kv_path);
+        ctx_set_err(err, err_len, "context id prefix is ambiguous: %s", prefix);
+        return false;
+    }
+    if (!matched) {
+        ctx_set_err(err, err_len, "context checkpoint not found: %s", prefix);
+        return false;
+    }
+    *found = best;
+    if (meta_path_out) *meta_path_out = best_meta_path; else free(best_meta_path);
+    if (kv_path_out) *kv_path_out = best_kv_path; else free(best_kv_path);
+    return true;
+}
+
+/* Free every recorded side effect and reset the list's counters to zero.
+ * The ds4_agent_side_effects struct itself is not freed. NULL is a no-op. */
+void ds4_agent_side_effects_free(ds4_agent_side_effects *effects) {
+    if (!effects) return;
+    while (effects->head) {
+        ds4_agent_side_effect *dead = effects->head;
+        effects->head = dead->next;
+        free(dead->kind);
+        free(dead->detail);
+        free(dead);
+    }
+    effects->count = 0;
+    effects->evicted_count = 0;
+    effects->latest_evicted_epoch = 0;
+}
+
+uint64_t ds4_agent_side_effects_note(ds4_agent_side_effects *effects,
+                                     uint64_t current_epoch,
+                                     const char *kind,
+                                     const char *detail) {  /* record one side effect; returns the epoch assigned to it */
+    if (!effects) return current_epoch;  /* nothing recorded, epoch unchanged */
+    uint64_t next_epoch = current_epoch == UINT64_MAX ? current_epoch : current_epoch + 1;
+    ds4_agent_side_effect *e = ctx_xmalloc(sizeof(*e));
+    memset(e, 0, sizeof(*e));
+    e->epoch = next_epoch;  /* saturates at UINT64_MAX instead of wrapping */
+    e->kind = ctx_xstrdup(kind && kind[0] ? kind : "tool");
+    e->detail = ctx_xstrdup(detail && detail[0] ? detail : "");
+    e->next = effects->head;  /* push at the head, so the list runs newest to oldest */
+    effects->head = e;
+    effects->count++;
+
+    while (effects->count > DS4_AGENT_CONTEXT_MAX_SIDE_EFFECTS) {  /* trim from the tail while over the cap */
+        ds4_agent_side_effect **link = &effects->head;
+        while (*link && (*link)->next) link = &(*link)->next;  /* link ends on the pointer to the last node */
+        if (!*link) break;  /* empty list: nothing to evict */
+        ds4_agent_side_effect *old = *link;
+        *link = NULL;  /* unlink the last node */
+        effects->evicted_count++;
+        if (old->epoch > effects->latest_evicted_epoch)
+            effects->latest_evicted_epoch = old->epoch;  /* remember the highest epoch dropped so far */
+        free(old->kind);
+        free(old->detail);
+        free(old);
+        effects->count--;
+    }
+
+    return next_epoch;
+}
+
+/* Describe side effects with epoch > epoch in list order, at most eight
+ * entries; the "omitted" line appears only when a ninth match really exists. */
+char *ds4_agent_side_effects_summary_since(const ds4_agent_side_effects *effects,
+                                           uint64_t epoch) {
+    ds4_agent_context_buf b = {0};
+    if (effects && effects->latest_evicted_epoch > epoch) {
+        char line[256];
+        snprintf(line, sizeof(line), "Known side effects after checkpoint may be "
+                 "incomplete: %" PRIu64 " older side effect(s) were dropped from "
+                 "memory up to epoch=%" PRIu64 ".\n",
+                 effects->evicted_count, effects->latest_evicted_epoch);
+        ctx_buf_puts(&b, line);
+    }
+    int shown = 0;
+    for (const ds4_agent_side_effect *e = effects ? effects->head : NULL; e; e = e->next) {
+        if (e->epoch <= epoch) continue;
+        if (shown == 8) {  /* cap reached and this entry proves more remain */
+            ctx_buf_puts(&b, "- ... more side effects omitted ...\n");
+            break;
+        }
+        if (shown++ == 0) ctx_buf_puts(&b, "Known side effects after checkpoint:\n");
+        char *detail = ds4_agent_context_oneline(e->detail, 180);
+        char line[320];
+        snprintf(line, sizeof(line), "- epoch=%" PRIu64 " %s %s\n",
+                 e->epoch, e->kind ? e->kind : "tool", detail);
+        ctx_buf_puts(&b, line);
+        free(detail);
+    }
+    return ctx_buf_take(&b);
+}
+
+/* Allow a context action only when no bash jobs are running; otherwise
+ * write an explanatory message into err and return false. */
+bool ds4_agent_context_no_running_bash_guard(const char *action,
+                                             int running_bash_jobs,
+                                             char *err, size_t err_len) {
+    const char *what = (action && action[0]) ? action : "operation";
+    if (running_bash_jobs > 0)
+        ctx_set_err(err, err_len,
+                    "context %s denied because %d bash job(s) are still running; "
+                    "use bash_status or bash_stop first", what, running_bash_jobs);
+    return running_bash_jobs <= 0;
+}
+
+/* Permit a restore when the epochs match or a mismatch is explicitly allowed. */
+bool ds4_agent_context_restore_epoch_guard(uint64_t current_epoch,
+                                           uint64_t checkpoint_epoch,
+                                           bool allow_side_effect_mismatch,
+                                           char *err, size_t err_len) {
+    bool allowed = allow_side_effect_mismatch || current_epoch == checkpoint_epoch;
+    if (!allowed)
+        ctx_set_err(err, err_len,
+                    "restore would rewind model context from world_epoch=%" PRIu64
+                    " to %" PRIu64 ", but external side effects may still exist. "
+                    "Revert or inspect those effects, or call context restore with "
+                    "allow_side_effect_mismatch=true.",
+                    current_epoch, checkpoint_epoch);
+    return allowed;
+}
+
+char *ds4_agent_context_restore_expected_metrics_line(
+        const ds4_agent_context_restore_metrics *metrics) {
+    ds4_agent_context_buf b = {0};
+    char line[384];
+    int checkpoint_tokens = metrics ? metrics->checkpoint_tokens : 0;
+    int notice_tokens = metrics ? metrics->restore_notice_tokens : 0;
+    int restored_tokens = metrics ? metrics->restored_tokens : 0;
+    if (checkpoint_tokens < 0) checkpoint_tokens = 0;
+    if (notice_tokens < 0) notice_tokens = 0;
+    if (restored_tokens < 0) restored_tokens = 0;
+    snprintf(line, sizeof(line),
+             "KV restore expected metrics: checkpoint_tokens=%d expected_restore_notice_tokens=%d expected_restored_tokens=%d expected_prefill_suffix_tokens=%d expected_full_prefill_tokens_without_kv=%d expected_saved_prefill_tokens=%d.\n",
+             checkpoint_tokens, notice_tokens, restored_tokens,
+             notice_tokens, restored_tokens, checkpoint_tokens);
+    ctx_buf_puts(&b, line);
+    return ctx_buf_take(&b);
+}
diff --git a/ds4_agent_context.h b/ds4_agent_context.h
new file mode 100644
index 00000000..b36b31b1
--- /dev/null
+++ b/ds4_agent_context.h
@@ -0,0 +1,85 @@
+#ifndef DS4_AGENT_CONTEXT_H
+#define DS4_AGENT_CONTEXT_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct ds4_agent_context_meta {
+    char id[41];
+    char *label;
+    char *kv_file;
+    char *memory_file;
+    uint64_t created_at;
+    uint64_t world_epoch;
+    int transcript_tokens;
+} ds4_agent_context_meta;
+
+typedef struct ds4_agent_side_effect {
+    uint64_t epoch;
+    char *kind;
+    char *detail;
+    struct ds4_agent_side_effect *next;
+} ds4_agent_side_effect;
+
+typedef struct ds4_agent_side_effects {
+    ds4_agent_side_effect *head;
+    int count;
+    uint64_t evicted_count;
+    uint64_t latest_evicted_epoch;
+} ds4_agent_side_effects;
+
+typedef struct ds4_agent_context_restore_metrics {
+    int checkpoint_tokens;
+    int restore_notice_tokens;
+    int restored_tokens;
+} ds4_agent_context_restore_metrics;
+
+void ds4_agent_context_meta_free(ds4_agent_context_meta *m);
+bool ds4_agent_context_id_valid(const char *id);
+bool ds4_agent_context_file_component_safe(const char *s);
+char *ds4_agent_context_file_name(const char id[41], const char *suffix);
+char *ds4_agent_context_path_for_file(const char *context_dir, const char *file);
+char *ds4_agent_context_limited_strdup(const char *s, size_t max);
+char *ds4_agent_context_oneline(const char *s, size_t max);
+
+bool ds4_agent_context_write_meta(const ds4_agent_context_meta *m,
+                                  const char *meta_path,
+                                  char *err, size_t err_len);
+bool ds4_agent_context_read_meta_file(const char *path,
+                                      ds4_agent_context_meta *m,
+                                      char *err, size_t err_len);
+bool ds4_agent_context_meta_filename(const char *name);
+int ds4_agent_context_count_checkpoints(const char *context_dir);
+uint64_t ds4_agent_context_max_world_epoch(const char *context_dir);
+char *ds4_agent_context_full_kv_path(const char *context_dir,
+                                     const ds4_agent_context_meta *m);
+char *ds4_agent_context_full_memory_path(const char *context_dir,
+                                         const ds4_agent_context_meta *m);
+bool ds4_agent_context_find_checkpoint(const char *context_dir,
+                                       const char *prefix,
+                                       ds4_agent_context_meta *found,
+                                       char **meta_path_out,
+                                       char **kv_path_out,
+                                       char *err, size_t err_len);
+
+void ds4_agent_side_effects_free(ds4_agent_side_effects *effects);
+uint64_t ds4_agent_side_effects_note(ds4_agent_side_effects *effects,
+                                     uint64_t current_epoch,
+                                     const char *kind,
+                                     const char *detail);
+char *ds4_agent_side_effects_summary_since(const ds4_agent_side_effects *effects,
+                                           uint64_t epoch);
+bool ds4_agent_context_no_running_bash_guard(const char *action,
+                                             int running_bash_jobs,
+                                             char *err,
+                                             size_t err_len);
+bool ds4_agent_context_restore_epoch_guard(uint64_t current_epoch,
+                                           uint64_t checkpoint_epoch,
+                                           bool allow_side_effect_mismatch,
+                                           char *err,
+                                           size_t err_len);
+char *ds4_agent_context_restore_expected_metrics_line(
+        const ds4_agent_context_restore_metrics *metrics);
+
+#endif
diff --git a/tests/ds4_agent_context_compact_canary_e2e.sh b/tests/ds4_agent_context_compact_canary_e2e.sh
new file mode 100755
index 00000000..ee2c57f7
--- /dev/null
+++ b/tests/ds4_agent_context_compact_canary_e2e.sh
@@ -0,0 +1,195 @@
+#!/bin/sh
+set -eu
+
+ROOT=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
+BASE=${TMPDIR:-/tmp}
+RUN_DIR=$(mktemp -d "${BASE%/}/ds4-agent-compact-canary.XXXXXX")
+HOME_DIR="$RUN_DIR/home"
+WORK_DIR="$RUN_DIR/work"
+OUT="$RUN_DIR/output.txt"
+TRACE="$RUN_DIR/trace.txt"
+LEDGER="$WORK_DIR/ds4-compact-canary-ledger.md"
+PROMPT_FILE="$RUN_DIR/prompt.md"
+PROMPT_TMP="$RUN_DIR/prompt.tmp"
+PADDING_FILE="$RUN_DIR/padding.txt"
+REPORT_DIR="$ROOT/tests/generated"
+REPORT="$REPORT_DIR/ds4_agent_context_compact_canary_report.md"
+REPORT_PROMPT="$REPORT_DIR/ds4_agent_context_compact_canary_prompt.md"
+REPORT_OUTPUT="$REPORT_DIR/ds4_agent_context_compact_canary_output.txt"
+REPORT_TRACE="$REPORT_DIR/ds4_agent_context_compact_canary_trace.txt"
+REPORT_LEDGER="$REPORT_DIR/ds4_agent_context_compact_canary_ledger.md"
+
+print_report_file() { # Print $1 with code fences defused, or a placeholder if it is absent.
+    if [ ! -f "$1" ]; then
+        printf '(missing: %s)\n' "$1"
+    else
+        sed 's/```/` ` `/g' "$1"
+    fi
+}
+
+write_report() { # Copy this run's artifacts to $REPORT_DIR and render the combined report.
+    mkdir -p "$REPORT_DIR"
+    # Remove copies left by an earlier run: an artifact missing now must not look present.
+    rm -f "$REPORT_PROMPT" "$REPORT_OUTPUT" "$REPORT_TRACE" "$REPORT_LEDGER"
+    [ -f "$PROMPT_FILE" ] && cp "$PROMPT_FILE" "$REPORT_PROMPT"
+    [ -f "$OUT" ] && cp "$OUT" "$REPORT_OUTPUT"
+    [ -f "$TRACE" ] && cp "$TRACE" "$REPORT_TRACE"
+    [ -f "$LEDGER" ] && cp "$LEDGER" "$REPORT_LEDGER"
+    {
+        printf '# DS4 Agent Context Compact Canary Report\n\nprompt_file: `%s`\n\n' "$REPORT_PROMPT"
+        printf 'response_file: `%s`\n\ntrace_file: `%s`\n\n' "$REPORT_OUTPUT" "$REPORT_TRACE"
+        printf 'ledger_file: `%s`\n\n' "$REPORT_LEDGER"
+        printf 'run_dir: `%s`\n\n' "$RUN_DIR"
+        printf '## Prompt\n\n```text\n'
+        print_report_file "$PROMPT_FILE"
+        printf '\n```\n\n## DS4 Response\n\n```text\n'
+        print_report_file "$OUT"
+        printf '\n```\n\n## Trace\n\n```text\n'
+        print_report_file "$TRACE"
+        printf '\n```\n\n## Ledger\n\n```text\n'
+        print_report_file "$LEDGER"
+        printf '\n```\n'
+    } >"$REPORT"
+}
+
+cleanup() { # EXIT trap: refresh the report, then drop the scratch dir unless asked to keep it.
+    write_report >/dev/null 2>&1 || true
+    if [ "${DS4_KEEP_COMPACT_CANARY_TEST_DIR:-0}" = "1" ]; then
+        printf 'kept test directory: %s\n' "$RUN_DIR" >&2
+    else
+        rm -rf "$RUN_DIR"
+    fi
+}
+trap cleanup EXIT
+
+mkdir -p "$HOME_DIR" "$WORK_DIR"
+
+escape_sed() { # Escape \, & and | so $1 is taken literally in a sed 's|..|..|' replacement.
+    printf '%s' "$1" | sed 's/[\\&|]/\\&/g'
+}
+
+PADDING_LINES=${DS4_AGENT_COMPACT_CANARY_PADDING_LINES:-180}
+i=1
+while [ "$i" -le "$PADDING_LINES" ]; do
+    printf 'Padding line %03d: irrelevant build-note-%03d contains no canary values and should not be retained.\n' "$i" "$i"
+    i=$((i + 1))
+done >"$PADDING_FILE"  # one redirect: the file exists even when PADDING_LINES is 0
+
+ROOT_ESC=$(escape_sed "$ROOT")
+LEDGER_ESC=$(escape_sed "$LEDGER")
+sed \
+    -e "s|__ROOT__|$ROOT_ESC|g" \
+    -e "s|__LEDGER__|$LEDGER_ESC|g" \
+    "$ROOT/tests/ds4_agent_context_compact_canary_prompt.md" >"$PROMPT_TMP"
+
+while IFS= read -r line; do
+    if [ "$line" = "__PADDING__" ]; then
+        cat "$PADDING_FILE"
+    else
+        printf '%s\n' "$line"
+    fi
+done <"$PROMPT_TMP" >"$PROMPT_FILE"
+
+if ! HOME="$HOME_DIR" "$ROOT/ds4-agent" \
+    --model "$ROOT/ds4flash.gguf" \
+    --non-interactive \
+    --chdir "$ROOT" \
+    --ctx "${DS4_AGENT_COMPACT_CANARY_CTX:-8192}" \
+    --tokens "${DS4_AGENT_COMPACT_CANARY_TOKENS:-3500}" \
+    --temp 0 \
+    --seed 7 \
+    --nothink \
+    --trace "$TRACE" \
+    --prompt "$(cat "$PROMPT_FILE")" >"$OUT" 2>&1
+then
+    cat "$OUT" >&2
+    echo "ds4-agent compact canary run failed" >&2
+    exit 1
+fi
+
+if [ ! -f "$LEDGER" ]; then
+    cat "$OUT" >&2
+    echo "missing generated compact canary ledger: $LEDGER" >&2
+    exit 1
+fi
+
+COMPACT_LINE=$(grep 'compacted reason="canary-retention-test"' "$TRACE" | tail -n 1 || true)
+if [ -z "$COMPACT_LINE" ]; then
+    cat "$TRACE" >&2
+    echo "trace does not prove context compaction happened" >&2
+    exit 1
+fi
+
+COMPACT_OLD=$(printf '%s\n' "$COMPACT_LINE" | sed -n 's/.* old=\([0-9][0-9]*\) .*/\1/p')
+COMPACT_NEW=$(printf '%s\n' "$COMPACT_LINE" | sed -n 's/.* new=\([0-9][0-9]*\) .*/\1/p')
+COMPACT_TAIL_START=$(printf '%s\n' "$COMPACT_LINE" | sed -n 's/.* tail_start=\([0-9][0-9]*\) .*/\1/p')
+
+if [ -z "$COMPACT_OLD" ] || [ -z "$COMPACT_NEW" ] || [ -z "$COMPACT_TAIL_START" ]; then
+    cat "$TRACE" >&2
+    echo "trace compact line is missing old/new/tail_start metrics" >&2
+    exit 1
+fi
+
+if [ "$COMPACT_NEW" -ge "$COMPACT_OLD" ]; then
+    cat "$TRACE" >&2
+    echo "context compaction did not reduce token count" >&2
+    exit 1
+fi
+
+if [ "$COMPACT_TAIL_START" -lt "${DS4_AGENT_COMPACT_CANARY_MIN_TAIL_START:-3000}" ]; then
+    cat "$TRACE" >&2
+    echo "recent tail started too early to make this a useful retention canary" >&2
+    exit 1
+fi
+
+grep -q "compaction=done" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger is missing compaction=done" >&2
+    exit 1
+}
+
+grep -q "canary_alpha=ORCHID-47" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger lost canary_alpha" >&2
+    exit 1
+}
+
+grep -q "canary_beta=FJORD-932" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger lost canary_beta" >&2
+    exit 1
+}
+
+grep -q "canary_gamma=LEMMA-18" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger lost canary_gamma" >&2
+    exit 1
+}
+
+grep -q "canary_delta=RUNE-604" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger lost canary_delta" >&2
+    exit 1
+}
+
+grep -q "canary_epsilon=VECTOR-251" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger lost canary_epsilon" >&2
+    exit 1
+}
+
+grep -q "final=COMPACT_CANARY_DONE" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger is missing final marker" >&2
+    exit 1
+}
+
+grep -q "COMPACT_CANARY_DONE" "$OUT" || {
+    cat "$OUT" >&2
+    echo "DS4 did not finish compact canary scenario" >&2
+    exit 1
+}
+
+write_report
+
+printf 'ds4 agent context compact canary e2e: ok\n'
diff --git a/tests/ds4_agent_context_compact_canary_prompt.md b/tests/ds4_agent_context_compact_canary_prompt.md
new file mode 100644
index 00000000..503af7d5
--- /dev/null
+++ b/tests/ds4_agent_context_compact_canary_prompt.md
@@ -0,0 +1,46 @@
+You are validating whether DS4 context compaction preserves task-critical facts.
+
+Do not explain the plan in prose. Use DSML tools.
+
+Repository root: __ROOT__
+Ledger path: __LEDGER__
+
+The following five facts are task-critical canaries. They are the only source
+of the values. Preserve them through compaction and use them after compaction:
+
+canary_alpha=ORCHID-47
+canary_beta=FJORD-932
+canary_gamma=LEMMA-18
+canary_delta=RUNE-604
+canary_epsilon=VECTOR-251
+
+The next block is intentionally irrelevant padding. Do not copy it to the
+ledger. Its only purpose is to push the canary values out of the recent verbatim
+tail, so the compacted summary must preserve them.
+
+__PADDING__
+
+Now execute exactly this sequence:
+
+1. Use the context tool with action=compact and reason=canary-retention-test.
+   This context compact call must be the only tool call in its DSML block.
+
+2. After the compact tool result, use the write tool to create the ledger file
+   at this exact path:
+
+__LEDGER__
+
+   The ledger must contain these exact field names and the exact canary values
+   from above:
+
+# DS4 Compact Canary Ledger
+compaction=done
+canary_alpha=<value from above>
+canary_beta=<value from above>
+canary_gamma=<value from above>
+canary_delta=<value from above>
+canary_epsilon=<value from above>
+final=COMPACT_CANARY_DONE
+
+3. After the write tool result, answer exactly:
+COMPACT_CANARY_DONE
diff --git a/tests/ds4_agent_context_loop_e2e.sh b/tests/ds4_agent_context_loop_e2e.sh
new file mode 100644
index 00000000..bf382602
--- /dev/null
+++ b/tests/ds4_agent_context_loop_e2e.sh
@@ -0,0 +1,176 @@
+#!/bin/sh
+set -eu
+
+ROOT=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
+BASE=${TMPDIR:-/tmp}
+RUN_DIR=$(mktemp -d "${BASE%/}/ds4-agent-context-loop.XXXXXX")
+HOME_DIR="$RUN_DIR/home"
+WORK_DIR="$RUN_DIR/work"
+OUT="$RUN_DIR/output.txt"
+LEDGER="$WORK_DIR/ds4-generated-loop.md"
+PROMPT_FILE="$RUN_DIR/prompt.md"
+REPORT_DIR="$ROOT/tests/generated"
+REPORT="$REPORT_DIR/ds4_agent_context_loop_report.md"
+REPORT_PROMPT="$REPORT_DIR/ds4_agent_context_loop_prompt.md"
+REPORT_OUTPUT="$REPORT_DIR/ds4_agent_context_loop_output.txt"
+REPORT_LEDGER="$REPORT_DIR/ds4_agent_context_loop_ledger.md"
+
+print_report_file() { # Print $1 with code fences defused, or a placeholder if it is absent.
+    if [ ! -f "$1" ]; then
+        printf '(missing: %s)\n' "$1"
+    else
+        sed 's/```/` ` `/g' "$1"
+    fi
+}
+
+write_report() { # Copy this run's artifacts to $REPORT_DIR and render the combined report.
+    mkdir -p "$REPORT_DIR"
+    # Remove copies left by an earlier run: an artifact missing now must not look present.
+    rm -f "$REPORT_PROMPT" "$REPORT_OUTPUT" "$REPORT_LEDGER"
+    if [ -f "$PROMPT_FILE" ]; then
+        cp "$PROMPT_FILE" "$REPORT_PROMPT"
+    fi
+    if [ -f "$OUT" ]; then
+        cp "$OUT" "$REPORT_OUTPUT"
+    fi
+    if [ -f "$LEDGER" ]; then
+        cp "$LEDGER" "$REPORT_LEDGER"
+    fi
+    {
+        printf '# DS4 Agent Context Loop Report\n\nprompt_file: `%s`\n\n' "$REPORT_PROMPT"
+        printf 'response_file: `%s`\n\nledger_file: `%s`\n\n' "$REPORT_OUTPUT" "$REPORT_LEDGER"
+        printf 'run_dir: `%s`\n\n' "$RUN_DIR"
+        printf '## Prompt\n\n```text\n'
+        print_report_file "$PROMPT_FILE"
+        printf '\n```\n\n## DS4 Response\n\n```text\n'
+        print_report_file "$OUT"
+        printf '\n```\n\n## Ledger\n\n```text\n'
+        print_report_file "$LEDGER"
+        printf '\n```\n'
+    } >"$REPORT"
+}
+
+cleanup() { # EXIT trap: refresh the report, then drop the scratch dir unless asked to keep it.
+    write_report >/dev/null 2>&1 || true
+    if [ "${DS4_KEEP_LOOP_TEST_DIR:-0}" = "1" ]; then
+        printf 'kept test directory: %s\n' "$RUN_DIR" >&2
+    else
+        rm -rf "$RUN_DIR"
+    fi
+}
+trap cleanup EXIT
+
+mkdir -p "$HOME_DIR" "$WORK_DIR"
+
+escape_sed() { # Escape \, & and | so $1 is taken literally in a sed 's|..|..|' replacement.
+    printf '%s' "$1" | sed 's/[\\&|]/\\&/g'
+}
+
+ROOT_ESC=$(escape_sed "$ROOT")
+LEDGER_ESC=$(escape_sed "$LEDGER")
+PROMPT=$(sed \
+    -e "s|__ROOT__|$ROOT_ESC|g" \
+    -e "s|__LEDGER__|$LEDGER_ESC|g" \
+    "$ROOT/tests/ds4_agent_context_loop_prompt.md")
+printf '%s\n' "$PROMPT" >"$PROMPT_FILE"
+
+if ! HOME="$HOME_DIR" "$ROOT/ds4-agent" \
+    --model "$ROOT/ds4flash.gguf" \
+    --non-interactive \
+    --chdir "$ROOT" \
+    --ctx "${DS4_AGENT_LOOP_CTX:-8192}" \
+    --tokens "${DS4_AGENT_LOOP_TOKENS:-2500}" \
+    --temp 0 \
+    --seed 1 \
+    --nothink \
+    --prompt "$PROMPT" >"$OUT" 2>&1
+then
+    cat "$OUT" >&2
+    echo "ds4-agent loop run failed" >&2
+    exit 1
+fi
+
+if [ ! -f "$LEDGER" ]; then
+    cat "$OUT" >&2
+    echo "missing generated ledger: $LEDGER" >&2
+    exit 1
+fi
+
+grep -q "loop_limit=2" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger is missing loop_limit=2" >&2
+    exit 1
+}
+
+grep -Fq "ds4_prompt=validate DS4's own agent context loop capability" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger is missing the compact DS4 prompt" >&2
+    exit 1
+}
+
+grep -q "ds4_response=LOOP_DONE" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger is missing the final DS4 response" >&2
+    exit 1
+}
+
+grep -q "attempt=1 status=pass" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    cat "$OUT" >&2
+    echo "DS4 did not mark the measured attempt as pass" >&2
+    exit 1
+}
+
+grep -q "attempt=1 metric=ds4_agent_context_test passed" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger is missing the expected metric" >&2
+    exit 1
+}
+
+grep -q "LOOP_DONE" "$OUT" || {
+    cat "$OUT" >&2
+    echo "DS4 did not finish the loop" >&2
+    exit 1
+}
+
+CONTEXT_DIR="$HOME_DIR/.ds4/kvcache/context"
+if [ ! -d "$CONTEXT_DIR" ]; then
+    cat "$OUT" >&2
+    echo "missing context checkpoint directory: $CONTEXT_DIR" >&2
+    exit 1
+fi
+
+if ! grep -R "ds4-generated-loop-after-pass" "$CONTEXT_DIR" >/dev/null 2>&1; then
+    find "$CONTEXT_DIR" -maxdepth 1 -type f -print >&2
+    cat "$OUT" >&2
+    echo "missing DS4-generated context checkpoint label" >&2
+    exit 1
+fi
+
+write_report
+
+grep -q "^## Prompt" "$REPORT" || {
+    cat "$REPORT" >&2
+    echo "report is missing the prompt section" >&2
+    exit 1
+}
+
+grep -q "^## DS4 Response" "$REPORT" || {
+    cat "$REPORT" >&2
+    echo "report is missing the DS4 response section" >&2
+    exit 1
+}
+
+grep -q "^## Ledger" "$REPORT" || {
+    cat "$REPORT" >&2
+    echo "report is missing the ledger section" >&2
+    exit 1
+}
+
+grep -q "LOOP_DONE" "$REPORT" || {
+    cat "$REPORT" >&2
+    echo "report is missing the final DS4 response" >&2
+    exit 1
+}
+
+printf 'ds4 agent context loop e2e: ok\n'
diff --git a/tests/ds4_agent_context_loop_prompt.md b/tests/ds4_agent_context_loop_prompt.md
new file mode 100644
index 00000000..41dee3b1
--- /dev/null
+++ b/tests/ds4_agent_context_loop_prompt.md
@@ -0,0 +1,44 @@
+You are validating DS4's own agent context loop capability.
+
+Do not explain the plan in prose. Use DSML tools to execute exactly this loop.
+
+Repository root: __ROOT__
+Ledger path: __LEDGER__
+
+Loop:
+1. Use the write tool to create __LEDGER__ with this exact starting content:
+
+# DS4 Generated Context Loop
+loop_limit=2
+goal=validate DS4 agent context loop against DS4's context helper test
+baseline=before-ds4-agent-context-test
+ds4_prompt=validate DS4's own agent context loop capability
+ds4_response=pending
+attempt=1 hypothesis=the context helper test passes
+attempt=1 command=cd __ROOT__ && ./tests/ds4_agent_context_test
+attempt=1 status=pending
+attempt=1 metric=pending
+
+2. Use the bash tool to run exactly:
+cd __ROOT__ && ./tests/ds4_agent_context_test
+
+3. If the bash result reports success, use the edit tool to replace:
+ds4_response=pending
+attempt=1 status=pending
+attempt=1 metric=pending
+
+with:
+ds4_response=LOOP_DONE
+attempt=1 status=pass
+attempt=1 metric=ds4_agent_context_test passed
+
+If the bash result reports failure, replace the same status and metric lines
+with status=fail and the observed failure summary, and replace
+ds4_response=pending with the final response you will return.
+
+4. If the attempt passed, use the context tool with action=checkpoint and label
+ds4-generated-loop-after-pass. This context checkpoint call must be the only
+tool call in its DSML block.
+
+5. After the checkpoint tool result, answer exactly:
+LOOP_DONE
diff --git a/tests/ds4_agent_context_self_improvement_e2e.sh b/tests/ds4_agent_context_self_improvement_e2e.sh
new file mode 100755
index 00000000..cd99b6aa
--- /dev/null
+++ b/tests/ds4_agent_context_self_improvement_e2e.sh
@@ -0,0 +1,241 @@
+#!/bin/sh
+set -eu
+
+ROOT=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
+BASE=${TMPDIR:-/tmp}
+RUN_DIR=$(mktemp -d "${BASE%/}/ds4-agent-context-self-improvement.XXXXXX")
+HOME_DIR="$RUN_DIR/home"
+REPO="$RUN_DIR/repo"
+OUT="$RUN_DIR/output.txt"
+TRACE="$RUN_DIR/trace.txt"
+LEDGER="$RUN_DIR/ds4-context-self-improvement-ledger.md"
+PROMPT_FILE="$RUN_DIR/prompt.md"
+REPORT_DIR="$ROOT/tests/generated"
+REPORT="$REPORT_DIR/ds4_agent_context_self_improvement_report.md"
+REPORT_PROMPT="$REPORT_DIR/ds4_agent_context_self_improvement_prompt.md"
+REPORT_OUTPUT="$REPORT_DIR/ds4_agent_context_self_improvement_output.txt"
+REPORT_TRACE="$REPORT_DIR/ds4_agent_context_self_improvement_trace.txt"
+REPORT_LEDGER="$REPORT_DIR/ds4_agent_context_self_improvement_ledger.md"
+
+print_report_file() { # Print $1 with code fences defused, or a placeholder if it is absent.
+    if [ ! -f "$1" ]; then
+        printf '(missing: %s)\n' "$1"
+    else
+        sed 's/```/` ` `/g' "$1"
+    fi
+}
+
+write_report() { # Copy this run's artifacts to $REPORT_DIR and render the combined report.
+    mkdir -p "$REPORT_DIR"
+    # Remove copies left by an earlier run: an artifact missing now must not look present.
+    rm -f "$REPORT_PROMPT" "$REPORT_OUTPUT" "$REPORT_TRACE" "$REPORT_LEDGER"
+    [ -f "$PROMPT_FILE" ] && cp "$PROMPT_FILE" "$REPORT_PROMPT"
+    [ -f "$OUT" ] && cp "$OUT" "$REPORT_OUTPUT"
+    [ -f "$TRACE" ] && cp "$TRACE" "$REPORT_TRACE"
+    [ -f "$LEDGER" ] && cp "$LEDGER" "$REPORT_LEDGER"
+    {
+        printf '# DS4 Agent Context Self Improvement Report\n\nprompt_file: `%s`\n\n' "$REPORT_PROMPT"
+        printf 'response_file: `%s`\n\ntrace_file: `%s`\n\n' "$REPORT_OUTPUT" "$REPORT_TRACE"
+        printf 'ledger_file: `%s`\n\n' "$REPORT_LEDGER"
+        printf 'run_dir: `%s`\n\n' "$RUN_DIR"
+        printf 'repo: `%s`\n\n' "$REPO"
+        printf '## Prompt\n\n```text\n'
+        print_report_file "$PROMPT_FILE"
+        printf '\n```\n\n## DS4 Response\n\n```text\n'
+        print_report_file "$OUT"
+        printf '\n```\n\n## Trace\n\n```text\n'
+        print_report_file "$TRACE"
+        printf '\n```\n\n## Ledger\n\n```text\n'
+        print_report_file "$LEDGER"
+        printf '\n```\n\n## Final toy_math.py\n\n```python\n'
+        print_report_file "$REPO/toy_math.py"
+        printf '\n```\n'
+    } >"$REPORT"
+}
+
+cleanup() { # EXIT trap: refresh the report, then drop the scratch dir unless asked to keep it.
+    write_report >/dev/null 2>&1 || true
+    if [ "${DS4_KEEP_CONTEXT_SELF_IMPROVEMENT_TEST_DIR:-0}" = "1" ]; then
+        printf 'kept test directory: %s\n' "$RUN_DIR" >&2
+    else
+        rm -rf "$RUN_DIR"
+    fi
+}
+trap cleanup EXIT
+
+mkdir -p "$HOME_DIR" "$REPO"
+
+cat >"$REPO/toy_math.py" <<'PY'
+def normalize_score(value, maximum):
+    """Return value as a score in the inclusive range 0.0..1.0."""
+    if maximum == 0:
+        return 0.0
+    return value / maximum
+PY
+
+cat >"$REPO/test_toy_math.py" <<'PY'
+from toy_math import normalize_score
+
+
+def check(name, got, expected):
+    if got != expected:
+        raise SystemExit(f"{name}: got {got!r}, expected {expected!r}")
+
+
+check("normal", normalize_score(3, 6), 0.5)
+check("zero maximum", normalize_score(3, 0), 0.0)
+check("negative maximum", normalize_score(3, -1), 0.0)
+check("lower clamp", normalize_score(-2, 10), 0.0)
+check("upper clamp", normalize_score(12, 10), 1.0)
+print("toy_math tests passed")
+PY
+
+git -C "$REPO" init -q
+git -C "$REPO" config user.email ds4-agent-test@example.invalid
+git -C "$REPO" config user.name "DS4 Agent Test"
+git -C "$REPO" add toy_math.py test_toy_math.py
+git -C "$REPO" commit -q -m "initial broken toy math"
+
+escape_sed() { # Escape \, & and | so $1 is taken literally in a sed 's|..|..|' replacement.
+    printf '%s' "$1" | sed 's/[\\&|]/\\&/g'
+}
+
+REPO_ESC=$(escape_sed "$REPO")
+LEDGER_ESC=$(escape_sed "$LEDGER")
+sed \
+    -e "s|__REPO__|$REPO_ESC|g" \
+    -e "s|__LEDGER__|$LEDGER_ESC|g" \
+    "$ROOT/tests/ds4_agent_context_self_improvement_prompt.md" >"$PROMPT_FILE"
+
+if ! HOME="$HOME_DIR" "$ROOT/ds4-agent" \
+    --model "$ROOT/ds4flash.gguf" \
+    --non-interactive \
+    --chdir "$ROOT" \
+    --ctx "${DS4_AGENT_CONTEXT_SELF_IMPROVEMENT_CTX:-8192}" \
+    --tokens "${DS4_AGENT_CONTEXT_SELF_IMPROVEMENT_TOKENS:-4500}" \
+    --temp 0 \
+    --seed 11 \
+    --nothink \
+    --trace "$TRACE" \
+    --prompt "$(cat "$PROMPT_FILE")" >"$OUT" 2>&1
+then
+    cat "$OUT" >&2
+    echo "ds4-agent context self-improvement run failed" >&2
+    exit 1
+fi
+
+if [ ! -f "$LEDGER" ]; then
+    cat "$OUT" >&2
+    echo "missing generated context self-improvement ledger: $LEDGER" >&2
+    exit 1
+fi
+
+python3 "$REPO/test_toy_math.py" >/dev/null || { cat "$OUT" >&2; echo "toy_math tests still fail after the DS4 run" >&2; exit 1; }  # harness re-check, independent of the ledger
+
+grep -q "git_status_checked=yes" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger does not report git status check" >&2
+    exit 1
+}
+
+grep -Eq "git_status_mode=(native|bash)" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger does not report a valid git status mode" >&2
+    exit 1
+}
+
+grep -q "git_diff_checked=yes" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger does not report git diff check" >&2
+    exit 1
+}
+
+grep -Eq "git_diff_mode=(native|bash)" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger does not report a valid git diff mode" >&2
+    exit 1
+}
+
+grep -q "context_checkpoint_before=yes" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger does not report pre-fix checkpoint" >&2
+    exit 1
+}
+
+grep -q "context_checkpoint_after=yes" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger does not report post-fix checkpoint" >&2
+    exit 1
+}
+
+grep -q "context_restore_used=yes" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger does not report restore usage" >&2
+    exit 1
+}
+
+grep -q "tests_before_restore=pass" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger does not report pre-restore passing tests" >&2
+    exit 1
+}
+
+grep -q "tests_after_restore=pass" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger does not report post-restore passing tests" >&2
+    exit 1
+}
+
+grep -q "final=CONTEXT_SELF_IMPROVEMENT_DONE" "$LEDGER" || {
+    cat "$LEDGER" >&2
+    echo "ledger is missing final marker" >&2
+    exit 1
+}
+
+grep -q "CONTEXT_SELF_IMPROVEMENT_DONE" "$OUT" || {
+    cat "$OUT" >&2
+    echo "DS4 did not finish context self-improvement scenario" >&2
+    exit 1
+}
+
+if grep -Eq "git[[:space:]]+action=status" "$OUT"; then
+    :
+elif grep -Fq "git status --short" "$OUT"; then
+    :
+else
+    cat "$OUT" >&2
+    echo "output does not prove git status was checked" >&2
+    exit 1
+fi
+
+if grep -Eq "git[[:space:]]+action=diff" "$OUT"; then
+    :
+elif grep -Fq "git diff -- toy_math.py" "$OUT"; then
+    :
+else
+    cat "$OUT" >&2
+    echo "output does not prove git diff was checked" >&2
+    exit 1
+fi
+
+grep -Eq "context[[:space:]]+action=checkpoint" "$OUT" || {
+    cat "$OUT" >&2
+    echo "output does not prove context checkpoint was used" >&2
+    exit 1
+}
+
+grep -Eq "context[[:space:]]+action=restore|Context restored from checkpoint" "$OUT" || {
+    cat "$OUT" >&2
+    echo "output does not prove context restore was used" >&2
+    exit 1
+}
+
+if git -C "$REPO" diff --quiet -- toy_math.py; then
+    git -C "$REPO" diff -- toy_math.py >&2
+    echo "final patch did not modify toy_math.py" >&2
+    exit 1
+fi
+
+write_report
+
+printf 'ds4 agent context self-improvement e2e: ok\n'
diff --git a/tests/ds4_agent_context_self_improvement_prompt.md b/tests/ds4_agent_context_self_improvement_prompt.md
new file mode 100644
index 00000000..f5476d83
--- /dev/null
+++ b/tests/ds4_agent_context_self_improvement_prompt.md
@@ -0,0 +1,93 @@
+You are validating DS4's own KV-backed self-improvement loop.
+
+Do not explain the plan in prose. Use DSML tools.
+
+Repository root: __REPO__
+Ledger path: __LEDGER__
+
+Task:
+Fix the small Python project in the repository so its test suite passes. The
+bug is intentionally simple and local to the repository.
+
+Use absolute file paths under the repository root for read, edit, write, and
+bash commands.
+
+Repository inspection mode:
+
+- If the available tool schemas include a native git DSML tool, prefer that tool
+  for repository status and diff inspection.
+- If the native git DSML tool is not available, use the bash tool to run git
+  commands in the temporary repository.
+
+Required sequence:
+
+1. Use the context tool with action=checkpoint and label
+   context-self-improvement-before. This context checkpoint call must be the
+   only tool call in its DSML block.
+
+2. Inspect repository status.
+
+If native git DSML is available, use the git tool with action=status and repo
+set to the repository root.
+
+Otherwise, use the bash tool to run exactly:
+
+cd __REPO__ && git status --short
+
+3. Use read/edit/write/bash tools as needed to inspect, fix, and test the
+   project. Run exactly this test command with the bash tool:
+
+cd __REPO__ && python3 test_toy_math.py
+
+4. Inspect the produced repository diff.
+
+If native git DSML is available, use the git tool with action=diff, repo set to
+the repository root, and path set to toy_math.py.
+
+Otherwise, use the bash tool to run exactly:
+
+cd __REPO__ && git diff -- toy_math.py
+
+5. If the test passes, use the context tool with action=checkpoint and label
+   context-self-improvement-after-pass. This context checkpoint call must be the
+   only tool call in its DSML block. Save the returned checkpoint id for step 6.
+
+6. Use the context tool with action=restore, id set to the checkpoint id from
+   step 5, reason=context-self-improvement-restore-check, and
+   allow_side_effect_mismatch=true. This context restore call must be the only
+   tool call in its DSML block.
+
+7. After restore, inspect repository status again.
+
+If native git DSML is available, use the git tool with action=status and repo
+set to the repository root.
+
+Otherwise, use the bash tool to run exactly:
+
+cd __REPO__ && git status --short
+
+Then run exactly this test command again with the bash tool:
+
+cd __REPO__ && python3 test_toy_math.py
+
+After this restore, do not create any more context checkpoints and do not call
+context restore again. Proceed directly to the ledger.
+
+8. Use the write tool to create the ledger file at the ledger path. The ledger
+   must contain these exact field names:
+
+# DS4 Context Self Improvement Ledger
+git_status_checked=yes
+git_status_mode=<native or bash>
+git_diff_checked=yes
+git_diff_mode=<native or bash>
+context_checkpoint_before=yes
+context_checkpoint_after=yes
+context_restore_used=yes
+tests_before_restore=pass
+tests_after_restore=pass
+fixed_file=toy_math.py
+final=CONTEXT_SELF_IMPROVEMENT_DONE
+
+9. After the write tool result, answer exactly:
+CONTEXT_SELF_IMPROVEMENT_DONE
diff --git a/tests/ds4_agent_context_test.c b/tests/ds4_agent_context_test.c
new file mode 100644
index 00000000..7f80d2f0
--- /dev/null
+++ b/tests/ds4_agent_context_test.c
@@ -0,0 +1,243 @@
+#include "../ds4_agent_context.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+static void test_fail(const char *msg) { /* print a prefixed message on stderr, exit with status 1 */
+    fprintf(stderr, "%s: %s\n", "ds4_agent_context_test", msg);
+    exit(1);
+}
+
+#define CHECK(cond, msg) do { if (!(cond)) test_fail(msg); } while (0)
+
+static char *test_strdup(const char *s) { /* heap copy of s; fails the test if malloc fails */
+    size_t size = strlen(s) + 1; /* include the terminating NUL */
+    char *copy = malloc(size);
+    CHECK(copy != NULL, "malloc failed");
+    memcpy(copy, s, size);
+    return copy;
+}
+
+static char *make_temp_dir(void) { /* mkdir a 0700 scratch dir under $TMPDIR (or /tmp); caller frees */
+    const char *tmp_root = getenv("TMPDIR");
+    if (tmp_root == NULL || tmp_root[0] == '\0') tmp_root = "/tmp";
+    for (int attempt = 0; attempt < 100; attempt++) {
+        char candidate[PATH_MAX];
+        snprintf(candidate, sizeof(candidate), "%s/ds4-agent-context-test-%ld-%ld-%d",
+                 tmp_root, (long)getpid(), (long)time(NULL), attempt);
+        if (mkdir(candidate, 0700) == 0) return test_strdup(candidate);
+        if (errno != EEXIST) break; /* only a name clash is worth retrying */
+    }
+    test_fail("failed to create temp dir");
+    return NULL; /* not reached: test_fail() exits */
+}
+
+static void write_raw_file(const char *path, const char *id) {
+    FILE *fp = fopen(path, "wb");
+    CHECK(fp != NULL, "failed to open raw metadata");
+    fprintf(fp,
+            "{\n"
+            "  \"id\": \"%s\",\n"
+            "  \"label\": \"unsafe\",\n"
+            "  \"created_at\": 1,\n"
+            "  \"world_epoch\": 1,\n"
+            "  \"transcript_tokens\": 10,\n"
+            "  \"kv_path\": \"../escape.kv\",\n"
+            "  \"memory_path\": \"safe.memory.md\"\n"
+            "}\n",
+            id);
+    CHECK(fclose(fp) == 0, "failed to write raw metadata");
+}
+
+static void write_key_collision_file(const char *path, const char *id) {
+    FILE *fp = fopen(path, "wb");
+    CHECK(fp != NULL, "failed to open key collision metadata");
+    fprintf(fp,
+            "{\n"
+            "  \"id\": \"%s\",\n"
+            "  \"label\": \"label mentions \\\"world_epoch\\\": 99 before the real key\",\n"
+            "  \"created_at\": 7,\n"
+            "  \"world_epoch\": 42,\n"
+            "  \"transcript_tokens\": 77,\n"
+            "  \"kv_path\": \"%s.kv\",\n"
+            "  \"memory_path\": \"%s.memory.md\"\n"
+            "}\n",
+            id, id, id);
+    CHECK(fclose(fp) == 0, "failed to write key collision metadata");
+}
+
+static void fill_meta(ds4_agent_context_meta *m, const char *id,
+                      const char *label, uint64_t epoch, int tokens) {
+    snprintf(m->id, sizeof(m->id), "%s", id);
+    m->label = test_strdup(label);
+    m->kv_file = ds4_agent_context_file_name(m->id, ".kv");
+    m->memory_file = ds4_agent_context_file_name(m->id, ".memory.md");
+    m->created_at = 1234;
+    m->world_epoch = epoch;
+    m->transcript_tokens = tokens;
+}
+
+int main(void) {
+    static const char id1[] = "1111111111111111111111111111111111111111";
+    static const char id2[] = "2222222222222222222222222222222222222222";
+    static const char id3[] = "3333333333333333333333333333333333333333";
+    static const char id4[] = "4444444444444444444444444444444444444444";
+    char err[256] = {0};
+    char *dir = make_temp_dir();
+
+    char *meta1_name = ds4_agent_context_file_name(id1, ".meta.json");
+    char *meta2_name = ds4_agent_context_file_name(id2, ".meta.json");
+    char *meta1_path = ds4_agent_context_path_for_file(dir, meta1_name);
+    char *meta2_path = ds4_agent_context_path_for_file(dir, meta2_name);
+
+    ds4_agent_context_meta m1 = {0};
+    fill_meta(&m1, id1, "first \"checkpoint\"\\line\nnext", 4, 101);
+    CHECK(ds4_agent_context_write_meta(&m1, meta1_path, err, sizeof(err)),
+          "failed to write first metadata");
+    ds4_agent_context_meta_free(&m1);
+
+    ds4_agent_context_meta m2 = {0};
+    fill_meta(&m2, id2, "second", 12, 202);
+    CHECK(ds4_agent_context_write_meta(&m2, meta2_path, err, sizeof(err)),
+          "failed to write second metadata");
+    ds4_agent_context_meta_free(&m2);
+
+    ds4_agent_context_meta read_back = {0};
+    CHECK(ds4_agent_context_read_meta_file(meta1_path, &read_back, err, sizeof(err)),
+          "failed to read metadata roundtrip");
+    CHECK(strcmp(read_back.id, id1) == 0, "roundtrip id mismatch");
+    CHECK(strcmp(read_back.label, "first \"checkpoint\"\\line\nnext") == 0,
+          "roundtrip label mismatch");
+    CHECK(read_back.world_epoch == 4, "roundtrip epoch mismatch");
+    CHECK(read_back.transcript_tokens == 101, "roundtrip tokens mismatch");
+    ds4_agent_context_meta_free(&read_back);
+
+    CHECK(ds4_agent_context_count_checkpoints(dir) == 2, "checkpoint count mismatch");
+    CHECK(ds4_agent_context_max_world_epoch(dir) == 12, "max world epoch mismatch");
+
+    ds4_agent_context_meta found = {0};
+    char *found_meta_path = NULL;
+    char *found_kv_path = NULL;
+    CHECK(ds4_agent_context_find_checkpoint(dir, "2222", &found,
+                                            &found_meta_path, &found_kv_path,
+                                            err, sizeof(err)),
+          "failed to find checkpoint by prefix");
+    CHECK(strcmp(found.id, id2) == 0, "found checkpoint id mismatch");
+    CHECK(strstr(found_kv_path, "2222222222222222222222222222222222222222.kv") != NULL,
+          "found kv path mismatch");
+    ds4_agent_context_meta_free(&found);
+    free(found_meta_path);
+    free(found_kv_path);
+
+    char *unsafe_name = ds4_agent_context_file_name(id3, ".meta.json");
+    char *unsafe_path = ds4_agent_context_path_for_file(dir, unsafe_name);
+    write_raw_file(unsafe_path, id3);
+    ds4_agent_context_meta unsafe = {0};
+    CHECK(!ds4_agent_context_read_meta_file(unsafe_path, &unsafe, err, sizeof(err)),
+          "unsafe metadata path should be rejected");
+    ds4_agent_context_meta_free(&unsafe);
+
+    char *collision_name = ds4_agent_context_file_name(id4, ".meta.json");
+    char *collision_path = ds4_agent_context_path_for_file(dir, collision_name);
+    write_key_collision_file(collision_path, id4);
+    ds4_agent_context_meta collision = {0};
+    CHECK(ds4_agent_context_read_meta_file(collision_path, &collision,
+                                           err, sizeof(err)),
+          "key collision metadata should parse");
+    CHECK(collision.world_epoch == 42,
+          "parser matched key text inside a string value");
+    CHECK(collision.transcript_tokens == 77,
+          "key collision transcript tokens mismatch");
+    ds4_agent_context_meta_free(&collision);
+
+    CHECK(ds4_agent_context_restore_epoch_guard(12, 12, false, err, sizeof(err)),
+          "equal epoch restore should be allowed");
+    CHECK(ds4_agent_context_restore_epoch_guard(13, 12, true, err, sizeof(err)),
+          "explicit side-effect override should be allowed");
+    CHECK(!ds4_agent_context_restore_epoch_guard(13, 12, false, err, sizeof(err)),
+          "epoch mismatch restore should be rejected");
+    CHECK(strstr(err, "world_epoch=13 to 12") != NULL,
+          "epoch guard error missing epoch details");
+    CHECK(ds4_agent_context_no_running_bash_guard("restore", 0, err, sizeof(err)),
+          "restore should allow no running bash jobs");
+    CHECK(!ds4_agent_context_no_running_bash_guard("restore", 2, err, sizeof(err)),
+          "restore should reject running bash jobs");
+    CHECK(strstr(err, "2 bash job(s)") != NULL,
+          "bash guard error missing job count");
+
+    ds4_agent_context_restore_metrics metrics = {
+        .checkpoint_tokens = 101,
+        .restore_notice_tokens = 13,
+        .restored_tokens = 114,
+    };
+    char *metrics_line = ds4_agent_context_restore_expected_metrics_line(&metrics);
+    CHECK(strstr(metrics_line, "KV restore expected metrics:") != NULL,
+          "restore metrics line missing expected label");
+    CHECK(strstr(metrics_line, "checkpoint_tokens=101") != NULL,
+          "restore metrics line missing checkpoint tokens");
+    CHECK(strstr(metrics_line, "expected_restore_notice_tokens=13") != NULL,
+          "restore metrics line missing notice tokens");
+    CHECK(strstr(metrics_line, "expected_prefill_suffix_tokens=13") != NULL,
+          "restore metrics line missing expected prefill suffix");
+    CHECK(strstr(metrics_line, "expected_saved_prefill_tokens=101") != NULL,
+          "restore metrics line missing expected saved prefill");
+    CHECK(strstr(metrics_line, " saved_prefill_tokens=") == NULL,
+          "restore metrics line should not present expected values as actual");
+    free(metrics_line);
+
+    ds4_agent_side_effects effects = {0};
+    uint64_t epoch = 3;
+    epoch = ds4_agent_side_effects_note(&effects, epoch,
+                                        "write", "experiment.md\nsecond line");
+    CHECK(epoch == 4, "side effect epoch mismatch");
+    char *summary = ds4_agent_side_effects_summary_since(&effects, 3);
+    CHECK(strstr(summary, "Known side effects after checkpoint:") != NULL,
+          "side effect summary header missing");
+    CHECK(strstr(summary, "epoch=4 write experiment.md second line") != NULL,
+          "side effect summary content missing");
+    free(summary);
+    summary = ds4_agent_side_effects_summary_since(&effects, 4);
+    CHECK(strcmp(summary, "") == 0, "empty side effect summary mismatch");
+    free(summary);
+    ds4_agent_side_effects_free(&effects);
+
+    for (int i = 0; i < 70; i++) {
+        char detail[32];
+        snprintf(detail, sizeof(detail), "effect-%d", i + 1);
+        epoch = ds4_agent_side_effects_note(&effects, epoch, "bash", detail);
+    }
+    CHECK(effects.count == 64, "side effect retained count mismatch");
+    CHECK(effects.evicted_count == 6, "side effect evicted count mismatch");
+    summary = ds4_agent_side_effects_summary_since(&effects, 4);
+    CHECK(strstr(summary, "may be incomplete") != NULL,
+          "truncated side effect warning missing");
+    CHECK(strstr(summary, "6 older side effect") != NULL,
+          "truncated side effect count missing");
+    CHECK(strstr(summary, "... more side effects omitted ...") != NULL,
+          "retained side effect omission marker missing");
+    free(summary);
+    ds4_agent_side_effects_free(&effects);
+
+    unlink(meta1_path);
+    unlink(meta2_path);
+    unlink(unsafe_path);
+    unlink(collision_path);
+    rmdir(dir);
+    free(meta1_name);
+    free(meta2_name);
+    free(unsafe_name);
+    free(collision_name);
+    free(meta1_path);
+    free(meta2_path);
+    free(unsafe_path);
+    free(collision_path);
+    free(dir);
+    return 0;
+}
diff --git a/tests/ds4_kv_cache_benefit_test.c b/tests/ds4_kv_cache_benefit_test.c
new file mode 100644
index 00000000..fd617ff8
--- /dev/null
+++ b/tests/ds4_kv_cache_benefit_test.c
@@ -0,0 +1,355 @@
+#include "../ds4.h"
+
+#include <errno.h>
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
+typedef struct { /* Growable NUL-terminated byte buffer used to assemble the prompt text. */
+    char *ptr;  /* storage managed with realloc(); NULL while the buffer is zero-initialized */
+    size_t len; /* bytes stored, excluding the trailing NUL */
+    size_t cap; /* bytes allocated at ptr */
+} test_buf;
+
+typedef struct { /* Result of comparing two logit vectors element by element. */
+    int top1_a;     /* argmax index of the first vector, -1 if it has no finite entry */
+    int top1_b;     /* argmax index of the second vector, -1 if it has no finite entry */
+    int nonfinite;  /* positions skipped because either value was not finite */
+    double rms;     /* sqrt(sum of squared finite differences / vector length) */
+    float max_abs;  /* largest absolute difference among the finite pairs */
+    bool same_top1; /* true when both argmax indices are valid and equal */
+} logit_cmp;
+
+static void fail(const char *msg) { /* Report msg on stderr under the test's name and terminate. */
+    fprintf(stderr, "ds4_kv_cache_benefit_test: %s\n", msg);
+    exit(1); /* never returns */
+}
+
+#define CHECK(cond, msg) do { if (!(cond)) fail(msg); } while (0) /* exit through fail(msg) when cond is false; msg is evaluated only on failure */
+
+static double now_sec(void) { /* Current CLOCK_MONOTONIC reading, in seconds as a double. */
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts); /* NOTE(review): return value is not checked */
+    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9; /* whole seconds plus nanoseconds scaled to seconds */
+}
+
+static void *xmalloc(size_t n) { /* malloc() wrapper that aborts the test instead of returning NULL. */
+    void *p = malloc(n ? n : 1); /* a zero-size request is bumped to one byte */
+    CHECK(p != NULL, "malloc failed");
+    return p;
+}
+
+/* Duplicate s, terminator included, into memory obtained from xmalloc().
+ * The caller owns the copy and releases it with free(). */
+static char *xstrdup(const char *s) {
+    size_t bytes = strlen(s) + 1;
+    return memcpy(xmalloc(bytes), s, bytes);
+}
+
+/* Grow b so `add` more bytes plus a NUL fit; capacity doubles from a 4096-byte floor. */
+static void buf_reserve(test_buf *b, size_t add) {
+    if (b->cap >= b->len + add + 1) return;
+    size_t grown = b->cap ? b->cap * 2 : 4096;
+    while (grown < b->len + add + 1) grown *= 2;
+    b->ptr = realloc(b->ptr, grown);
+    CHECK(b->ptr != NULL, "realloc failed"); /* fail() exits, so the old block is never needed again */
+    b->cap = grown;
+}
+
+/* Append n bytes of s to b and re-terminate the buffer with a NUL. */
+static void buf_append(test_buf *b, const char *s, size_t n) {
+    buf_reserve(b, n);
+    memcpy(&b->ptr[b->len], s, n);
+    b->ptr[b->len += n] = '\0';
+}
+
+static void buf_puts(test_buf *b, const char *s) { /* Append the NUL-terminated string s to b. */
+    buf_append(b, s, strlen(s));
+}
+
+/* Read integer env var `name`; unset, empty, malformed or out of [min, max] yields def. */
+static int env_int(const char *name, int def, int min, int max) {
+    const char *text = getenv(name);
+    if (text == NULL || text[0] == '\0') return def;
+    char *rest = NULL;
+    const long parsed = strtol(text, &rest, 10);
+    const bool ok = rest && *rest == '\0' && parsed >= min && parsed <= max;
+    if (ok) return (int)parsed;
+    fprintf(stderr, "ds4_kv_cache_benefit_test: ignoring invalid %s=%s\n",
+            name, text);
+    return def;
+}
+
+static const char *model_path(void) { /* Model file: $DS4_TEST_MODEL when set and non-empty, else "ds4flash.gguf". */
+    const char *override = getenv("DS4_TEST_MODEL");
+    return (override == NULL || override[0] == '\0') ? "ds4flash.gguf" : override;
+}
+
+/* Build the deterministic benchmark corpus: one header sentence followed by `lines` numbered facts. */
+static char *make_prompt_text(int lines) {
+    test_buf out = {0};
+    buf_puts(&out,
+        "KV cache benchmark corpus. Every line below is deterministic and "
+        "contains canary facts that should survive exact model-state restore.\n");
+    for (int row = 0; row < lines; ++row) {
+        char fact[256];
+        snprintf(fact, sizeof(fact), "Fact %04d: project=DS4 cache_test=enabled "
+                 "canary=CANARY-BENCH-%04d checksum=%08x "
+                 "instruction=preserve-prefix-state-without-refill.\n",
+                 row, row, (unsigned)(row * 2654435761u));
+        buf_puts(&out, fact);
+    }
+    return out.ptr != NULL ? out.ptr : xstrdup("");
+}
+
+/* Encode a corpus sized to reach target_tokens while keeping 256 tokens of ctx free.
+ * Starts at 32 lines and rescales for up to 18 attempts; *lines_out gets the final count. */
+static void build_prompt(ds4_engine *engine, int target_tokens, int ctx,
+                         ds4_tokens *prompt, int *lines_out) {
+    int lines = 32;
+    for (int attempt = 0; attempt < 18; attempt++) {
+        char *text = make_prompt_text(lines);
+        ds4_tokens_free(prompt);
+        memset(prompt, 0, sizeof(*prompt));
+        ds4_encode_chat_prompt(engine, "", text, DS4_THINK_NONE, prompt);
+        free(text);
+        const bool too_big = prompt->len + 256 >= ctx;
+        const bool too_small = prompt->len < target_tokens;
+        if (too_big && lines > 8) {
+            lines = (lines * 3) / 4;
+            if (lines < 8) lines = 8;
+        } else if (too_small) {
+            lines *= 2;
+        } else break;
+    }
+    CHECK(prompt->len > 256, "benchmark prompt too small");
+    CHECK(prompt->len + 256 < ctx, "benchmark prompt does not fit context");
+    if (lines_out) *lines_out = lines;
+}
+
+/* Create an empty temp file under $TMPDIR (default /tmp) and return its heap-allocated path. */
+static char *temp_payload_path(void) {
+    char tmpl[PATH_MAX];
+    const char *dir = getenv("TMPDIR");
+    snprintf(tmpl, sizeof(tmpl), "%s/ds4-kv-cache-benefit-%ld-XXXXXX",
+             (dir && dir[0]) ? dir : "/tmp", (long)getpid());
+    const int fd = mkstemp(tmpl);
+    CHECK(fd >= 0, strerror(errno));
+    close(fd); /* only the name is kept; the descriptor is not used further */
+    return xstrdup(tmpl);
+}
+
+/* Progress hook: logs "prefill_chunk" events when current is 0, equals total, or is a multiple of 512. */
+static void progress_cb(void *ud, const char *event, int current, int total) {
+    const char *label = ud ? (const char *)ud : "sync";
+    if (strcmp(event, "prefill_chunk") != 0) return;
+    if (current != 0 && current != total && current % 512 != 0) return;
+    fprintf(stderr, "ds4-kv-benefit: %s prefill %d/%d\n",
+            label, current, total);
+}
+
+/* Return the index of the largest finite value in x[0..n).
+ * Ties keep the earliest index; NaN and infinities are skipped.
+ * Returns -1 when n <= 0 or no element is finite. */
+static int logit_argmax(const float *x, int n) {
+    int winner = -1;
+    for (int i = 0; i < n; i++) {
+        const float v = x[i];
+        if (!isfinite(v)) continue;
+        if (winner < 0 || v > x[winner]) winner = i;
+    }
+    return winner;
+}
+
+/* Compare logit vectors a and b of length n: argmax agreement, max |b-a|, and RMS difference.
+ * Pairs with a non-finite member are counted in `nonfinite` and excluded from the sums. */
+static logit_cmp compare_logits(const float *a, const float *b, int n) {
+    logit_cmp c = {0};
+    c.top1_a = logit_argmax(a, n);
+    c.top1_b = logit_argmax(b, n);
+    c.same_top1 = c.top1_a >= 0 && c.top1_a == c.top1_b;
+    double sumsq = 0.0;
+    for (int i = 0; i < n; i++) {
+        if (!isfinite(a[i]) || !isfinite(b[i])) { c.nonfinite++; continue; }
+        const float d = b[i] - a[i];
+        const float ad = fabsf(d);
+        if (ad > c.max_abs) c.max_abs = ad;
+        sumsq += (double)d * (double)d;
+    }
+    /* The mean is over all n positions; an empty vector yields 0 instead of 0/0 = NaN. */
+    c.rms = n > 0 ? sqrt(sumsq / (double)n) : 0.0;
+    return c;
+}
+
+static uint64_t file_size_or_die(const char *path) { /* Size of the file at path in bytes; aborts the test if stat() fails. */
+    struct stat st;
+    CHECK(stat(path, &st) == 0, strerror(errno));
+    CHECK(st.st_size >= 0, "negative file size"); /* st_size is signed; reject negatives before the unsigned cast */
+    return (uint64_t)st.st_size;
+}
+
+int main(void) { /* Time KV payload save/restore against a full prefill and check that logits are preserved. */
+    const int ctx = env_int("DS4_KV_BENCH_CTX", 4096, 1024, 262144); /* context size, default 4096 */
+    const int target_tokens = env_int("DS4_KV_BENCH_TARGET_TOKENS",
+                                      ctx / 2, 256, ctx - 512); /* default: half the context */
+
+    ds4_engine *engine = NULL;
+    ds4_engine_options opt = {
+        .model_path = model_path(),
+#ifdef __APPLE__
+        .backend = DS4_BACKEND_METAL,
+#else
+        .backend = DS4_BACKEND_CUDA,
+#endif
+        .quality = false,
+    };
+    CHECK(ds4_engine_open(&engine, &opt) == 0, "failed to open DS4 engine");
+    const int vocab = ds4_engine_vocab_size(engine);
+    CHECK(vocab > 0, "invalid vocab size");
+
+    ds4_tokens prompt = {0};
+    int prompt_lines = 0;
+    build_prompt(engine, target_tokens, ctx, &prompt, &prompt_lines);
+
+    ds4_session *base = NULL; /* Phase 1: prefill the prompt from scratch and time it. */
+    CHECK(ds4_session_create(&base, engine, ctx) == 0, "failed to create base session");
+    char err[256] = {0};
+    ds4_session_set_progress(base, progress_cb, "base");
+    double t0 = now_sec();
+    CHECK(ds4_session_sync(base, &prompt, err, sizeof(err)) == 0,
+          err[0] ? err : "base prefill failed");
+    double base_sync_sec = now_sec() - t0;
+    ds4_session_set_progress(base, NULL, NULL);
+    CHECK(ds4_session_pos(base) == prompt.len, "base session token count mismatch");
+
+    float *base_logits = xmalloc((size_t)vocab * sizeof(*base_logits)); /* reference logits for the restore comparison */
+    CHECK(ds4_session_copy_logits(base, base_logits, vocab) == vocab,
+          "failed to copy base logits");
+
+    uint64_t payload_bytes = ds4_session_payload_bytes(base); /* Phase 2: write the session payload to a temp file and time it. */
+    CHECK(payload_bytes > 0, "base session has no KV payload");
+    char *payload_path = temp_payload_path();
+    FILE *fp = fopen(payload_path, "wb");
+    CHECK(fp != NULL, strerror(errno));
+    t0 = now_sec();
+    CHECK(ds4_session_save_payload(base, fp, err, sizeof(err)) == 0,
+          err[0] ? err : "failed to save KV payload");
+    CHECK(fclose(fp) == 0, "failed to close KV payload");
+    double save_sec = now_sec() - t0; /* includes the fclose() above */
+    CHECK(file_size_or_die(payload_path) == payload_bytes,
+          "payload byte count mismatch");
+
+    ds4_session *restored = NULL; /* Phase 3: load the payload into a fresh session and time it. */
+    CHECK(ds4_session_create(&restored, engine, ctx) == 0,
+          "failed to create restored session");
+    fp = fopen(payload_path, "rb");
+    CHECK(fp != NULL, strerror(errno));
+    t0 = now_sec();
+    CHECK(ds4_session_load_payload(restored, fp, payload_bytes,
+                                   err, sizeof(err)) == 0,
+          err[0] ? err : "failed to load KV payload");
+    double load_sec = now_sec() - t0;
+    fclose(fp);
+    CHECK(ds4_session_pos(restored) == prompt.len,
+          "restored session token count mismatch");
+
+    float *loaded_logits = xmalloc((size_t)vocab * sizeof(*loaded_logits));
+    CHECK(ds4_session_copy_logits(restored, loaded_logits, vocab) == vocab,
+          "failed to copy loaded logits");
+    logit_cmp base_cmp = compare_logits(base_logits, loaded_logits, vocab);
+    CHECK(base_cmp.nonfinite == 0, "non-finite logits after KV load");
+    CHECK(base_cmp.same_top1, "KV load changed top-1 token");
+    CHECK(base_cmp.max_abs <= 1.0e-4f, "KV load changed base logits"); /* restored logits must match within 1e-4 */
+
+    ds4_tokens suffix = {0}; /* Phase 4: extend the prompt with a short suffix and sync the restored session to it. */
+    ds4_tokenize_text(engine,
+        "\n\nKV cache continuation probe: report CANARY-BENCH-0042 exactly once.",
+        &suffix);
+    CHECK(suffix.len > 0 && suffix.len < 128, "unexpected suffix token count");
+    ds4_tokens extended = {0};
+    ds4_tokens_copy(&extended, &prompt);
+    for (int i = 0; i < suffix.len; i++) ds4_tokens_push(&extended, suffix.v[i]);
+    CHECK(extended.len + 64 < ctx, "extended prompt does not fit context");
+
+    int common = ds4_session_common_prefix(restored, &extended); /* presumably the count of leading tokens shared with the session; confirm in ds4.h */
+    int cached = common == ds4_session_pos(restored) &&
+                 extended.len >= ds4_session_pos(restored) ? common : 0; /* counted as cached only when the whole restored session prefixes `extended` */
+    int restored_prefill_tokens = extended.len - cached;
+    CHECK(cached == prompt.len, "restored session did not retain prompt prefix");
+    CHECK(restored_prefill_tokens == suffix.len,
+          "restored session would prefill more than the suffix");
+
+    ds4_session_set_progress(restored, progress_cb, "restored-suffix");
+    t0 = now_sec();
+    CHECK(ds4_session_sync(restored, &extended, err, sizeof(err)) == 0,
+          err[0] ? err : "suffix sync failed");
+    double suffix_sync_sec = now_sec() - t0;
+    ds4_session_set_progress(restored, NULL, NULL);
+    CHECK(ds4_session_pos(restored) == extended.len,
+          "restored suffix token count mismatch");
+
+    float *restored_suffix_logits = xmalloc((size_t)vocab * sizeof(*restored_suffix_logits));
+    CHECK(ds4_session_copy_logits(restored, restored_suffix_logits, vocab) == vocab,
+          "failed to copy restored suffix logits");
+
+    ds4_session *full = NULL; /* Phase 5: prefill the extended prompt in a fresh session for comparison. */
+    CHECK(ds4_session_create(&full, engine, ctx) == 0,
+          "failed to create full-prefill session");
+    ds4_session_set_progress(full, progress_cb, "full");
+    t0 = now_sec();
+    CHECK(ds4_session_sync(full, &extended, err, sizeof(err)) == 0,
+          err[0] ? err : "full prefill failed");
+    double full_sync_sec = now_sec() - t0;
+    ds4_session_set_progress(full, NULL, NULL);
+
+    float *full_logits = xmalloc((size_t)vocab * sizeof(*full_logits));
+    CHECK(ds4_session_copy_logits(full, full_logits, vocab) == vocab,
+          "failed to copy full logits");
+    logit_cmp extended_cmp = compare_logits(full_logits, restored_suffix_logits, vocab);
+    CHECK(extended_cmp.nonfinite == 0, "non-finite logits after suffix sync"); /* only finiteness and top-1 are enforced here; max_abs and rms are just reported */
+    CHECK(extended_cmp.same_top1,
+          "KV restore plus suffix changed top-1 versus full prefill");
+
+    printf("kv-cache-benefit: prompt_lines=%d base_tokens=%d suffix_tokens=%d "
+           "full_prefill_tokens=%d restored_prefill_tokens=%d saved_prefill_tokens=%d "
+           "payload_bytes=%" PRIu64 " base_sync_sec=%.3f save_sec=%.3f "
+           "load_sec=%.3f suffix_sync_sec=%.3f full_extended_sync_sec=%.3f "
+           "base_top1_equal=%s base_max_abs=%g extended_top1_equal=%s "
+           "extended_max_abs=%g extended_rms=%g quality_guard=logits_equivalence\n",
+           prompt_lines, prompt.len, suffix.len,
+           extended.len, restored_prefill_tokens,
+           extended.len - restored_prefill_tokens,
+           payload_bytes, base_sync_sec, save_sec,
+           load_sec, suffix_sync_sec, full_sync_sec,
+           base_cmp.same_top1 ? "true" : "false", base_cmp.max_abs,
+           extended_cmp.same_top1 ? "true" : "false",
+           extended_cmp.max_abs, extended_cmp.rms); /* single-line key=value summary on stdout */
+
+    unlink(payload_path); /* cleanup; not reached when a CHECK above fails, because fail() exits */
+    free(payload_path);
+    free(full_logits);
+    free(restored_suffix_logits);
+    free(loaded_logits);
+    free(base_logits);
+    ds4_tokens_free(&extended);
+    ds4_tokens_free(&suffix);
+    ds4_tokens_free(&prompt);
+    ds4_session_free(full);
+    ds4_session_free(restored);
+    ds4_session_free(base);
+    ds4_engine_close(engine);
+    return 0;
+}