From 9450594d1444c16cce6630a40dacaeaa9d8dcd62 Mon Sep 17 00:00:00 2001 From: Giovanni Montana Date: Mon, 25 May 2026 18:08:13 +0100 Subject: [PATCH] fix(server): evict disk KV entry that fails prefill After an unclean shutdown the on-disk KV checkpoint can be intact (header, hash, token count all valid) but leave Metal in a state where prefill fails. Since the file keeps passing load-time checks it gets reloaded on every request, looping forever until the user manually deletes the cache directory. On prefill failure, if the prefix came from a disk entry, unlink it and invalidate the session. Next request gets a clean cache miss. Closes #251 --- ds4_server.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/ds4_server.c b/ds4_server.c index 53a1eb2b..649bae9b 100644 --- a/ds4_server.c +++ b/ds4_server.c @@ -8763,6 +8763,14 @@ static void kv_cache_restore_suppressed_continued(kv_disk_cache *kc, ds4_kvstore_restore_suppressed_continued(kc, old_tokens, suppressed_tokens); } +static void kv_cache_evict_failed_disk_entry(server *s, const char *path) { + if (!path) return; + server_log(DS4_LOG_KVCACHE, + "ds4-server: kv cache evicted reason=prefill-failed file=%s", path); + unlink(path); + ds4_session_invalidate(s->session); +} + static void kv_cache_maybe_store_continued(server *s) { kv_disk_cache *kc = &s->kv; const ds4_tokens *tokens = ds4_session_tokens(s->session); @@ -10029,7 +10037,6 @@ static void generate_job(server *s, job *j) { const double t0 = now_sec(); uint64_t trace_id = trace_begin(s, j, cached, prompt_tokens, &cache_diag, cache_source, disk_cached, disk_cache_path); - free(disk_cache_path); char ctx_span[48]; request_ctx_span(ctx_span, sizeof(ctx_span), cached, prompt_tokens); server_prefill_progress progress = { @@ -10131,6 +10138,8 @@ static void generate_job(server *s, job *j) { ds4_session_set_display_progress(s->session, NULL, NULL); kv_cache_restore_suppressed_continued(&s->kv, suppressed_continued_last, cold_store_len); + kv_cache_evict_failed_disk_entry(s, disk_cache_path); + free(disk_cache_path); trace_event(s, trace_id, "prefill failed: %s", err); send_prefill_failure_response(s, j, &progress, ctx_span, req_flags, err); return; @@ -10152,10 +10161,13 @@ static void generate_job(server *s, job *j) { ds4_session_set_display_progress(s->session, NULL, NULL); kv_cache_restore_suppressed_continued(&s->kv, suppressed_continued_last, cold_store_len); + kv_cache_evict_failed_disk_entry(s, disk_cache_path); + free(disk_cache_path); trace_event(s, trace_id, "prefill failed: %s", err); send_prefill_failure_response(s, j, &progress, ctx_span, req_flags, err); return; } + free(disk_cache_path); /* Once a non-live request wins, old protocol live bindings are stale. Keep * a binding only when this request explicitly continued from it. */ if (!responses_live_continuation) responses_live_clear(s);