From 96c046cc8ba9d1059fb454b865c2118e06f16a9f Mon Sep 17 00:00:00 2001 From: bitifirefly Date: Sun, 22 Mar 2026 07:19:43 +0000 Subject: [PATCH] perf(provider): improve anthropic prompt cache breakpoints --- opencane/providers/litellm_provider.py | 39 +++++++++++++++++--------- tests/test_litellm_prompt_caching.py | 10 +++++-- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/opencane/providers/litellm_provider.py b/opencane/providers/litellm_provider.py index e3889b8a38..919ace15dd 100644 --- a/opencane/providers/litellm_provider.py +++ b/opencane/providers/litellm_provider.py @@ -115,31 +115,42 @@ def _apply_cache_control( messages: list[dict[str, Any]], tools: list[dict[str, Any]] | None, ) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]: - """Return copies of messages/tools with ephemeral cache_control hints.""" - new_messages: list[dict[str, Any]] = [] - for msg in messages: - if msg.get("role") != "system": - new_messages.append(msg) - continue + """Return copies of messages and tools with cache_control injected. + + Two breakpoints are placed: + 1. System message — caches the static system prompt. + 2. Second-to-last message — caches conversation history prefix. 
+ """ + cache_marker = {"type": "ephemeral"} + new_messages = list(messages) + def _mark(msg: dict[str, Any]) -> dict[str, Any]: content = msg.get("content") if isinstance(content, str): - new_content: Any = [{"type": "text", "text": content, "cache_control": {"type": "ephemeral"}}] - elif isinstance(content, list) and content: + return { + **msg, + "content": [{"type": "text", "text": content, "cache_control": cache_marker}], + } + if isinstance(content, list) and content: new_content = list(content) last = new_content[-1] if isinstance(last, dict): - new_content[-1] = {**last, "cache_control": {"type": "ephemeral"}} - else: - new_messages.append(msg) - continue + new_content[-1] = {**last, "cache_control": cache_marker} + return {**msg, "content": new_content} + return msg + + # Breakpoint 1: system message. + if new_messages and new_messages[0].get("role") == "system": + new_messages[0] = _mark(new_messages[0]) - new_messages.append({**msg, "content": new_content}) + # Breakpoint 2: second-to-last message (conversation history prefix). 
def test_apply_cache_control_marks_system_history_breakpoint_and_last_tool() -> None:
    """System message, second-to-last message, and last tool each get an
    ephemeral cache_control marker — and the caller's inputs are not mutated."""
    provider = LiteLLMProvider(default_model="anthropic/claude-sonnet-4-5")
    messages = [
        {"role": "system", "content": "system prompt"},
        {"role": "user", "content": "hello"},
        {"role": "assistant", "content": "answer"},
    ]
    tools = [
        {"type": "function", "function": {"name": "tool_a"}},
    ]

    new_messages, new_tools = provider._apply_cache_control(messages, tools)

    # Breakpoint 1: the string system prompt is promoted to a content list
    # whose block carries the cache marker.
    assert isinstance(new_messages[0]["content"], list)
    assert new_messages[0]["content"][0]["cache_control"]["type"] == "ephemeral"
    # Breakpoint 2: second-to-last message (index 1 of 3) is marked.
    assert isinstance(new_messages[1]["content"], list)
    assert new_messages[1]["content"][0]["cache_control"]["type"] == "ephemeral"
    # The newest message stays untouched.
    assert new_messages[2]["content"] == "answer"
    assert new_tools is not None
    assert new_tools[-1]["cache_control"]["type"] == "ephemeral"
    # Inputs must not be mutated.
    assert "cache_control" not in tools[-1]
    assert messages[0]["content"] == "system prompt"
    assert messages[1]["content"] == "hello"
"content": "answer"}, ], tools=[{"type": "function", "function": {"name": "tool_a"}}], ) kwargs = captured["kwargs"] assert kwargs["messages"][0]["content"][0]["cache_control"]["type"] == "ephemeral" + assert kwargs["messages"][1]["content"][0]["cache_control"]["type"] == "ephemeral" assert kwargs["tools"][-1]["cache_control"]["type"] == "ephemeral" -