From 96c046cc8ba9d1059fb454b865c2118e06f16a9f Mon Sep 17 00:00:00 2001 From: bitifirefly Date: Sun, 22 Mar 2026 07:19:43 +0000 Subject: [PATCH] perf(provider): improve anthropic prompt cache breakpoints --- opencane/providers/litellm_provider.py | 39 +++++++++++++++++--------- tests/test_litellm_prompt_caching.py | 10 +++++-- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/opencane/providers/litellm_provider.py b/opencane/providers/litellm_provider.py index e3889b8a38..919ace15dd 100644 --- a/opencane/providers/litellm_provider.py +++ b/opencane/providers/litellm_provider.py @@ -115,31 +115,42 @@ def _apply_cache_control( messages: list[dict[str, Any]], tools: list[dict[str, Any]] | None, ) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]: - """Return copies of messages/tools with ephemeral cache_control hints.""" - new_messages: list[dict[str, Any]] = [] - for msg in messages: - if msg.get("role") != "system": - new_messages.append(msg) - continue + """Return copies of messages and tools with cache_control injected. + + Two breakpoints are placed: + 1. System message — caches the static system prompt. + 2. Second-to-last message — caches conversation history prefix. 
+ """ + cache_marker = {"type": "ephemeral"} + new_messages = list(messages) + def _mark(msg: dict[str, Any]) -> dict[str, Any]: content = msg.get("content") if isinstance(content, str): - new_content: Any = [{"type": "text", "text": content, "cache_control": {"type": "ephemeral"}}] - elif isinstance(content, list) and content: + return { + **msg, + "content": [{"type": "text", "text": content, "cache_control": cache_marker}], + } + if isinstance(content, list) and content: new_content = list(content) last = new_content[-1] if isinstance(last, dict): - new_content[-1] = {**last, "cache_control": {"type": "ephemeral"}} - else: - new_messages.append(msg) - continue + new_content[-1] = {**last, "cache_control": cache_marker} + return {**msg, "content": new_content} + return msg + + # Breakpoint 1: system message. + if new_messages and new_messages[0].get("role") == "system": + new_messages[0] = _mark(new_messages[0]) - new_messages.append({**msg, "content": new_content}) + # Breakpoint 2: second-to-last message (conversation history prefix). 
def test_apply_cache_control_marks_system_history_breakpoint_and_last_tool() -> None:
    """System message, second-to-last message, and last tool each get an
    ephemeral cache_control marker — and the caller's inputs are not mutated."""
    provider = LiteLLMProvider(default_model="anthropic/claude-sonnet-4-5")
    messages = [
        {"role": "system", "content": "system prompt"},
        {"role": "user", "content": "hello"},
        {"role": "assistant", "content": "answer"},
    ]
    tools = [
        {"type": "function", "function": {"name": "tool_a"}},
    ]

    new_messages, new_tools = provider._apply_cache_control(messages, tools)

    # Breakpoint 1: the string system prompt is promoted to a content list
    # whose block carries the cache marker.
    assert isinstance(new_messages[0]["content"], list)
    assert new_messages[0]["content"][0]["cache_control"]["type"] == "ephemeral"
    # Breakpoint 2: second-to-last message (index 1 of 3) is marked.
    assert isinstance(new_messages[1]["content"], list)
    assert new_messages[1]["content"][0]["cache_control"]["type"] == "ephemeral"
    # The newest message stays untouched.
    assert new_messages[2]["content"] == "answer"
    assert new_tools is not None
    assert new_tools[-1]["cache_control"]["type"] == "ephemeral"
    # Inputs must not be mutated.
    assert "cache_control" not in tools[-1]
    assert messages[0]["content"] == "system prompt"
    assert messages[1]["content"] == "hello"
"content": "answer"}, ], tools=[{"type": "function", "function": {"name": "tool_a"}}], ) kwargs = captured["kwargs"] assert kwargs["messages"][0]["content"][0]["cache_control"]["type"] == "ephemeral" + assert kwargs["messages"][1]["content"][0]["cache_control"]["type"] == "ephemeral" assert kwargs["tools"][-1]["cache_control"]["type"] == "ephemeral" -