rmescandon · rmescandon · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/README.md b/README.md
@@ -31,7 +31,29 @@ modelcost --source all gpt-4o 1000 500
 modelcost --json gpt-4o 1000 500
 ```
 
-List available models:
+### Cached and reasoning tokens
+
+Modern LLM APIs charge differently for cached input and reasoning output tokens.
+Pass them as optional flags — they default to 0 so existing usage is unchanged.
+
+```bash
+# Cached input tokens (served from prompt cache)
+modelcost gpt-4o 1000 500 --cached-input-tokens 200
+
+# Cache creation tokens (first-time cache writes)
+modelcost gpt-4o 1000 500 --cache-creation-input-tokens 100
+
+# Reasoning tokens (subset of output_tokens, e.g. o1/R1 thinking)
+modelcost deepseek/deepseek-r1 2000 5000 --reasoning-tokens 3000
+
+# All together
+modelcost gpt-4.1-mini 1000 500 \
+  --cached-input-tokens 200 \
+  --cache-creation-input-tokens 100 \
+  --reasoning-tokens 150
+```
+
+### List models
 
 ```bash
 modelcost models
@@ -40,7 +62,7 @@ modelcost models --filter gpt
 modelcost models --json
 ```
 
-CLI help:
+### Help
 
 ```bash
 modelcost --help
@@ -52,13 +74,26 @@ modelcost models --help
 ```python
 from modelcost.calculator import calculate_cost, list_models
 
+# Basic usage (backward compatible)
 result = calculate_cost("gpt-4o", 1000, 500)
 
 for source in result.available_sources:
     print(f"{source.source}: ${source.total_cost_usd:.6f}")
 
-litellm_cost = next(s for s in result.sources if s.source == "litellm")
-print(litellm_cost.price_per_million_input, litellm_cost.price_per_million_output)
+# With cached and reasoning tokens
+result = calculate_cost(
+    "gpt-4.1-mini",
+    input_tokens=1000,
+    output_tokens=500,
+    cached_input_tokens=200,
+    cache_creation_input_tokens=100,
+    reasoning_tokens=150,
+)
+
+s = result.sources[0]
+print(f"${s.total_cost_usd:.6f}")
+print(f"  cache read:  ${s.price_per_million_cache_read}/M")
+print(f"  reasoning:   ${s.price_per_million_reasoning}/M")
 
 models = list_models("openrouter")
 ```
@@ -67,16 +102,63 @@ models = list_models("openrouter")
 
 `calculate_cost()` returns a `CostResult` with:
 - `model`, `input_tokens`, `output_tokens`
+- `cached_input_tokens`, `cache_creation_input_tokens`, `reasoning_tokens` (0 when not used)
 - `sources`: list of `SourceCost` objects
 - `available_sources`: only sources with prices found
 
 Each `SourceCost` includes:
 - `source`
 - `total_cost_usd`
-- `price_per_million_input`
-- `price_per_million_output`
+- `price_per_million_input`, `price_per_million_output`
+- `price_per_million_cache_read`, `price_per_million_cache_creation`, `price_per_million_reasoning` (present only when the source has specific pricing for these)
 - `error` (when not available)
 
+### Cost formula
+
+All subset tokens (`cached_input_tokens`, `cache_creation_input_tokens`, `reasoning_tokens`)
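+are treated as **subsets** of their parent total and are clamped so they never exceed it (`cached_input_tokens` is capped at `input_tokens` first, then `cache_creation_input_tokens` at what remains; `reasoning_tokens` is capped at `output_tokens`):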
+
+```
+text_input  = input_tokens  - cached_input - cache_creation
+text_output = output_tokens - reasoning_tokens
+
+total = text_input     * input_rate
+      + cached_input   * cache_read_rate    (fallback: input_rate)
+      + cache_creation * cache_creation_rate (fallback: input_rate)
+      + text_output    * output_rate
+      + reasoning      * reasoning_rate      (fallback: output_rate)
+```
+
+This matches how most APIs report usage — `input_tokens` and `output_tokens` are the
+totals including cached/reasoning, and the detail fields are subsets.
+When a specific rate is missing, the base rate for that category is used as fallback.
+
+### JSON output
+
+`--json` / `result.to_dict()` includes the token-count fields `cached_input_tokens`, `cache_creation_input_tokens`, and `reasoning_tokens` only when they are non-zero:
+
+```json
+{
+  "model": "gpt-4.1-mini",
+  "input_tokens": 1000,
+  "output_tokens": 500,
+  "cached_input_tokens": 200,
+  "cache_creation_input_tokens": 100,
+  "reasoning_tokens": 150,
+  "costs": [
+    {
+      "source": "litellm",
+      "total_cost_usd": 0.001148,
+      "price_per_million_input": 0.4,
+      "price_per_million_output": 1.6,
+      "price_per_million_cache_read": 0.1,
+      "price_per_million_cache_creation": 0.48,
+      "price_per_million_reasoning": 1.6,
+      "error": null
+    }
+  ]
+}
+```
+
 ## Caching
 
 `openrouter` responses are cached in `~/.modelcost_cache.json` for 1 hour.
@@ -86,3 +168,4 @@ Each `SourceCost` includes:
 - Prices are fetched at runtime from the upstream catalogs.
 - If a model is missing in a source, that source is marked as unavailable.
 - Network sources are fetched in parallel for the `all` option.
+- `tokencost` does not expose cache/reasoning pricing, so those tokens are billed at its base input/output rates — when used with `source="all"`, its cost may differ from `litellm` (typically higher) for calls that include cached or reasoning tokens.
diff --git a/modelcost/calculator.py b/modelcost/calculator.py
@@ -29,29 +29,60 @@ def calculate_cost(
     model: str,
     input_tokens: int,
     output_tokens: int,
+    *,
+    cached_input_tokens: int = 0,
+    cache_creation_input_tokens: int = 0,
+    reasoning_tokens: int = 0,
     source: str = "litellm",
 ) -> CostResult:
-    if input_tokens < 0 or output_tokens < 0:
-        raise ValueError("input_tokens and output_tokens must be >= 0")
+    if any(
+        v < 0
+        for v in (
+            input_tokens,
+            output_tokens,
+            cached_input_tokens,
+            cache_creation_input_tokens,
+            reasoning_tokens,
+        )
+    ):
+        raise ValueError("All token counts must be >= 0")
 
     if source not in VALID_SOURCES:
         raise ValueError(f"Invalid source '{source}'. Valid values: {VALID_SOURCES}")
 
     active = ["litellm", "openrouter", "tokencost"] if source == "all" else [source]
 
-    sources = _fetch_all(model, input_tokens, output_tokens, active)
+    sources = _fetch_all(
+        model,
+        input_tokens,
+        output_tokens,
+        active,
+        cached_input_tokens=cached_input_tokens,
+        cache_creation_input_tokens=cache_creation_input_tokens,
+        reasoning_tokens=reasoning_tokens,
+    )
 
     return CostResult(
         model=model,
         input_tokens=input_tokens,
         output_tokens=output_tokens,
         sources=sources,
         single_source=source != "all",  # output formatting flag
+        cached_input_tokens=cached_input_tokens,
+        cache_creation_input_tokens=cache_creation_input_tokens,
+        reasoning_tokens=reasoning_tokens,
     )
 
 
 def _fetch_all(
-    model: str, input_tokens: int, output_tokens: int, active: list[str]
+    model: str,
+    input_tokens: int,
+    output_tokens: int,
+    active: list[str],
+    *,
+    cached_input_tokens: int = 0,
+    cache_creation_input_tokens: int = 0,
+    reasoning_tokens: int = 0,
 ) -> list[SourceCost]:
     network_tasks = {
         name: fn
@@ -68,7 +99,15 @@ def _fetch_all(
         with ThreadPoolExecutor(max_workers=len(network_tasks)) as executor:
             futures = {
                 executor.submit(
-                    _compute, name, fn, model, input_tokens, output_tokens
+                    _compute,
+                    name,
+                    fn,
+                    model,
+                    input_tokens,
+                    output_tokens,
+                    cached_input_tokens=cached_input_tokens,
+                    cache_creation_input_tokens=cache_creation_input_tokens,
+                    reasoning_tokens=reasoning_tokens,
                 ): name
                 for name, fn in network_tasks.items()
             }
@@ -84,7 +123,15 @@ def _fetch_all(
 
 
 def _compute(
-    source_name: str, fetch_fn, model: str, input_tokens: int, output_tokens: int
+    source_name: str,
+    fetch_fn,
+    model: str,
+    input_tokens: int,
+    output_tokens: int,
+    *,
+    cached_input_tokens: int = 0,
+    cache_creation_input_tokens: int = 0,
+    reasoning_tokens: int = 0,
 ) -> SourceCost:
     try:
         prices = fetch_fn()
@@ -101,13 +148,44 @@ def _compute(
                 price_per_million_output=None,
                 error=f"Model '{model}' not found",
             )
-        cost = pricing["prompt"] * input_tokens + pricing["completion"] * output_tokens
-        return SourceCost(
+
+        prompt_rate = pricing["prompt"]
+        completion_rate = pricing["completion"]
+        cache_read_rate = pricing.get("cache_read", prompt_rate)
+        cache_creation_rate = pricing.get("cache_creation", prompt_rate)
+        reasoning_rate = pricing.get("reasoning", completion_rate)
+
+        # All subset tokens are clamped so they never exceed the parent total.
+        effective_cached = min(cached_input_tokens, input_tokens)
+        effective_creation = min(
+            cache_creation_input_tokens, input_tokens - effective_cached
+        )
+        text_input = input_tokens - effective_cached - effective_creation
+
+        effective_reasoning = min(reasoning_tokens, output_tokens)
+        text_output = output_tokens - effective_reasoning
+
+        cost = (
+            text_input * prompt_rate
+            + effective_cached * cache_read_rate
+            + effective_creation * cache_creation_rate
+            + text_output * completion_rate
+            + effective_reasoning * reasoning_rate
+        )
+
+        result = SourceCost(
             source=source_name,
             total_cost_usd=cost,
-            price_per_million_input=pricing["prompt"] * 1_000_000,
-            price_per_million_output=pricing["completion"] * 1_000_000,
+            price_per_million_input=prompt_rate * 1_000_000,
+            price_per_million_output=completion_rate * 1_000_000,
         )
+        if "cache_read" in pricing:
+            result.price_per_million_cache_read = cache_read_rate * 1_000_000
+        if "cache_creation" in pricing:
+            result.price_per_million_cache_creation = cache_creation_rate * 1_000_000
+        if "reasoning" in pricing:
+            result.price_per_million_reasoning = reasoning_rate * 1_000_000
+        return result
     except Exception as e:
         return SourceCost(
             source=source_name,

diff --git a/modelcost/cli.py b/modelcost/cli.py
@@ -33,14 +33,50 @@ def main() -> None:
     type=click.Choice(VALID_SOURCES),
     help="Pricing source.",
 )
+@click.option(
+    "--cached-input-tokens",
+    default=0,
+    show_default=False,
+    type=int,
+    help="Tokens served from cache (charged at cache-read rate).",
+)
+@click.option(
+    "--cache-creation-input-tokens",
+    default=0,
+    show_default=False,
+    type=int,
+    help="Tokens written to cache for the first time.",
+)
+@click.option(
+    "--reasoning-tokens",
+    default=0,
+    show_default=False,
+    type=int,
+    help="Thinking/reasoning output tokens (subset of output_tokens).",
+)
 @click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
 def cost_cmd(
-    model: str, input_tokens: int, output_tokens: int, source: str, as_json: bool
+    model: str,
+    input_tokens: int,
+    output_tokens: int,
+    source: str,
+    cached_input_tokens: int,
+    cache_creation_input_tokens: int,
+    reasoning_tokens: int,
+    as_json: bool,
 ) -> None:
     """Calculate the cost for MODEL with INPUT_TOKENS and OUTPUT_TOKENS."""
     # ── Calculation mode ──────────────────────────────────────────────
     try:
-        result = calculate_cost(model, input_tokens, output_tokens, source=source)
+        result = calculate_cost(
+            model,
+            input_tokens,
+            output_tokens,
+            cached_input_tokens=cached_input_tokens,
+            cache_creation_input_tokens=cache_creation_input_tokens,
+            reasoning_tokens=reasoning_tokens,
+            source=source,
+        )
     except Exception as e:
         click.echo(f"Error: {e}", err=True)
         sys.exit(1)
@@ -57,9 +93,15 @@ def cost_cmd(
             click.echo(f"unavailable — {s.error}", err=True)
             sys.exit(1)
     else:
-        click.echo(
-            f"Model: {result.model}  ({result.input_tokens} in / {result.output_tokens} out)\n"
-        )
+        parts = [f"{result.input_tokens} in", f"{result.output_tokens} out"]
+        if result.cached_input_tokens:
+            parts.append(f"{result.cached_input_tokens} cached")
+        if result.cache_creation_input_tokens:
+            parts.append(f"{result.cache_creation_input_tokens} cache-create")
+        if result.reasoning_tokens:
+            parts.append(f"{result.reasoning_tokens} reasoning")
+        header = " / ".join(parts)
+        click.echo(f"Model: {result.model}  ({header})\n")
         for s in result.sources:
             if s.available:
                 click.echo(f"  [{s.source:<12}] ${s.total_cost_usd:.6f} USD")