Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 89 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,29 @@ modelcost --source all gpt-4o 1000 500
modelcost --json gpt-4o 1000 500
```

List available models:
### Cached and reasoning tokens

Modern LLM APIs charge differently for cached input and reasoning output tokens.
Pass them as optional flags — they default to 0 so existing usage is unchanged.

```bash
# Cached input tokens (served from prompt cache)
modelcost gpt-4o 1000 500 --cached-input-tokens 200

# Cache creation tokens (first-time cache writes)
modelcost gpt-4o 1000 500 --cache-creation-input-tokens 100

# Reasoning tokens (subset of output_tokens, e.g. o1/R1 thinking)
modelcost deepseek/deepseek-r1 2000 5000 --reasoning-tokens 3000

# All together
modelcost gpt-4.1-mini 1000 500 \
--cached-input-tokens 200 \
--cache-creation-input-tokens 100 \
--reasoning-tokens 150
```

### List models

```bash
modelcost models
Expand All @@ -40,7 +62,7 @@ modelcost models --filter gpt
modelcost models --json
```

CLI help:
### Help

```bash
modelcost --help
Expand All @@ -52,13 +74,26 @@ modelcost models --help
```python
from modelcost.calculator import calculate_cost, list_models

# Basic usage (backward compatible)
result = calculate_cost("gpt-4o", 1000, 500)

for source in result.available_sources:
print(f"{source.source}: ${source.total_cost_usd:.6f}")

litellm_cost = next(s for s in result.sources if s.source == "litellm")
print(litellm_cost.price_per_million_input, litellm_cost.price_per_million_output)
# With cached and reasoning tokens
result = calculate_cost(
"gpt-4.1-mini",
input_tokens=1000,
output_tokens=500,
cached_input_tokens=200,
cache_creation_input_tokens=100,
reasoning_tokens=150,
)

s = result.sources[0]
print(f"${s.total_cost_usd:.6f}")
print(f" cache read: ${s.price_per_million_cache_read}/M")
print(f" reasoning: ${s.price_per_million_reasoning}/M")

models = list_models("openrouter")
```
Expand All @@ -67,16 +102,63 @@ models = list_models("openrouter")

`calculate_cost()` returns a `CostResult` with:
- `model`, `input_tokens`, `output_tokens`
- `cached_input_tokens`, `cache_creation_input_tokens`, `reasoning_tokens` (0 when not used)
- `sources`: list of `SourceCost` objects
- `available_sources`: only sources with prices found

Each `SourceCost` includes:
- `source`
- `total_cost_usd`
- `price_per_million_input`
- `price_per_million_output`
- `price_per_million_input`, `price_per_million_output`
- `price_per_million_cache_read`, `price_per_million_cache_creation`, `price_per_million_reasoning` (present only when the source has specific pricing for these)
- `error` (when not available)

### Cost formula

The detail counts (`cached_input_tokens`, `cache_creation_input_tokens`, `reasoning_tokens`)
are treated as **subsets** of their parent total (`input_tokens` or `output_tokens`) and are clamped so that together they never exceed it:

```
text_input = input_tokens - cached_input - cache_creation
text_output = output_tokens - reasoning_tokens

total = text_input * input_rate
+ cached_input * cache_read_rate (fallback: input_rate)
+ cache_creation * cache_creation_rate (fallback: input_rate)
+ text_output * output_rate
+ reasoning * reasoning_rate (fallback: output_rate)
```

This matches how most APIs report usage — `input_tokens` and `output_tokens` are the
totals including cached/reasoning, and the detail fields are subsets.
When a specific rate is missing, the base rate for that category is used as fallback.

### JSON output

`--json` / `result.to_dict()` includes the new fields only when non-zero:

```json
{
"model": "gpt-4.1-mini",
"input_tokens": 1000,
"output_tokens": 500,
"cached_input_tokens": 200,
"cache_creation_input_tokens": 100,
"reasoning_tokens": 150,
"costs": [
{
"source": "litellm",
"total_cost_usd": 0.001148,
"price_per_million_input": 0.4,
"price_per_million_output": 1.6,
"price_per_million_cache_read": 0.1,
"price_per_million_cache_creation": 0.48,
"price_per_million_reasoning": 1.6,
"error": null
}
]
}
```

## Caching

`openrouter` responses are cached in `~/.modelcost_cache.json` for 1 hour.
Expand All @@ -86,3 +168,4 @@ Each `SourceCost` includes:
- Prices are fetched at runtime from the upstream catalogs.
- If a model is missing in a source, that source is marked as unavailable.
- Network sources are fetched in parallel for the `all` option.
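- `tokencost` does not expose cache/reasoning pricing, so cached, cache-creation, and reasoning tokens fall back to its base input/output rates — when used with `source="all"`, its cost may be higher than `litellm` for calls that include cached or reasoning tokens (and can be lower when cache-creation tokens are priced above the input rate).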
98 changes: 88 additions & 10 deletions modelcost/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,29 +29,60 @@ def calculate_cost(
model: str,
input_tokens: int,
output_tokens: int,
*,
cached_input_tokens: int = 0,
cache_creation_input_tokens: int = 0,
Comment thread
lmcalvo marked this conversation as resolved.
reasoning_tokens: int = 0,
source: str = "litellm",
) -> CostResult:
if input_tokens < 0 or output_tokens < 0:
raise ValueError("input_tokens and output_tokens must be >= 0")
if any(
v < 0
for v in (
input_tokens,
output_tokens,
cached_input_tokens,
cache_creation_input_tokens,
reasoning_tokens,
)
):
raise ValueError("All token counts must be >= 0")

if source not in VALID_SOURCES:
raise ValueError(f"Invalid source '{source}'. Valid values: {VALID_SOURCES}")

active = ["litellm", "openrouter", "tokencost"] if source == "all" else [source]

sources = _fetch_all(model, input_tokens, output_tokens, active)
sources = _fetch_all(
model,
input_tokens,
output_tokens,
active,
cached_input_tokens=cached_input_tokens,
cache_creation_input_tokens=cache_creation_input_tokens,
reasoning_tokens=reasoning_tokens,
)

return CostResult(
model=model,
input_tokens=input_tokens,
output_tokens=output_tokens,
sources=sources,
single_source=source != "all", # output formatting flag
cached_input_tokens=cached_input_tokens,
cache_creation_input_tokens=cache_creation_input_tokens,
reasoning_tokens=reasoning_tokens,
)


def _fetch_all(
model: str, input_tokens: int, output_tokens: int, active: list[str]
model: str,
input_tokens: int,
output_tokens: int,
active: list[str],
*,
cached_input_tokens: int = 0,
cache_creation_input_tokens: int = 0,
reasoning_tokens: int = 0,
) -> list[SourceCost]:
network_tasks = {
name: fn
Expand All @@ -68,7 +99,15 @@ def _fetch_all(
with ThreadPoolExecutor(max_workers=len(network_tasks)) as executor:
futures = {
executor.submit(
_compute, name, fn, model, input_tokens, output_tokens
_compute,
name,
fn,
model,
input_tokens,
output_tokens,
cached_input_tokens=cached_input_tokens,
cache_creation_input_tokens=cache_creation_input_tokens,
reasoning_tokens=reasoning_tokens,
): name
for name, fn in network_tasks.items()
}
Expand All @@ -84,7 +123,15 @@ def _fetch_all(


def _compute(
source_name: str, fetch_fn, model: str, input_tokens: int, output_tokens: int
source_name: str,
fetch_fn,
model: str,
input_tokens: int,
output_tokens: int,
*,
cached_input_tokens: int = 0,
cache_creation_input_tokens: int = 0,
reasoning_tokens: int = 0,
) -> SourceCost:
try:
prices = fetch_fn()
Expand All @@ -101,13 +148,44 @@ def _compute(
price_per_million_output=None,
error=f"Model '{model}' not found",
)
cost = pricing["prompt"] * input_tokens + pricing["completion"] * output_tokens
return SourceCost(

prompt_rate = pricing["prompt"]
completion_rate = pricing["completion"]
cache_read_rate = pricing.get("cache_read", prompt_rate)
cache_creation_rate = pricing.get("cache_creation", prompt_rate)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[CORRECTNESS] Reported reasoning token count can differ from billed count.
Why: Billing clamps with effective_reasoning = min(reasoning_tokens, output_tokens) but CostResult.reasoning_tokens keeps the original value, so CLI output can over-report reasoning usage.
Suggestion: Store/report the clamped reasoning value (or include both raw and billed values with clear labels) so displayed usage matches billed usage.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically valid observation, but I'd push back on the fix. The clamping (min(reasoning_tokens, output_tokens)) is a defensive guard, not an expected case. If reasoning > output, the caller passed bad data. Reporting the raw value in CostResult is more useful for debugging — the caller can see their own input. Silently changing it would be confusing. The total_cost_usd is always correct regardless.

reasoning_rate = pricing.get("reasoning", completion_rate)

# All subset tokens are clamped so they never exceed the parent total.
effective_cached = min(cached_input_tokens, input_tokens)
effective_creation = min(
cache_creation_input_tokens, input_tokens - effective_cached
)
text_input = input_tokens - effective_cached - effective_creation

effective_reasoning = min(reasoning_tokens, output_tokens)
text_output = output_tokens - effective_reasoning

cost = (
text_input * prompt_rate
+ effective_cached * cache_read_rate
+ effective_creation * cache_creation_rate
+ text_output * completion_rate
+ effective_reasoning * reasoning_rate
)

result = SourceCost(
source=source_name,
total_cost_usd=cost,
price_per_million_input=pricing["prompt"] * 1_000_000,
price_per_million_output=pricing["completion"] * 1_000_000,
price_per_million_input=prompt_rate * 1_000_000,
price_per_million_output=completion_rate * 1_000_000,
)
if "cache_read" in pricing:
result.price_per_million_cache_read = cache_read_rate * 1_000_000
if "cache_creation" in pricing:
result.price_per_million_cache_creation = cache_creation_rate * 1_000_000
if "reasoning" in pricing:
result.price_per_million_reasoning = reasoning_rate * 1_000_000
return result
except Exception as e:
return SourceCost(
source=source_name,
Expand Down
52 changes: 47 additions & 5 deletions modelcost/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,50 @@ def main() -> None:
type=click.Choice(VALID_SOURCES),
help="Pricing source.",
)
@click.option(
"--cached-input-tokens",
default=0,
show_default=False,
type=int,
help="Tokens served from cache (charged at cache-read rate).",
)
@click.option(
"--cache-creation-input-tokens",
default=0,
show_default=False,
type=int,
help="Tokens written to cache for the first time.",
)
@click.option(
"--reasoning-tokens",
default=0,
show_default=False,
type=int,
help="Thinking/reasoning output tokens (subset of output_tokens).",
)
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def cost_cmd(
model: str, input_tokens: int, output_tokens: int, source: str, as_json: bool
model: str,
input_tokens: int,
output_tokens: int,
source: str,
cached_input_tokens: int,
cache_creation_input_tokens: int,
reasoning_tokens: int,
as_json: bool,
) -> None:
"""Calculate the cost for MODEL with INPUT_TOKENS and OUTPUT_TOKENS."""
# ── Calculation mode ──────────────────────────────────────────────
try:
result = calculate_cost(model, input_tokens, output_tokens, source=source)
result = calculate_cost(
model,
input_tokens,
output_tokens,
cached_input_tokens=cached_input_tokens,
cache_creation_input_tokens=cache_creation_input_tokens,
reasoning_tokens=reasoning_tokens,
source=source,
)
except Exception as e:
click.echo(f"Error: {e}", err=True)
sys.exit(1)
Expand All @@ -57,9 +93,15 @@ def cost_cmd(
click.echo(f"unavailable — {s.error}", err=True)
sys.exit(1)
else:
click.echo(
f"Model: {result.model} ({result.input_tokens} in / {result.output_tokens} out)\n"
)
parts = [f"{result.input_tokens} in", f"{result.output_tokens} out"]
if result.cached_input_tokens:
parts.append(f"{result.cached_input_tokens} cached")
if result.cache_creation_input_tokens:
parts.append(f"{result.cache_creation_input_tokens} cache-create")
if result.reasoning_tokens:
parts.append(f"{result.reasoning_tokens} reasoning")
header = " / ".join(parts)
click.echo(f"Model: {result.model} ({header})\n")
for s in result.sources:
if s.available:
click.echo(f" [{s.source:<12}] ${s.total_cost_usd:.6f} USD")
Expand Down
Loading
Loading