"""Schema compression token impact (Sandbox E).

Reads MCP tool definitions from /workloads/mcp-tool-defs-30.jsonl, computes
input-token count BEFORE compression, applies the canonical schema-compression
recipe (strip descriptions, shorten param names, hide optional params), and
computes input-token count AFTER. Reports median pct reduction across the
30 tool definitions.

Pure measurement — no model invocation. The secondary metric (tool-call
accuracy delta) is intentionally OUT of scope here; that requires a model
and lives in a future paired sandbox.

Output: outputs.json with primary_value = pct_reduction_median.
"""

from __future__ import annotations

import json
import os
import statistics
import sys
import time
from pathlib import Path

# ----------------------------------------------------------------------
# Tokenizer — use cl100k_base (the OpenAI GPT-4 / GPT-3.5 encoding; only a
# proxy for other model families) via tiktoken if available; fall back to
# a deterministic heuristic so the sandbox runs without dependencies in
# degraded mode.
# ----------------------------------------------------------------------

try:
    import tiktoken  # type: ignore

    _TOKENIZER = tiktoken.get_encoding("cl100k_base")
except Exception as exc:  # noqa: BLE001 - any load failure must degrade, not crash
    # ImportError means tiktoken is not installed. Any other exception means
    # it is installed but could not load the encoding (presumably because the
    # vocabulary file could not be fetched, e.g. in an offline sandbox).
    if not isinstance(exc, ImportError):
        print(
            f"WARNING: tiktoken could not load cl100k_base ({exc!r}); "
            "using char-div-4 fallback",
            file=sys.stderr,
        )
    _TOKENIZER = None

    # Deterministic fallback: 4 chars per token (cl100k_base average).
    # Conservative enough for relative-comparison purposes since we apply
    # the same heuristic to BOTH sides of the diff.
    def count_tokens(text: str) -> int:
        """Approximate the token count of *text* as len // 4 (minimum 1)."""
        return max(1, len(text) // 4)

    TOKENIZER_NAME = "char-div-4-fallback"
else:

    def count_tokens(text: str) -> int:
        """Return the number of cl100k_base tokens in *text*."""
        # disallowed_special=() makes tiktoken encode special-token strings
        # as ordinary text instead of raising ValueError when a tool
        # description happens to contain one.
        return len(_TOKENIZER.encode(text, disallowed_special=()))

    TOKENIZER_NAME = "cl100k_base"


# ----------------------------------------------------------------------
# Compression recipe (per spec v0.2 row 21)
# ----------------------------------------------------------------------

_PARAM_NAME_MAP = {
    # Common verbose names → 1-3 char abbreviations. The model sees the
    # abbreviation; OCM's MCP layer keeps the original-name mapping
    # internally so tool dispatch still works.
    "encoding": "e",
    "max_bytes": "mb",
    "max_results": "mr",
    "include_hidden": "ih",
    "follow_symlinks": "fs",
    "permanent": "p",
    "respect_gitignore": "rg",
    "follow_redirects": "fr",
    "viewport_width": "vw",
    "viewport_height": "vh",
    "wait_seconds": "ws",
    "full_page": "fp",
    "fixed_strings": "f",
    "context_lines": "cl",
    "case_insensitive": "ci",
    "language": "l",
    "fix": "fx",
    "filter": "ft",
    "verbose": "v",
    "check_only": "co",
    "calendar_id": "cid",
    "apply_to": "at",
    "duration_minutes": "dm",
    "window_start": "ws_",
    "window_end": "we_",
    "attendees": "at_",
    "cc": "cc",
    "bcc": "bcc",
    "reply_to": "rt",
    "since": "sn",
    "unread_only": "uo",
    "include_html": "ih_",
    "email_id": "eid",
    "email_ids": "eids",
    "timeout_seconds": "ts",
    "headers": "h",
    "overwrite": "o",
    "create_parents": "cp",
    "new_window": "nw",
    "urgency": "u",
}


def _resolve_short_names(names: list[str]) -> dict[str, str]:
    """Map each name to its abbreviation, keeping the full name on a collision."""
    # Names that are not abbreviated occupy their own spelling up front, so an
    # abbreviation can never shadow a parameter literally named e.g. "e".
    taken = {n for n in names if _PARAM_NAME_MAP.get(n, n) == n}
    resolved: dict[str, str] = {}
    for name in names:
        if name in resolved:
            continue  # duplicate entry in "required"
        short = _PARAM_NAME_MAP.get(name, name)
        if short != name and short in taken:
            short = name
        taken.add(short)
        resolved[name] = short
    return resolved


def compress_tool(tool: dict) -> dict:
    """Apply the canonical compression recipe to one MCP tool definition.

    Steps:
      1. Strip top-level tool description (keep name)
      2. Strip per-parameter descriptions
      3. Shorten parameter names per _PARAM_NAME_MAP
      4. Hide optional parameters entirely (model only sees required ones)
      5. Drop default values from the schema (the runtime applies them)

    The compressed shape is still a valid MCP tool definition; the original-
    parameter-name mapping is reconstructed by OCM's MCP layer from a
    side-table when the model selects a tool to call. That layer is out
    of scope for this benchmark — we only measure the token-budget impact.

    Args:
        tool: Tool definition with a "name" and an optional "inputSchema"
            holding "properties" and "required".

    Returns:
        A new dict with "name" and a reduced "inputSchema"; *tool* is not
        modified.

    Raises:
        KeyError: If *tool* has no "name".
    """
    name = tool["name"]
    schema = tool.get("inputSchema") or {}
    properties = schema.get("properties") or {}
    required_names = list(schema.get("required") or [])
    # Resolve abbreviations once so "properties" and "required" always agree
    # and two parameters can never collapse onto the same key.
    short_names = _resolve_short_names(required_names)

    compressed_props: dict[str, dict] = {}
    for param_name, param_def in properties.items():
        if param_name not in short_names:
            continue  # hide optional
        entry = {"type": param_def.get("type", "string")}
        # Preserve nested 'items' for arrays (model needs to know element
        # type). Non-dict 'items' (tuple/boolean schema forms) are dropped.
        items = param_def.get("items")
        if isinstance(items, dict):
            entry["items"] = {"type": items.get("type", "string")}
        compressed_props[short_names[param_name]] = entry

    return {
        "name": name,
        "inputSchema": {
            "type": "object",
            "properties": compressed_props,
            "required": [short_names[p] for p in required_names],
        },
    }


def serialize_for_model(tool: dict) -> str:
    """Serialize a tool definition the way it'd be embedded in a prompt."""
    return json.dumps(tool, ensure_ascii=False, separators=(",", ":"))


# ----------------------------------------------------------------------
# Bench entry point
# ----------------------------------------------------------------------

_WORKLOAD_FILENAME = "mcp-tool-defs-30.jsonl"


def _repo_workload_path() -> Path | None:
    """Return the in-repo workload path, or None if this file sits too shallow."""
    ancestors = Path(__file__).resolve().parents
    # In the repo this file lives four directories below bench/, so
    # parents[3] is bench/. Mounted at a shallow path such as /work/bench.py
    # there is no parents[3], and indexing it would raise IndexError.
    if len(ancestors) < 4:
        return None
    return ancestors[3] / "workloads" / _WORKLOAD_FILENAME


def _load_tools(path: Path) -> list[dict]:
    """Parse one JSON tool definition per non-blank line of *path*.

    Raises:
        ValueError: If a line is not valid JSON; the message names the line.
    """
    tools: list[dict] = []
    with path.open(encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                tools.append(json.loads(line))
            except json.JSONDecodeError as err:
                raise ValueError(f"{path}:{lineno}: invalid JSON ({err.msg})") from err
    return tools


def main() -> int:
    """Run the benchmark and write outputs.json to the working directory.

    The workload is read from $WORKLOAD_PATH (default
    /workloads/mcp-tool-defs-30.jsonl), falling back to the in-repo copy for
    local development.

    Returns:
        0 on success; 2 if the workload is missing, malformed, or empty.
    """
    workload_path = Path(os.environ.get("WORKLOAD_PATH", f"/workloads/{_WORKLOAD_FILENAME}"))
    if not workload_path.exists():
        # Local dev: workload sits in repo at bench/workloads/
        repo_workload = _repo_workload_path()
        if repo_workload is not None and repo_workload.exists():
            workload_path = repo_workload
        else:
            tried = (
                f"{workload_path}"
                if repo_workload is None
                else f"{workload_path} or {repo_workload}"
            )
            # Diagnostics go to stderr so stdout carries only the JSON result.
            print(f"ERROR: workload not found at {tried}", file=sys.stderr)
            return 2

    try:
        tools = _load_tools(workload_path)
    except ValueError as err:
        print(f"ERROR: {err}", file=sys.stderr)
        return 2

    if not tools:
        print("ERROR: workload is empty", file=sys.stderr)
        return 2

    pct_reductions: list[float] = []
    before_tokens_per_tool: list[int] = []
    after_tokens_per_tool: list[int] = []
    started = time.monotonic()

    for tool in tools:
        before = count_tokens(serialize_for_model(tool))
        after = count_tokens(serialize_for_model(compress_tool(tool)))
        pct = (1 - after / before) * 100 if before > 0 else 0.0
        pct_reductions.append(pct)
        before_tokens_per_tool.append(before)
        after_tokens_per_tool.append(after)

    elapsed = time.monotonic() - started

    output = {
        "primary_value": statistics.median(pct_reductions),
        "duration_seconds": elapsed,
        "n_tools": len(tools),
        "tokenizer": TOKENIZER_NAME,
        "before_tokens_median": statistics.median(before_tokens_per_tool),
        "after_tokens_median": statistics.median(after_tokens_per_tool),
        # Per-tool detail for debugging / report-generation
        "per_tool_pct_reduction": pct_reductions,
    }

    rendered = json.dumps(output, indent=2)
    Path("outputs.json").write_text(rendered, encoding="utf-8")
    print(rendered)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
(the GPT-4 encoding; only a proxy for Claude). + # With `&&` below, a failed pip install aborts the run; bench.py's + # char-div-4 fallback is for runs where tiktoken is unavailable (local dev). + command: + - sh + - -c + - "pip install --quiet tiktoken && python bench.py" diff --git a/bench/isolation/frontier-comparison/sandbox-e-schema-compression/expected.json b/bench/isolation/frontier-comparison/sandbox-e-schema-compression/expected.json index d61e8ca..044e351 100644 --- a/bench/isolation/frontier-comparison/sandbox-e-schema-compression/expected.json +++ b/bench/isolation/frontier-comparison/sandbox-e-schema-compression/expected.json @@ -1,25 +1,15 @@ { "hypothesis_id": "schema-compression-token-impact", - "claim": "Schema compression on MCP tool definitions (strip descriptions, shorten param names, hide optional params) reduces per-request input tokens by 30-60% on a representative tool-rich workload (10+ tools, multi-turn chat) with no measurable accuracy loss on tool-call selection.", - "metric": "input_tokens_pct_reduction", + "claim": "Schema compression on MCP tool definitions (strip descriptions, shorten param names, hide optional params) reduces serialized-token count by at least 30% (median across 30 representative tools spanning filesystem, web, code, calendar, email, system categories). Pure measurement — secondary metric (tool-call accuracy delta) is OUT of scope and tracked separately.", + "metric": "input_tokens_pct_reduction_median", "thresholds": { "confirm_at_least": 30.0, "refute_below": 15.0 }, - "secondary_metric": "tool_call_accuracy_delta_pp", - "secondary_thresholds": { - "confirm_at_most": 2.0, - "refute_above": 5.0 - }, - "workload": "mcp-tool-rich-multiturn.jsonl", + "workload": "mcp-tool-defs-30.jsonl", "source_for_claim": "Spec v0.2 row 21: schema compression default-on for MCP tool schemas.
Cited 30-60% token reduction.", - "comparison_anchor": "frontier-comparison/sandbox-a-raw-vllm-baseline", - "decision_rule": "If CONFIRMED on tokens AND secondary stays under +2pp accuracy hit, schema compression stays the v1 default. If REFUTED on tokens, compression algorithm needs revisit. If REFUTED on accuracy delta (>5pp loss), the algorithm is too aggressive and needs the reverse — preserve more.", - "timeout_seconds": 1800, - "status": "INACTIVE", - "blocked_on": [ - "MCP tool-rich workload not yet curated (need 10+ tools, multi-turn chat fixtures)", - "Tool-call accuracy harness not yet wired into bench/bench/metrics.py", - "Sandbox-A baseline must run first to provide comparison anchor" - ] + "comparison_anchor": "raw-tool-defs-uncompressed (the same 30 tools serialized verbatim)", + "decision_rule": "If CONFIRMED, schema compression stays the v1 default. If REFUTED on tokens, the recipe needs revisiting (e.g. more aggressive name shortening or schema flattening). Accuracy-impact verification happens in a separate model-dependent sandbox; if THAT later REFUTES on accuracy regression, the recipe pulls back even if token reduction is fine.", + "timeout_seconds": 300, + "status": "ACTIVE" } diff --git a/bench/workloads/_generate_mcp_tool_defs.py b/bench/workloads/_generate_mcp_tool_defs.py new file mode 100644 index 0000000..5005e7f --- /dev/null +++ b/bench/workloads/_generate_mcp_tool_defs.py @@ -0,0 +1,433 @@ +"""Generate a representative workload of MCP tool definitions. + +Run from repo root: + python bench/workloads/_generate_mcp_tool_defs.py > bench/workloads/mcp-tool-defs-30.jsonl + +Each line is one tool definition matching the MCP spec shape: + {"name": str, "description": str, "inputSchema": {type, properties, required, ...}} + +Tools span 6 representative categories (filesystem, web, code, calendar, +email, system) so the schema-compression benchmark exercises typical +real-world verbosity without leaning on any one domain. 
def emit(tools: list[dict], stream=None) -> None:
    """Write each tool definition as one JSON object per line (JSONL).

    Args:
        tools: Tool definitions to serialize, in output order.
        stream: Writable text stream to receive the lines. Defaults to
            ``sys.stdout`` (looked up at call time), which matches the
            original print-based behavior.
    """
    out = sys.stdout if stream is None else stream
    for tool in tools:
        # ensure_ascii=False keeps non-ASCII characters (e.g. the em dash in
        # some descriptions) verbatim instead of \u-escaping them.
        out.write(json.dumps(tool, ensure_ascii=False) + "\n")
Excludes hidden entries unless include_hidden=true. Returns name + type (file/dir/symlink) for each entry.", + "inputSchema": { + "type": "object", + "properties": { + "path": {"type": "string", "description": "Absolute path to the directory."}, + "include_hidden": {"type": "boolean", "description": "If true, include entries starting with a dot.", "default": False}, + "follow_symlinks": {"type": "boolean", "description": "If true, classify symlink targets rather than report 'symlink'.", "default": False}, + }, + "required": ["path"], + }, + }, + { + "name": "fs_delete_file", + "description": "Delete a single file. Refuses to act on directories. Soft-deletes to OS trash by default; permanent=true bypasses the trash.", + "inputSchema": { + "type": "object", + "properties": { + "path": {"type": "string", "description": "Absolute path to delete."}, + "permanent": {"type": "boolean", "description": "If true, bypass the OS trash and permanently remove.", "default": False}, + }, + "required": ["path"], + }, + }, + { + "name": "fs_search_files", + "description": "Recursively search for files matching a glob pattern under a root directory. Returns matching paths sorted by modification time descending. Skips directories listed in .gitignore by default.", + "inputSchema": { + "type": "object", + "properties": { + "root": {"type": "string", "description": "Absolute root directory to search."}, + "pattern": {"type": "string", "description": "Glob pattern to match filenames (e.g. '*.py' or '**/*.toml')."}, + "max_results": {"type": "integer", "description": "Cap on number of matches returned.", "default": 100}, + "respect_gitignore": {"type": "boolean", "description": "If true, skip paths excluded by .gitignore.", "default": True}, + }, + "required": ["root", "pattern"], + }, + }, + # --- web (5) --- + { + "name": "web_fetch", + "description": "Perform an HTTP(S) GET request and return the body. Follows redirects up to 5 hops. 
Decodes text by content-type charset; raw bytes for binary content-types.", + "inputSchema": { + "type": "object", + "properties": { + "url": {"type": "string", "description": "Fully-qualified URL including scheme."}, + "timeout_seconds": {"type": "number", "description": "Request timeout in seconds.", "default": 30}, + "headers": {"type": "object", "description": "Additional headers to attach.", "default": {}}, + "follow_redirects": {"type": "boolean", "description": "If true, follow 30x responses.", "default": True}, + }, + "required": ["url"], + }, + }, + { + "name": "web_search", + "description": "Run a search query against a configured search backend (DuckDuckGo by default; Brave / Kagi if API key is set). Returns top results with title, URL, snippet.", + "inputSchema": { + "type": "object", + "properties": { + "query": {"type": "string", "description": "Search query, plain English or operators."}, + "max_results": {"type": "integer", "description": "Cap on results returned.", "default": 10}, + "region": {"type": "string", "description": "Region hint (e.g. 'us-en', 'uk-en'). Defaults to user locale.", "default": "us-en"}, + }, + "required": ["query"], + }, + }, + { + "name": "web_screenshot", + "description": "Render a URL in a headless browser and return a screenshot. Useful for capturing dynamic SPA content. 
Costs more than web_fetch — prefer fetch unless you need rendered output.", + "inputSchema": { + "type": "object", + "properties": { + "url": {"type": "string", "description": "Target URL."}, + "viewport_width": {"type": "integer", "description": "Viewport width in pixels.", "default": 1280}, + "viewport_height": {"type": "integer", "description": "Viewport height in pixels.", "default": 720}, + "wait_seconds": {"type": "number", "description": "How long to wait after page load before capture.", "default": 2.0}, + "full_page": {"type": "boolean", "description": "If true, capture full scrollable height.", "default": False}, + }, + "required": ["url"], + }, + }, + { + "name": "web_post_json", + "description": "POST JSON to a URL and return the parsed response. Convenient for API calls. Adds Content-Type: application/json automatically.", + "inputSchema": { + "type": "object", + "properties": { + "url": {"type": "string", "description": "Target URL."}, + "body": {"type": "object", "description": "JSON payload to send."}, + "headers": {"type": "object", "description": "Additional headers (e.g. authentication).", "default": {}}, + "timeout_seconds": {"type": "number", "description": "Request timeout.", "default": 30}, + }, + "required": ["url", "body"], + }, + }, + { + "name": "web_download", + "description": "Download a URL to disk. Streams large files without buffering. Returns final path + bytes written.", + "inputSchema": { + "type": "object", + "properties": { + "url": {"type": "string", "description": "Source URL."}, + "dest_path": {"type": "string", "description": "Absolute destination path. 
Parent dirs created if missing."}, + "max_bytes": {"type": "integer", "description": "Cap on total bytes downloaded.", "default": 104857600}, + "overwrite": {"type": "boolean", "description": "If true, overwrite an existing dest_path.", "default": False}, + }, + "required": ["url", "dest_path"], + }, + }, + # --- code (5) --- + { + "name": "code_grep", + "description": "Run ripgrep over a directory tree and return matching lines with context. Honors .gitignore. Supports regex patterns; literal mode via fixed_strings=true.", + "inputSchema": { + "type": "object", + "properties": { + "pattern": {"type": "string", "description": "Search pattern (regex by default)."}, + "root": {"type": "string", "description": "Directory to search."}, + "fixed_strings": {"type": "boolean", "description": "If true, treat pattern as literal text.", "default": False}, + "context_lines": {"type": "integer", "description": "Lines of context around each match.", "default": 0}, + "case_insensitive": {"type": "boolean", "description": "If true, ignore case.", "default": False}, + }, + "required": ["pattern", "root"], + }, + }, + { + "name": "code_lint", + "description": "Run a configured linter (ruff for Python, eslint for TS/JS, clippy for Rust) on a path and return findings. Uses repo's existing config files (pyproject.toml, .eslintrc, etc.).", + "inputSchema": { + "type": "object", + "properties": { + "path": {"type": "string", "description": "File or directory to lint."}, + "language": {"type": "string", "description": "Override auto-detected language. One of: python, typescript, javascript, rust.", "default": "auto"}, + "fix": {"type": "boolean", "description": "If true, attempt automatic fixes where the linter supports it.", "default": False}, + }, + "required": ["path"], + }, + }, + { + "name": "code_test", + "description": "Run the project's test suite. Auto-detects test runner (pytest, jest, cargo test) from the workspace. 
Returns pass/fail counts plus failing-test details.", + "inputSchema": { + "type": "object", + "properties": { + "workspace": {"type": "string", "description": "Path to the project root."}, + "filter": {"type": "string", "description": "Optional test name pattern to limit which tests run.", "default": ""}, + "verbose": {"type": "boolean", "description": "If true, include passing-test details in output.", "default": False}, + }, + "required": ["workspace"], + }, + }, + { + "name": "code_format", + "description": "Auto-format source code in place using the project's configured formatter (black/ruff for Python, prettier for TS/JS, rustfmt for Rust).", + "inputSchema": { + "type": "object", + "properties": { + "path": {"type": "string", "description": "File or directory to format."}, + "check_only": {"type": "boolean", "description": "If true, report what would change without modifying files.", "default": False}, + }, + "required": ["path"], + }, + }, + { + "name": "code_diff", + "description": "Compute a unified diff between two files or two paths. Output is git-style unified diff. Useful for showing the user what a tool changed.", + "inputSchema": { + "type": "object", + "properties": { + "left": {"type": "string", "description": "Path to the 'before' content."}, + "right": {"type": "string", "description": "Path to the 'after' content."}, + "context_lines": {"type": "integer", "description": "Lines of unchanged context around each hunk.", "default": 3}, + }, + "required": ["left", "right"], + }, + }, + # --- calendar (5) --- + { + "name": "calendar_create_event", + "description": "Create a new calendar event in the user's primary calendar. 
Times are interpreted in the calendar's default timezone unless ISO 8601 with offset is provided.", + "inputSchema": { + "type": "object", + "properties": { + "title": {"type": "string", "description": "Event title."}, + "start": {"type": "string", "description": "Start time, ISO 8601."}, + "end": {"type": "string", "description": "End time, ISO 8601."}, + "description": {"type": "string", "description": "Body text shown in calendar app.", "default": ""}, + "attendees": {"type": "array", "description": "List of email addresses to invite.", "default": [], "items": {"type": "string"}}, + "location": {"type": "string", "description": "Free-text location.", "default": ""}, + }, + "required": ["title", "start", "end"], + }, + }, + { + "name": "calendar_list_events", + "description": "List events between two timestamps. Returns events sorted by start time ascending. Includes recurrence-expanded instances.", + "inputSchema": { + "type": "object", + "properties": { + "start": {"type": "string", "description": "Range start, ISO 8601."}, + "end": {"type": "string", "description": "Range end, ISO 8601."}, + "calendar_id": {"type": "string", "description": "Specific calendar to query. Defaults to user's primary calendar.", "default": "primary"}, + }, + "required": ["start", "end"], + }, + }, + { + "name": "calendar_update_event", + "description": "Update fields on an existing calendar event. Only provided fields are changed. 
Edits to recurring events affect only the specified instance unless apply_to=series.", + "inputSchema": { + "type": "object", + "properties": { + "event_id": {"type": "string", "description": "Event ID returned from list_events or create_event."}, + "title": {"type": "string", "description": "New title.", "default": ""}, + "start": {"type": "string", "description": "New start.", "default": ""}, + "end": {"type": "string", "description": "New end.", "default": ""}, + "apply_to": {"type": "string", "description": "For recurring events: 'instance' or 'series'.", "default": "instance"}, + }, + "required": ["event_id"], + }, + }, + { + "name": "calendar_delete_event", + "description": "Delete an event. For recurring events, deletes only the specified instance unless apply_to=series.", + "inputSchema": { + "type": "object", + "properties": { + "event_id": {"type": "string", "description": "Event ID."}, + "apply_to": {"type": "string", "description": "'instance' or 'series'.", "default": "instance"}, + }, + "required": ["event_id"], + }, + }, + { + "name": "calendar_find_free_slot", + "description": "Find a free time slot of a given duration within a window, respecting attendee availability. Used to schedule meetings without overlap.", + "inputSchema": { + "type": "object", + "properties": { + "duration_minutes": {"type": "integer", "description": "Required slot duration."}, + "window_start": {"type": "string", "description": "Earliest acceptable start, ISO 8601."}, + "window_end": {"type": "string", "description": "Latest acceptable end, ISO 8601."}, + "attendees": {"type": "array", "description": "Email addresses whose calendars must also be free.", "default": [], "items": {"type": "string"}}, + }, + "required": ["duration_minutes", "window_start", "window_end"], + }, + }, + # --- email (5) --- + { + "name": "email_send", + "description": "Send an email via the configured SMTP backend. Body is markdown by default; rendered to HTML for the recipient. 
Reply-To defaults to the user's primary address.", + "inputSchema": { + "type": "object", + "properties": { + "to": {"type": "array", "description": "List of recipient email addresses.", "items": {"type": "string"}}, + "subject": {"type": "string", "description": "Email subject."}, + "body_markdown": {"type": "string", "description": "Body in markdown."}, + "cc": {"type": "array", "description": "CC recipients.", "default": [], "items": {"type": "string"}}, + "bcc": {"type": "array", "description": "BCC recipients.", "default": [], "items": {"type": "string"}}, + "reply_to": {"type": "string", "description": "Reply-To header override.", "default": ""}, + }, + "required": ["to", "subject", "body_markdown"], + }, + }, + { + "name": "email_list_inbox", + "description": "List recent emails from the user's inbox. Returns metadata (from, subject, date, snippet); use email_fetch to get full body.", + "inputSchema": { + "type": "object", + "properties": { + "max_results": {"type": "integer", "description": "Cap on emails returned.", "default": 50}, + "since": {"type": "string", "description": "Optional ISO 8601 date — only emails received after this.", "default": ""}, + "unread_only": {"type": "boolean", "description": "If true, return only unread emails.", "default": False}, + }, + "required": [], + }, + }, + { + "name": "email_fetch", + "description": "Fetch a specific email by ID. 
Returns full headers + body (text + html parts) + attachment metadata.", + "inputSchema": { + "type": "object", + "properties": { + "email_id": {"type": "string", "description": "Email ID from list_inbox or search."}, + "include_html": {"type": "boolean", "description": "If true, include the HTML body part.", "default": True}, + }, + "required": ["email_id"], + }, + }, + { + "name": "email_search", + "description": "Search the user's mailbox using the backend's native search syntax (Gmail operators, IMAP SEARCH, etc.).", + "inputSchema": { + "type": "object", + "properties": { + "query": {"type": "string", "description": "Search query, backend-specific syntax."}, + "max_results": {"type": "integer", "description": "Cap on results.", "default": 50}, + }, + "required": ["query"], + }, + }, + { + "name": "email_archive", + "description": "Archive (not delete) one or more emails. Removes them from the inbox view but keeps them searchable.", + "inputSchema": { + "type": "object", + "properties": { + "email_ids": {"type": "array", "description": "Email IDs to archive.", "items": {"type": "string"}}, + }, + "required": ["email_ids"], + }, + }, + # --- system (5) --- + { + "name": "system_run_shell", + "description": "Execute a shell command in a controlled sandbox. Returns stdout + stderr + exit code. Subject to a configurable timeout. Refuses commands matching the deny-list (rm -rf /, fork bombs, etc.).", + "inputSchema": { + "type": "object", + "properties": { + "command": {"type": "string", "description": "Shell command to run."}, + "cwd": {"type": "string", "description": "Working directory.", "default": ""}, + "timeout_seconds": {"type": "number", "description": "Cap on execution time.", "default": 60}, + "env": {"type": "object", "description": "Extra environment variables.", "default": {}}, + }, + "required": ["command"], + }, + }, + { + "name": "system_get_clipboard", + "description": "Return the current clipboard contents as text. 
Errors if the clipboard contains non-text data.", + "inputSchema": { + "type": "object", + "properties": {}, + "required": [], + }, + }, + { + "name": "system_set_clipboard", + "description": "Set the system clipboard to the given text content.", + "inputSchema": { + "type": "object", + "properties": { + "content": {"type": "string", "description": "Text to put on the clipboard."}, + }, + "required": ["content"], + }, + }, + { + "name": "system_open_url", + "description": "Open a URL in the user's default browser.", + "inputSchema": { + "type": "object", + "properties": { + "url": {"type": "string", "description": "Target URL."}, + "new_window": {"type": "boolean", "description": "If true, open in a new browser window.", "default": False}, + }, + "required": ["url"], + }, + }, + { + "name": "system_notification", + "description": "Post a desktop notification to the OS notification center.", + "inputSchema": { + "type": "object", + "properties": { + "title": {"type": "string", "description": "Notification title."}, + "body": {"type": "string", "description": "Notification body."}, + "urgency": {"type": "string", "description": "One of low, normal, critical.", "default": "normal"}, + }, + "required": ["title", "body"], + }, + }, +] + + +if __name__ == "__main__": + sys.stdout.reconfigure(encoding="utf-8") + emit(TOOLS) diff --git a/bench/workloads/mcp-tool-defs-30.jsonl b/bench/workloads/mcp-tool-defs-30.jsonl new file mode 100644 index 0000000..2354a62 --- /dev/null +++ b/bench/workloads/mcp-tool-defs-30.jsonl @@ -0,0 +1,30 @@ +{"name": "fs_read_file", "description": "Read the entire contents of a file from the local filesystem. Returns text-decoded content for text files, base64-encoded content for binary files. Errors if the file doesn't exist or the user lacks read permission.", "inputSchema": {"type": "object", "properties": {"path": {"type": "string", "description": "Absolute path to the file. 
Relative paths are rejected to avoid ambiguity."}, "encoding": {"type": "string", "description": "Text encoding to decode the file. Defaults to utf-8.", "default": "utf-8"}, "max_bytes": {"type": "integer", "description": "Optional cap on bytes read. If the file exceeds this, the read is truncated and a warning is included in the response.", "default": 1048576}}, "required": ["path"]}} +{"name": "fs_write_file", "description": "Write content to a file. Creates parent directories if missing. Overwrites existing files unless append=true. Atomic via tempfile + rename on POSIX systems.", "inputSchema": {"type": "object", "properties": {"path": {"type": "string", "description": "Absolute path to write. Must not be a directory."}, "content": {"type": "string", "description": "Content to write. Encoded as utf-8 unless encoding is overridden."}, "encoding": {"type": "string", "description": "Text encoding. Defaults to utf-8.", "default": "utf-8"}, "append": {"type": "boolean", "description": "If true, append rather than overwrite.", "default": false}, "create_parents": {"type": "boolean", "description": "If true, create missing parent directories.", "default": true}}, "required": ["path", "content"]}} +{"name": "fs_list_directory", "description": "List immediate children of a directory. Excludes hidden entries unless include_hidden=true. Returns name + type (file/dir/symlink) for each entry.", "inputSchema": {"type": "object", "properties": {"path": {"type": "string", "description": "Absolute path to the directory."}, "include_hidden": {"type": "boolean", "description": "If true, include entries starting with a dot.", "default": false}, "follow_symlinks": {"type": "boolean", "description": "If true, classify symlink targets rather than report 'symlink'.", "default": false}}, "required": ["path"]}} +{"name": "fs_delete_file", "description": "Delete a single file. Refuses to act on directories. 
Soft-deletes to OS trash by default; permanent=true bypasses the trash.", "inputSchema": {"type": "object", "properties": {"path": {"type": "string", "description": "Absolute path to delete."}, "permanent": {"type": "boolean", "description": "If true, bypass the OS trash and permanently remove.", "default": false}}, "required": ["path"]}} +{"name": "fs_search_files", "description": "Recursively search for files matching a glob pattern under a root directory. Returns matching paths sorted by modification time descending. Skips directories listed in .gitignore by default.", "inputSchema": {"type": "object", "properties": {"root": {"type": "string", "description": "Absolute root directory to search."}, "pattern": {"type": "string", "description": "Glob pattern to match filenames (e.g. '*.py' or '**/*.toml')."}, "max_results": {"type": "integer", "description": "Cap on number of matches returned.", "default": 100}, "respect_gitignore": {"type": "boolean", "description": "If true, skip paths excluded by .gitignore.", "default": true}}, "required": ["root", "pattern"]}} +{"name": "web_fetch", "description": "Perform an HTTP(S) GET request and return the body. Follows redirects up to 5 hops. Decodes text by content-type charset; raw bytes for binary content-types.", "inputSchema": {"type": "object", "properties": {"url": {"type": "string", "description": "Fully-qualified URL including scheme."}, "timeout_seconds": {"type": "number", "description": "Request timeout in seconds.", "default": 30}, "headers": {"type": "object", "description": "Additional headers to attach.", "default": {}}, "follow_redirects": {"type": "boolean", "description": "If true, follow 30x responses.", "default": true}}, "required": ["url"]}} +{"name": "web_search", "description": "Run a search query against a configured search backend (DuckDuckGo by default; Brave / Kagi if API key is set). 
Returns top results with title, URL, snippet.", "inputSchema": {"type": "object", "properties": {"query": {"type": "string", "description": "Search query, plain English or operators."}, "max_results": {"type": "integer", "description": "Cap on results returned.", "default": 10}, "region": {"type": "string", "description": "Region hint (e.g. 'us-en', 'uk-en'). Defaults to user locale.", "default": "us-en"}}, "required": ["query"]}} +{"name": "web_screenshot", "description": "Render a URL in a headless browser and return a screenshot. Useful for capturing dynamic SPA content. Costs more than web_fetch — prefer fetch unless you need rendered output.", "inputSchema": {"type": "object", "properties": {"url": {"type": "string", "description": "Target URL."}, "viewport_width": {"type": "integer", "description": "Viewport width in pixels.", "default": 1280}, "viewport_height": {"type": "integer", "description": "Viewport height in pixels.", "default": 720}, "wait_seconds": {"type": "number", "description": "How long to wait after page load before capture.", "default": 2.0}, "full_page": {"type": "boolean", "description": "If true, capture full scrollable height.", "default": false}}, "required": ["url"]}} +{"name": "web_post_json", "description": "POST JSON to a URL and return the parsed response. Convenient for API calls. Adds Content-Type: application/json automatically.", "inputSchema": {"type": "object", "properties": {"url": {"type": "string", "description": "Target URL."}, "body": {"type": "object", "description": "JSON payload to send."}, "headers": {"type": "object", "description": "Additional headers (e.g. authentication).", "default": {}}, "timeout_seconds": {"type": "number", "description": "Request timeout.", "default": 30}}, "required": ["url", "body"]}} +{"name": "web_download", "description": "Download a URL to disk. Streams large files without buffering. 
Returns final path + bytes written.", "inputSchema": {"type": "object", "properties": {"url": {"type": "string", "description": "Source URL."}, "dest_path": {"type": "string", "description": "Absolute destination path. Parent dirs created if missing."}, "max_bytes": {"type": "integer", "description": "Cap on total bytes downloaded.", "default": 104857600}, "overwrite": {"type": "boolean", "description": "If true, overwrite an existing dest_path.", "default": false}}, "required": ["url", "dest_path"]}} +{"name": "code_grep", "description": "Run ripgrep over a directory tree and return matching lines with context. Honors .gitignore. Supports regex patterns; literal mode via fixed_strings=true.", "inputSchema": {"type": "object", "properties": {"pattern": {"type": "string", "description": "Search pattern (regex by default)."}, "root": {"type": "string", "description": "Directory to search."}, "fixed_strings": {"type": "boolean", "description": "If true, treat pattern as literal text.", "default": false}, "context_lines": {"type": "integer", "description": "Lines of context around each match.", "default": 0}, "case_insensitive": {"type": "boolean", "description": "If true, ignore case.", "default": false}}, "required": ["pattern", "root"]}} +{"name": "code_lint", "description": "Run a configured linter (ruff for Python, eslint for TS/JS, clippy for Rust) on a path and return findings. Uses repo's existing config files (pyproject.toml, .eslintrc, etc.).", "inputSchema": {"type": "object", "properties": {"path": {"type": "string", "description": "File or directory to lint."}, "language": {"type": "string", "description": "Override auto-detected language. One of: python, typescript, javascript, rust.", "default": "auto"}, "fix": {"type": "boolean", "description": "If true, attempt automatic fixes where the linter supports it.", "default": false}}, "required": ["path"]}} +{"name": "code_test", "description": "Run the project's test suite. 
Auto-detects test runner (pytest, jest, cargo test) from the workspace. Returns pass/fail counts plus failing-test details.", "inputSchema": {"type": "object", "properties": {"workspace": {"type": "string", "description": "Path to the project root."}, "filter": {"type": "string", "description": "Optional test name pattern to limit which tests run.", "default": ""}, "verbose": {"type": "boolean", "description": "If true, include passing-test details in output.", "default": false}}, "required": ["workspace"]}} +{"name": "code_format", "description": "Auto-format source code in place using the project's configured formatter (black/ruff for Python, prettier for TS/JS, rustfmt for Rust).", "inputSchema": {"type": "object", "properties": {"path": {"type": "string", "description": "File or directory to format."}, "check_only": {"type": "boolean", "description": "If true, report what would change without modifying files.", "default": false}}, "required": ["path"]}} +{"name": "code_diff", "description": "Compute a unified diff between two files or two paths. Output is git-style unified diff. Useful for showing the user what a tool changed.", "inputSchema": {"type": "object", "properties": {"left": {"type": "string", "description": "Path to the 'before' content."}, "right": {"type": "string", "description": "Path to the 'after' content."}, "context_lines": {"type": "integer", "description": "Lines of unchanged context around each hunk.", "default": 3}}, "required": ["left", "right"]}} +{"name": "calendar_create_event", "description": "Create a new calendar event in the user's primary calendar. 
Times are interpreted in the calendar's default timezone unless ISO 8601 with offset is provided.", "inputSchema": {"type": "object", "properties": {"title": {"type": "string", "description": "Event title."}, "start": {"type": "string", "description": "Start time, ISO 8601."}, "end": {"type": "string", "description": "End time, ISO 8601."}, "description": {"type": "string", "description": "Body text shown in calendar app.", "default": ""}, "attendees": {"type": "array", "description": "List of email addresses to invite.", "default": [], "items": {"type": "string"}}, "location": {"type": "string", "description": "Free-text location.", "default": ""}}, "required": ["title", "start", "end"]}} +{"name": "calendar_list_events", "description": "List events between two timestamps. Returns events sorted by start time ascending. Includes recurrence-expanded instances.", "inputSchema": {"type": "object", "properties": {"start": {"type": "string", "description": "Range start, ISO 8601."}, "end": {"type": "string", "description": "Range end, ISO 8601."}, "calendar_id": {"type": "string", "description": "Specific calendar to query. Defaults to user's primary calendar.", "default": "primary"}}, "required": ["start", "end"]}} +{"name": "calendar_update_event", "description": "Update fields on an existing calendar event. Only provided fields are changed. 
Edits to recurring events affect only the specified instance unless apply_to=series.", "inputSchema": {"type": "object", "properties": {"event_id": {"type": "string", "description": "Event ID returned from list_events or create_event."}, "title": {"type": "string", "description": "New title.", "default": ""}, "start": {"type": "string", "description": "New start.", "default": ""}, "end": {"type": "string", "description": "New end.", "default": ""}, "apply_to": {"type": "string", "description": "For recurring events: 'instance' or 'series'.", "default": "instance"}}, "required": ["event_id"]}} +{"name": "calendar_delete_event", "description": "Delete an event. For recurring events, deletes only the specified instance unless apply_to=series.", "inputSchema": {"type": "object", "properties": {"event_id": {"type": "string", "description": "Event ID."}, "apply_to": {"type": "string", "description": "'instance' or 'series'.", "default": "instance"}}, "required": ["event_id"]}} +{"name": "calendar_find_free_slot", "description": "Find a free time slot of a given duration within a window, respecting attendee availability. Used to schedule meetings without overlap.", "inputSchema": {"type": "object", "properties": {"duration_minutes": {"type": "integer", "description": "Required slot duration."}, "window_start": {"type": "string", "description": "Earliest acceptable start, ISO 8601."}, "window_end": {"type": "string", "description": "Latest acceptable end, ISO 8601."}, "attendees": {"type": "array", "description": "Email addresses whose calendars must also be free.", "default": [], "items": {"type": "string"}}}, "required": ["duration_minutes", "window_start", "window_end"]}} +{"name": "email_send", "description": "Send an email via the configured SMTP backend. Body is markdown by default; rendered to HTML for the recipient. 
Reply-To defaults to the user's primary address.", "inputSchema": {"type": "object", "properties": {"to": {"type": "array", "description": "List of recipient email addresses.", "items": {"type": "string"}}, "subject": {"type": "string", "description": "Email subject."}, "body_markdown": {"type": "string", "description": "Body in markdown."}, "cc": {"type": "array", "description": "CC recipients.", "default": [], "items": {"type": "string"}}, "bcc": {"type": "array", "description": "BCC recipients.", "default": [], "items": {"type": "string"}}, "reply_to": {"type": "string", "description": "Reply-To header override.", "default": ""}}, "required": ["to", "subject", "body_markdown"]}} +{"name": "email_list_inbox", "description": "List recent emails from the user's inbox. Returns metadata (from, subject, date, snippet); use email_fetch to get full body.", "inputSchema": {"type": "object", "properties": {"max_results": {"type": "integer", "description": "Cap on emails returned.", "default": 50}, "since": {"type": "string", "description": "Optional ISO 8601 date — only emails received after this.", "default": ""}, "unread_only": {"type": "boolean", "description": "If true, return only unread emails.", "default": false}}, "required": []}} +{"name": "email_fetch", "description": "Fetch a specific email by ID. 
Returns full headers + body (text + html parts) + attachment metadata.", "inputSchema": {"type": "object", "properties": {"email_id": {"type": "string", "description": "Email ID from list_inbox or search."}, "include_html": {"type": "boolean", "description": "If true, include the HTML body part.", "default": true}}, "required": ["email_id"]}} +{"name": "email_search", "description": "Search the user's mailbox using the backend's native search syntax (Gmail operators, IMAP SEARCH, etc.).", "inputSchema": {"type": "object", "properties": {"query": {"type": "string", "description": "Search query, backend-specific syntax."}, "max_results": {"type": "integer", "description": "Cap on results.", "default": 50}}, "required": ["query"]}} +{"name": "email_archive", "description": "Archive (not delete) one or more emails. Removes them from the inbox view but keeps them searchable.", "inputSchema": {"type": "object", "properties": {"email_ids": {"type": "array", "description": "Email IDs to archive.", "items": {"type": "string"}}}, "required": ["email_ids"]}} +{"name": "system_run_shell", "description": "Execute a shell command in a controlled sandbox. Returns stdout + stderr + exit code. Subject to a configurable timeout. Refuses commands matching the deny-list (rm -rf /, fork bombs, etc.).", "inputSchema": {"type": "object", "properties": {"command": {"type": "string", "description": "Shell command to run."}, "cwd": {"type": "string", "description": "Working directory.", "default": ""}, "timeout_seconds": {"type": "number", "description": "Cap on execution time.", "default": 60}, "env": {"type": "object", "description": "Extra environment variables.", "default": {}}}, "required": ["command"]}} +{"name": "system_get_clipboard", "description": "Return the current clipboard contents as text. 
Errors if the clipboard contains non-text data.", "inputSchema": {"type": "object", "properties": {}, "required": []}} +{"name": "system_set_clipboard", "description": "Set the system clipboard to the given text content.", "inputSchema": {"type": "object", "properties": {"content": {"type": "string", "description": "Text to put on the clipboard."}}, "required": ["content"]}} +{"name": "system_open_url", "description": "Open a URL in the user's default browser.", "inputSchema": {"type": "object", "properties": {"url": {"type": "string", "description": "Target URL."}, "new_window": {"type": "boolean", "description": "If true, open in a new browser window.", "default": false}}, "required": ["url"]}} +{"name": "system_notification", "description": "Post a desktop notification to the OS notification center.", "inputSchema": {"type": "object", "properties": {"title": {"type": "string", "description": "Notification title."}, "body": {"type": "string", "description": "Notification body."}, "urgency": {"type": "string", "description": "One of low, normal, critical.", "default": "normal"}}, "required": ["title", "body"]}}