
Commit 9326703

Test and claude committed
feat: CLI benchmark + apply commands — the two things that matter
benchmark: runs PawBench (or a manual fallback) and produces a servingcard YAML. Pluggable backend system — PawBenchBackend tries the CLI, then the Python import, and falls back to ManualBackend with interactive prompts.

apply: pulls a config from a local path, a registry shorthand (model/variant), or a URL, then generates a framework-specific launch command (vLLM, TGI). The registry fetches from GitHub raw URLs.

PawBenchResults added to the schema — quality_score, cacp_compliance, useful_token_ratio, tokens_per_turn, adaptability_score.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7a469b3 commit 9326703

File tree

7 files changed: +548 −26 lines


README.md

Lines changed: 37 additions & 20 deletions
@@ -35,7 +35,8 @@ for community sharing.
 - **Hardware-specific** -- same model, different configs for RTX 4090 vs A100 vs GB10
 - **Benchmarks required** -- configs without benchmarks are guesses, not serving cards
 - **Framework-aware** -- vLLM, TGI, SGLang, llama.cpp params in one standard
-- **One-command apply** -- `servingcard launch qwen3-coder/gb10-fp8-eagle3-spec3.yaml`
+- **One-command apply** -- `servingcard apply qwen3-coder/gb10-fp8-eagle3-spec3`
+- **Benchmark-first** -- `servingcard benchmark` runs PawBench and produces a card in one step
 - **Autoresearch-compatible** -- auto-tuning tools export directly to serving cards
 - **Community registry** -- share and discover optimized configs
 - **Transform documentation** -- model output quirks (think tags, float coercion) captured alongside the config

@@ -54,27 +55,43 @@ Serving cards are not the right tool for every situation. Be honest about scope:
 ## Quick Start
 
 ```bash
-# Clone the repo (PyPI package coming soon)
-git clone https://github.com/zenprocess/servingcard
-cd servingcard/packages/python
-pip install -e .
+pip install -e packages/python  # or: pip install servingcard
+```
 
-# Validate a config
-servingcard validate registry/qwen3-coder/gb10-fp8-eagle3-spec3.yaml
+### Benchmark your model
 
-# Show summary info
-servingcard info registry/qwen3-coder/gb10-fp8-eagle3-spec3.yaml
+```bash
+servingcard benchmark \
+  --model qwen3-coder \
+  --hardware nvidia-gb10 \
+  --endpoint http://localhost:8000
+
+# Produces: qwen3-coder-nvidia-gb10.yaml
+# Uses PawBench if installed, otherwise prompts for manual entry
+```
+
+### Apply a community config
+
+```bash
+# From the registry (shorthand)
+servingcard apply qwen3-coder/gb10-fp8-eagle3-spec3
+
+# From a local file
+servingcard apply ./my-config.yaml
+
+# From a URL
+servingcard apply --url https://raw.githubusercontent.com/.../config.yaml
+
+# Outputs the vllm serve command with optimized params — copy and run
+```
 
-# Search the registry
-servingcard search --model qwen3-coder --hardware nvidia-gb10
+### Validate and inspect
 
-# Launch vLLM from a serving card
-servingcard launch registry/qwen3-coder/gb10-fp8-eagle3-spec3.yaml
-# Expands to: vllm serve Qwen/Qwen3-Coder-480B-A35B-FP8 \
-#   --tensor-parallel-size 1 --max-model-len 131072 \
-#   --gpu-memory-utilization 0.90 --quantization fp8 \
-#   --speculative-model aurora-spec-qwen3-coder \
-#   --num-speculative-tokens 3
+```bash
+servingcard validate registry/qwen3-coder/gb10-fp8-eagle3-spec3.yaml
+servingcard info registry/qwen3-coder/gb10-fp8-eagle3-spec3.yaml
+servingcard search qwen3-coder
+servingcard search --hardware nvidia-gb10
 ```

@@ -366,10 +383,10 @@ runs on specific hardware, not how good its outputs are.
 
 | Tool | Status | Description |
 |------|--------|-------------|
-| vLLM | `servingcard launch` | Generate vLLM CLI from a serving card |
+| vLLM | `servingcard apply` | Generate vLLM CLI from a serving card |
 | Multi-agent dispatchers | Compatible | Any dispatcher can read serving cards for routing and capacity |
 | [auto-tuning-vllm](https://github.com/zenprocess/auto-tuning-vllm) | Planned | Export tuning results as serving cards |
-| TGI | Planned | `servingcard launch --engine tgi` param mapping |
+| TGI | Planned | `servingcard apply --engine tgi` param mapping |
 | SGLang | Planned | SGLang param mapping |
 | Your tool here | -- | PRs welcome |
packages/python/pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+benchmark = ["pawbench"]
 dev = [
     "pytest>=7.0",
     "pytest-cov",

packages/python/servingcard/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@
     BenchmarkSection,
     CapacitySection,
     HardwareDetails,
+    PawBenchResults,
     QuantizationSection,
     ServingCard,
     ServingSection,
@@ -19,6 +20,7 @@
     "BenchmarkSection",
     "CapacitySection",
     "HardwareDetails",
+    "PawBenchResults",
     "QuantizationSection",
     "ServingCard",
     "ServingSection",
Lines changed: 118 additions & 0 deletions (new file)

"""Generate framework-specific launch commands from a servingcard."""

from __future__ import annotations

from servingcard.schema import ServingCard

REGISTRY_BASE_URL = (
    "https://raw.githubusercontent.com/zenprocess/servingcard/main/registry"
)


def resolve_source(source: str) -> str:
    """Resolve a config source to a fetchable URL or local path.

    Supports:
    - Local file path: ./my-config.yaml, /abs/path.yaml
    - Full URL: https://...
    - Registry shorthand: model/variant -> GitHub raw URL
    """
    # Full URL
    if source.startswith("http://") or source.startswith("https://"):
        return source

    # Local file: a path containing "/" that is relative (./), absolute (/),
    # or carries a .yaml/.yml extension
    if "/" in source and (
        source.startswith(".")
        or source.startswith("/")
        or source.endswith(".yaml")
        or source.endswith(".yml")
    ):
        return source

    # Registry shorthand: model/variant
    if "/" in source:
        return f"{REGISTRY_BASE_URL}/{source}.yaml"

    # Bare name -- assume it is a local file
    return source


def generate_vllm_command(card: ServingCard) -> str:
    """Generate a vLLM serve command from a servingcard."""
    if not card.serving or not card.serving.engine_args:
        return (
            "# No engine_args in servingcard -- cannot generate vllm command\n"
            f"vllm serve {card.model}"
        )

    args = card.serving.engine_args.copy()
    model_id = args.pop("model", card.model)

    parts = [f"vllm serve {model_id}"]

    # Map engine_args keys to CLI flags
    for key, value in args.items():
        flag = f"--{key.replace('_', '-')}"
        if isinstance(value, bool):
            if value:
                parts.append(f"    {flag}")
        else:
            parts.append(f"    {flag} {value}")

    # TODO: add tensor-parallel-size if not in engine_args but inferable
    # TODO: add enable-prefix-caching if not explicitly set
    return " \\\n".join(parts)


def generate_tgi_command(card: ServingCard) -> str:
    """Generate a TGI launch command from a servingcard."""
    if not card.serving or not card.serving.engine_args:
        return (
            "# No engine_args in servingcard -- cannot generate TGI command\n"
            f"text-generation-launcher --model-id {card.model}"
        )

    args = card.serving.engine_args.copy()
    model_id = args.pop("model", card.model)

    parts = [f"text-generation-launcher --model-id {model_id}"]

    # Map common vLLM args to TGI equivalents
    tgi_map = {
        "quantization": "quantize",
        "max_model_len": "max-input-length",
        "max_num_seqs": "max-batch-size",
    }

    for key, value in args.items():
        tgi_key = tgi_map.get(key, key.replace("_", "-"))
        # Speculative decoding args: TGI doesn't support them the same way,
        # so emit them commented out for manual adjustment
        if key in ("speculative_model", "num_speculative_tokens"):
            parts.append(
                f"    # --{tgi_key} {value}  # speculative decoding: adjust for TGI"
            )
            continue
        if key in ("gpu_memory_utilization",):
            continue  # TGI doesn't have this flag
        if isinstance(value, bool):
            if value:
                parts.append(f"    --{tgi_key}")
        else:
            parts.append(f"    --{tgi_key} {value}")

    return " \\\n".join(parts)


def generate_launch_command(card: ServingCard, engine: str | None = None) -> str:
    """Generate a launch command for the given engine.

    If engine is None, infer from card.framework.
    """
    if engine is None:
        framework = card.framework.lower()
        if "vllm" in framework:
            engine = "vllm"
        elif "tgi" in framework or "text-generation" in framework:
            engine = "tgi"
        else:
            engine = "vllm"  # default

    if engine == "vllm":
        return generate_vllm_command(card)
    elif engine == "tgi":
        return generate_tgi_command(card)
    else:
        return f"# Unsupported engine: {engine}\n# Use --engine vllm or --engine tgi"
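The engine_args-to-flag expansion in generate_vllm_command can be exercised without the ServingCard schema; a standalone sketch (the args dict here is illustrative, echoing the README's example config, not pulled from a real card):

```python
# Standalone sketch of the engine_args -> vLLM CLI flag expansion.
# The dict is illustrative; real values come from a card's serving.engine_args.
engine_args = {
    "tensor_parallel_size": 1,
    "max_model_len": 131072,
    "gpu_memory_utilization": 0.9,
    "quantization": "fp8",
    "enable_prefix_caching": True,  # true booleans become bare flags
    "enforce_eager": False,         # false booleans are dropped entirely
}

parts = ["vllm serve Qwen/Qwen3-Coder-480B-A35B-FP8"]
for key, value in engine_args.items():
    flag = f"--{key.replace('_', '-')}"
    if isinstance(value, bool):
        if value:
            parts.append(f"    {flag}")
    else:
        parts.append(f"    {flag} {value}")

command = " \\\n".join(parts)
print(command)
```

Booleans map to presence/absence of a flag rather than `--flag True`, which is why the loop branches on `isinstance(value, bool)` before formatting.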
Lines changed: 139 additions & 0 deletions (new file)

"""Pluggable benchmark backends for servingcard."""

from __future__ import annotations

import json
import shutil
import subprocess
from abc import ABC, abstractmethod


class BenchmarkBackend(ABC):
    """Interface for benchmark harnesses."""

    @abstractmethod
    def run(self, endpoint: str, model: str, **kwargs: object) -> dict:
        """Run benchmarks, return results dict.

        Returns a dict with keys:
        single_stream_tok_s, ttft_ms, quality_score, cacp_compliance,
        parallel_peak_tok_s (optional), peak_concurrency (optional),
        useful_token_ratio (optional), tokens_per_turn (optional),
        adaptability_score (optional), suite (optional).
        """
        raise NotImplementedError


class PawBenchBackend(BenchmarkBackend):
    """PawBench integration -- subprocess first, then Python import."""

    @staticmethod
    def is_available() -> bool:
        """Check if PawBench is installed."""
        if shutil.which("pawbench"):
            return True
        try:
            import importlib

            importlib.import_module("pawbench")
            return True
        except ImportError:
            return False

    def run(self, endpoint: str, model: str, **kwargs: object) -> dict:
        """Run PawBench against an endpoint."""
        # Try subprocess first (works if pawbench is a CLI tool)
        pawbench_bin = shutil.which("pawbench")
        if pawbench_bin:
            return self._run_subprocess(endpoint, model, **kwargs)
        # Fall back to Python import
        return self._run_python(endpoint, model, **kwargs)

    def _run_subprocess(self, endpoint: str, model: str, **kwargs: object) -> dict:
        """Run PawBench via subprocess."""
        cmd = [
            "pawbench",
            "run",
            "--endpoint",
            endpoint,
            "--model",
            model,
            "--output-json",
            "-",
        ]
        suite = kwargs.get("suite")
        if suite:
            cmd.extend(["--suite", str(suite)])

        result = subprocess.run(cmd, capture_output=True, text=True, check=False)
        if result.returncode != 0:
            raise RuntimeError(
                f"PawBench failed (exit {result.returncode}): {result.stderr}"
            )
        return json.loads(result.stdout)  # type: ignore[no-any-return]

    def _run_python(self, endpoint: str, model: str, **kwargs: object) -> dict:
        """Run PawBench via Python API."""
        try:
            from pawbench import run_benchmark  # type: ignore[import-untyped]
        except ImportError:
            raise RuntimeError(
                "PawBench not found. Install: pip install pawbench"
            ) from None

        results: dict = run_benchmark(endpoint=endpoint, model=model, **kwargs)
        return results


class ManualBackend(BenchmarkBackend):
    """Manual entry -- user provides benchmark numbers interactively."""

    def run(self, endpoint: str, model: str, **kwargs: object) -> dict:
        """Prompt the user for benchmark results."""
        print("\nPawBench not found. Enter benchmark results manually:\n")

        tok_s = self._prompt_float("  Single-stream tok/s: ")
        ttft_ms = self._prompt_float("  TTFT (ms): ")
        quality = self._prompt_float("  Quality score (0-1): ", min_val=0, max_val=1)
        cacp = self._prompt_float("  CACP compliance (0-1): ", min_val=0, max_val=1)

        parallel_tok_s_str = input("  Parallel peak tok/s (Enter to skip): ").strip()
        concurrency_str = input("  Peak concurrency (Enter to skip): ").strip()

        result: dict = {
            "single_stream_tok_s": tok_s,
            "ttft_ms": ttft_ms,
            "quality_score": quality,
            "cacp_compliance": cacp,
            "suite": "manual",
        }
        if parallel_tok_s_str:
            result["parallel_peak_tok_s"] = float(parallel_tok_s_str)
        if concurrency_str:
            result["peak_concurrency"] = int(concurrency_str)

        return result

    @staticmethod
    def _prompt_float(
        prompt: str, min_val: float | None = None, max_val: float | None = None
    ) -> float:
        """Prompt for a float value with optional range validation."""
        while True:
            try:
                val = float(input(prompt))
                if min_val is not None and val < min_val:
                    print(f"  Must be >= {min_val}")
                    continue
                if max_val is not None and val > max_val:
                    print(f"  Must be <= {max_val}")
                    continue
                return val
            except ValueError:
                print("  Enter a number.")


def get_backend() -> BenchmarkBackend:
    """Return the best available benchmark backend."""
    if PawBenchBackend.is_available():
        return PawBenchBackend()
    return ManualBackend()
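The selection in get_backend() reduces to two probes: a CLI binary on PATH, then an importable module. A self-contained sketch of the same check (the fallback names here are deliberately unresolvable, mirroring the path that selects ManualBackend):

```python
import importlib.util
import shutil


def harness_available(cli_name: str, module_name: str) -> bool:
    """True if a CLI binary is on PATH or the named module is importable."""
    if shutil.which(cli_name):
        return True
    # find_spec probes importability without actually importing the module
    return importlib.util.find_spec(module_name) is not None


# If the pawbench probe fails, fall through to manual entry -- the same
# decision get_backend() makes between PawBenchBackend and ManualBackend.
backend = "pawbench" if harness_available("pawbench", "pawbench") else "manual"
print(backend)
```

Using `importlib.util.find_spec` instead of a bare `import` keeps the probe side-effect free, though the commit's `is_available()` uses `importlib.import_module` with an `ImportError` guard, which behaves equivalently for this check.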
