diff --git a/2.0/README.md b/2.0/README.md index e0f650c0..1942fe1e 100644 --- a/2.0/README.md +++ b/2.0/README.md @@ -34,3 +34,21 @@ This variant keeps the same SIFT1M-scale service contract and recall target as `vector_db_ann`, but reduces the load/index-build penalty by 10x so stronger offline indexing strategies are more viable. Its problem ID is `vector_db_ann_relaxed`. + +## BBOPlace ISPD2005 + +This VLSI placement problem asks agents to generate macro placement candidates +for the ISPD2005 benchmarks used by BBOPlace-Bench. Its problem ID is +`bboplace_ispd2005`. The public iterative feedback path evaluates the first +benchmark only, while the final verifier reruns the best iterative artifact and +the final submission across the full ISPD2005 suite. Scoring minimizes MP-HPWL +against relaxed MGO baselines and clips negative scores to zero. The task is +CPU-only and does not require DREAMPlace, GPU execution, or Ray. + +## BBOPlace ICCAD2015 + +This VLSI placement problem uses the ICCAD2015 benchmark suite from +BBOPlace-Bench. Its problem ID is `bboplace_iccad2015`. It follows the same +candidate format, CPU-only evaluator, MP-HPWL metric, relaxed MGO baselines, +and quick-versus-final evaluation flow as `bboplace_ispd2005`, but scores the +ICCAD2015 benchmark set. 
diff --git a/2.0/problems/bboplace_iccad2015/config.yaml b/2.0/problems/bboplace_iccad2015/config.yaml new file mode 100644 index 00000000..cf5bc374 --- /dev/null +++ b/2.0/problems/bboplace_iccad2015/config.yaml @@ -0,0 +1,15 @@ +tag: optimization +runtime: + language: python + timeout_seconds: 10800 + environment: "Python solution returning BBOPlace MGO placement candidates; hidden ICCAD2015 judge data" + apt_packages: + - python3-numpy + docker: + image: ubuntu:24.04 + judge_image: ghcr.io/frontiercs/frontiercs-bboplace-data:2026-06-ispd-iccad +environment: + cpus: 8 + memory_mb: 16384 + storage_mb: 8192 + build_timeout_seconds: 3600 diff --git a/2.0/problems/bboplace_iccad2015/evaluate.sh b/2.0/problems/bboplace_iccad2015/evaluate.sh new file mode 100755 index 00000000..23cd83b3 --- /dev/null +++ b/2.0/problems/bboplace_iccad2015/evaluate.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +SOLUTION="/work/execution_env/solution_env/solution.py" + +if [[ ! 
-f "$SOLUTION" ]]; then + echo "Error: Missing $SOLUTION" >&2 + exit 1 +fi + +python "$SCRIPT_DIR/evaluator.py" "$SOLUTION" diff --git a/2.0/problems/bboplace_iccad2015/evaluator.py b/2.0/problems/bboplace_iccad2015/evaluator.py new file mode 100644 index 00000000..09df4fbc --- /dev/null +++ b/2.0/problems/bboplace_iccad2015/evaluator.py @@ -0,0 +1,486 @@ +"""Evaluator for the BBOPlace ICCAD2015 Frontier-CS 2.0 problem.""" + +from __future__ import annotations + +import importlib.util +import json +import math +import os +import pickle +import pwd +import shutil +import subprocess +import sys +import tempfile +import traceback +from argparse import Namespace +from pathlib import Path +from types import ModuleType, SimpleNamespace +from typing import Any + + +DATASET = "iccad2015" +BENCHMARKS = ( + "superblue1", + "superblue3", + "superblue4", + "superblue5", + "superblue7", + "superblue10", + "superblue16", + "superblue18", +) +QUICK_BENCHMARKS = (BENCHMARKS[0],) +MAX_CANDIDATES = 16 +TIMEOUT_SECONDS = int(os.environ.get("BBOPLACE_SOLUTION_TIMEOUT_SECONDS", "10800")) +BBOPLACE_ROOT = Path(os.environ.get("BBOPLACE_ROOT", "/opt/bboplace-bench")) +INF_HPWL_THRESHOLD = 1e15 + +# BBOPlace-Bench report, Table V, MGO + PSO MP-HPWL. +# Reported unit is x10^5; constants below are raw HPWL values relaxed by 1.2x. 
+BASELINE_HPWL = { + "superblue1": 0.696e5, + "superblue3": 1.824e5, + "superblue4": 1.128e5, + "superblue5": 4.512e5, + "superblue7": 2.028e5, + "superblue10": 0.648e5, + "superblue16": 1.152e5, + "superblue18": 0.576e5, +} + +_EVALUATORS: dict[str, Any] = {} + + +def _protect_evaluator_source() -> None: + try: + evaluator_path = Path(__file__).resolve() + if str(evaluator_path).startswith(("/judge/", "/tests/")) and os.geteuid() == 0: + evaluator_path.chmod(0o600) + except Exception: + pass + + +_protect_evaluator_source() + + +def _solution_preexec(): + if os.name != "posix": + return None + try: + if os.geteuid() != 0: + return None + nobody = pwd.getpwnam("nobody") + except Exception: + return None + + def demote() -> None: + os.setgid(nobody.pw_gid) + os.setuid(nobody.pw_uid) + + return demote + + +def _ensure_runtime_paths() -> None: + if not BBOPLACE_ROOT.exists(): + raise RuntimeError( + f"BBOPlace runtime not found at {BBOPLACE_ROOT}; the judge image must include it" + ) + for rel in ("src", "config", f"benchmarks/{DATASET}"): + path = BBOPLACE_ROOT / rel + if not path.exists(): + raise RuntimeError(f"BBOPlace judge image is missing {path}") + for path in ( + BBOPLACE_ROOT, + BBOPLACE_ROOT / "src", + BBOPLACE_ROOT / "benchmarks", + BBOPLACE_ROOT / "thirdparty", + BBOPLACE_ROOT / "thirdparty" / "dreamplace", + ): + text = str(path) + if text not in sys.path: + sys.path.insert(0, text) + os.environ["PYTHONPATH"] = ":".join(sys.path) + + +def _install_runtime_shims() -> None: + if "ray" not in sys.modules: + ray = ModuleType("ray") + + class RemoteFunction: + def __init__(self, fn): + self.fn = fn + + def remote(self, *args, **kwargs): + return self.fn(*args, **kwargs) + + def remote(*args, **kwargs): + if args and callable(args[0]) and len(args) == 1 and not kwargs: + return RemoteFunction(args[0]) + + def decorator(fn): + return RemoteFunction(fn) + + return decorator + + ray.remote = remote # type: ignore[attr-defined] + ray.get = lambda value: value # 
type: ignore[attr-defined] + ray.init = lambda *args, **kwargs: None # type: ignore[attr-defined] + sys.modules["ray"] = ray + + if "matplotlib" not in sys.modules: + matplotlib = ModuleType("matplotlib") + pyplot = ModuleType("matplotlib.pyplot") + patches = ModuleType("matplotlib.patches") + pyplot.figure = lambda *args, **kwargs: SimpleNamespace( # type: ignore[attr-defined] + add_subplot=lambda *a, **k: SimpleNamespace( + axes=SimpleNamespace( + xaxis=SimpleNamespace(set_visible=lambda *_: None), + yaxis=SimpleNamespace(set_visible=lambda *_: None), + ), + add_patch=lambda *_args, **_kwargs: None, + ), + savefig=lambda *_args, **_kwargs: None, + ) + pyplot.close = lambda *args, **kwargs: None # type: ignore[attr-defined] + patches.Rectangle = lambda *args, **kwargs: object() # type: ignore[attr-defined] + sys.modules["matplotlib"] = matplotlib + sys.modules["matplotlib.pyplot"] = pyplot + sys.modules["matplotlib.patches"] = patches + + +def _load_bbo_evaluator_class(): + _ensure_runtime_paths() + _install_runtime_shims() + import yaml # type: ignore + from config.benchmark import ( # type: ignore + BENCHMARK_DIR, + ROOT_DIR, + benchmark_dict, + benchmark_n_macro_dict, + benchmark_type_dict, + ) + from src.placedb import PlaceDB # type: ignore + placer_package = ModuleType("src.placer") + placer_package.__path__ = [str(Path(ROOT_DIR) / "src" / "placer")] # type: ignore[attr-defined] + sys.modules.setdefault("src.placer", placer_package) + from src.placer.mgo_placer import MaskGuidedOptimizationPlacer # type: ignore + + class Evaluator: + def __init__(self, args: Namespace): + config_path = Path(ROOT_DIR) / "config" + file_config_dict: dict[str, Any] = {} + with (config_path / "default.yaml").open("r", encoding="utf-8") as f: + file_config_dict.update(yaml.load(f, Loader=yaml.FullLoader) or {}) + with (config_path / "placer" / "mgo.yaml").open("r", encoding="utf-8") as f: + file_config_dict.update(yaml.load(f, Loader=yaml.FullLoader) or {}) + + benchmark_base = 
None + for candidate_base, names in benchmark_dict.items(): + if args.benchmark in names: + benchmark_base = candidate_base + break + if benchmark_base is None: + raise RuntimeError(f"benchmark is not registered: {args.benchmark}") + + file_config_dict.update( + { + "ROOT_DIR": ROOT_DIR, + "SOURCE_DIR": str(Path(ROOT_DIR) / "src"), + "THIRDPARTY_DIR": str(Path(ROOT_DIR) / "thirdparty"), + "placer": "mgo", + "benchmark": args.benchmark, + "benchmark_base": benchmark_base, + "benchmark_path": str(Path(BENCHMARK_DIR) / benchmark_base / args.benchmark), + "benchmark_type": benchmark_type_dict[benchmark_base], + "n_macro": benchmark_n_macro_dict[benchmark_base], + "eval_gp_hpwl": False, + "n_cpu_max": 1, + "result_path": str( + Path(tempfile.gettempdir()) + / "frontier_bboplace_results" + / DATASET + / args.benchmark + ), + "unique_token": "frontier_cs_2_0", + } + ) + args.__dict__.update({k: v for k, v in file_config_dict.items() if k not in args.__dict__}) + Path(args.result_path).mkdir(parents=True, exist_ok=True) + self.args = args + self.placedb = PlaceDB(args=args) + self.placer = MaskGuidedOptimizationPlacer(args=args, placedb=self.placedb) + + @property + def n_dim(self): + return self.placer.placedb.node_cnt * 2 + + return Evaluator + + +def _make_args(benchmark: str) -> Namespace: + return Namespace( + placer="mgo", + benchmark=benchmark, + eval_gp_hpwl=False, + seed=1, + use_wandb=False, + error_redirect=False, + n_cpu_max=1, + gpu=0, + ) + + +def _ensure_evaluator(benchmark: str) -> Any: + if benchmark in _EVALUATORS: + return _EVALUATORS[benchmark] + if benchmark not in BENCHMARKS: + raise RuntimeError(f"unknown benchmark: {benchmark}") + Evaluator = _load_bbo_evaluator_class() + _EVALUATORS[benchmark] = Evaluator(_make_args(benchmark)) + return _EVALUATORS[benchmark] + + +def _benchmark_info(benchmark: str, evaluator: Any) -> dict[str, Any]: + placedb = evaluator.placer.placedb + return { + "dataset": DATASET, + "benchmark": benchmark, + "placer": "mgo", + 
"metric": "mp_hpwl", + "objective": "minimize", + "dim": int(evaluator.n_dim), + "node_cnt": int(placedb.node_cnt), + "net_cnt": int(getattr(placedb, "net_cnt", len(getattr(placedb, "net_info", {})))), + "canvas_width": float(placedb.canvas_width), + "canvas_height": float(placedb.canvas_height), + "n_grid_x": int(evaluator.args.n_grid_x), + "n_grid_y": int(evaluator.args.n_grid_y), + "bounds_kind": "mgo_repeated_grid", + "max_candidates_per_submission": MAX_CANDIDATES, + "baseline_hpwl": float(BASELINE_HPWL[benchmark]), + "baseline_source": "BBOPlace-Bench Table V, MGO + PSO MP-HPWL, unit x10^5, relaxed by 1.2x", + } + + +def prepare() -> dict[str, Any]: + _ensure_runtime_paths() + return { + "dataset": DATASET, + "benchmarks": list(BENCHMARKS), + "quick_feedback_benchmarks": list(QUICK_BENCHMARKS), + "bboplace_root": str(BBOPLACE_ROOT), + "max_candidates_per_submission": MAX_CANDIDATES, + "load_mode": "lazy_per_benchmark", + } + + +def _selected_benchmarks() -> tuple[tuple[str, ...], str]: + role = os.environ.get("FRONTIER_SUBMISSION_ROLE", "agent") + if role == "final": + return BENCHMARKS, "full_suite" + return QUICK_BENCHMARKS, "quick_feedback" + + +def _run_solution(solution_path: str, info: dict[str, Any]) -> Any: + with tempfile.TemporaryDirectory(prefix=f"bboplace_{DATASET}_") as tmp: + tmp_path = Path(tmp) + isolated_solution_path = tmp_path / "solution.py" + runner_path = tmp_path / "runner.py" + info_path = tmp_path / "info.json" + result_path = tmp_path / "result.pkl" + shutil.copy2(solution_path, isolated_solution_path) + info_path.write_text(json.dumps(info), encoding="utf-8") + runner_path.write_text( + """ +import importlib.util +import json +import pickle +from pathlib import Path + +solution_path = __SOLUTION_PATH__ +info = json.loads(Path(__INFO_PATH__).read_text(encoding="utf-8")) +result_path = Path(__RESULT_PATH__) + + +def load_candidates(): + spec = importlib.util.spec_from_file_location("solution", solution_path) + if spec is None or 
spec.loader is None: + raise RuntimeError("could not import solution") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + for name in ("solve", "generate", "run"): + fn = getattr(module, name, None) + if callable(fn): + return fn(info) + + for name in ("CANDIDATES", "CANDIDATE", "PLACEMENT"): + value = getattr(module, name, None) + if value is not None: + return value + + raise RuntimeError("solution must define solve(info), generate(info), run(info), CANDIDATES, CANDIDATE, or PLACEMENT") + + +try: + with result_path.open("wb") as f: + pickle.dump({"candidates": load_candidates()}, f) +except Exception: + with result_path.open("wb") as f: + pickle.dump({"error": "solution failed while generating placement candidates"}, f) +""".replace("__SOLUTION_PATH__", repr(str(isolated_solution_path))) + .replace("__INFO_PATH__", repr(str(info_path))) + .replace("__RESULT_PATH__", repr(str(result_path))), + encoding="utf-8", + ) + preexec_fn = _solution_preexec() + if preexec_fn is not None: + nobody = pwd.getpwnam("nobody") + for path in (tmp_path, isolated_solution_path, runner_path, info_path): + os.chown(path, nobody.pw_uid, nobody.pw_gid) + os.chmod(tmp_path, 0o700) + + proc = subprocess.run( + [sys.executable, str(runner_path)], + capture_output=True, + text=True, + timeout=TIMEOUT_SECONDS, + preexec_fn=preexec_fn, + ) + if proc.returncode != 0: + raise RuntimeError(f"solution runner exited with code {proc.returncode}") + if not result_path.exists(): + raise RuntimeError("solution did not produce a result") + with result_path.open("rb") as f: + payload = pickle.load(f) + if "error" in payload: + raise RuntimeError(str(payload["error"])) + return payload["candidates"] + + +def _normalize_candidates(raw: Any, *, dim: int, node_cnt: int, n_grid_x: int, n_grid_y: int): + import numpy as np + + arr = np.asarray(raw, dtype=float) + if arr.ndim == 1: + if arr.size != dim: + raise ValueError(f"expected one candidate of length {dim}, got length 
{arr.size}") + arr = arr.reshape(1, dim) + elif arr.ndim == 2: + if arr.shape[1] != dim: + raise ValueError(f"expected candidates with dimension {dim}, got {arr.shape[1]}") + else: + raise ValueError("candidates must be a 1D vector or a 2D list/array") + + if arr.shape[0] < 1: + raise ValueError("at least one candidate is required") + if arr.shape[0] > MAX_CANDIDATES: + raise ValueError( + f"too many candidates: got {arr.shape[0]}, maximum is {MAX_CANDIDATES}" + ) + if not np.all(np.isfinite(arr)): + raise ValueError("all candidate coordinates must be finite") + x = arr[:, :node_cnt] + y = arr[:, node_cnt:] + if np.any(x < 0.0) or np.any(x > float(n_grid_x)): + raise ValueError(f"x-grid coordinates must be in [0, {n_grid_x}]") + if np.any(y < 0.0) or np.any(y > float(n_grid_y)): + raise ValueError(f"y-grid coordinates must be in [0, {n_grid_y}]") + return arr + + +def _evaluate_candidates(evaluator: Any, candidates: Any) -> tuple[float, float, int, int]: + import numpy as np + + hpwl_values, overlap_values, _macro_pos = evaluator.placer.evaluate(candidates) + hpwl_arr = np.asarray(hpwl_values, dtype=float).reshape(-1) + overlap_arr = np.asarray(overlap_values, dtype=float).reshape(-1) + if hpwl_arr.size != candidates.shape[0]: + raise RuntimeError("BBOPlace returned an unexpected number of HPWL values") + if not np.all(np.isfinite(hpwl_arr)): + raise RuntimeError("BBOPlace returned a non-finite HPWL") + best_index = int(np.argmin(hpwl_arr)) + if float(hpwl_arr[best_index]) >= INF_HPWL_THRESHOLD: + raise ValueError("BBOPlace could not legalize any submitted candidate") + overlap = float(overlap_arr[best_index]) if overlap_arr.size > best_index else math.nan + return float(hpwl_arr[best_index]), overlap, best_index, int(candidates.shape[0]) + + +def evaluate(solution_path: str) -> tuple[float, float, str, dict[str, Any]]: + try: + per_benchmark: list[dict[str, Any]] = [] + bounded_scores: list[float] = [] + raw_scores: list[float] = [] + + selected_benchmarks, 
evaluation_scope = _selected_benchmarks() + for benchmark in selected_benchmarks: + evaluator = _ensure_evaluator(benchmark) + info = _benchmark_info(benchmark, evaluator) + raw_candidates = _run_solution(solution_path, info) + candidates = _normalize_candidates( + raw_candidates, + dim=info["dim"], + node_cnt=info["node_cnt"], + n_grid_x=info["n_grid_x"], + n_grid_y=info["n_grid_y"], + ) + candidate_hpwl, overlap_rate, candidate_index, n_candidates = _evaluate_candidates( + evaluator, candidates + ) + baseline_hpwl = BASELINE_HPWL[benchmark] + raw_score = 100.0 * (baseline_hpwl - candidate_hpwl) / baseline_hpwl + bounded_score = max(0.0, raw_score) + bounded_scores.append(bounded_score) + raw_scores.append(raw_score) + per_benchmark.append( + { + "benchmark": benchmark, + "candidate_hpwl": candidate_hpwl, + "baseline_hpwl": baseline_hpwl, + "raw_score": raw_score, + "score": bounded_score, + "overlap_rate": overlap_rate, + "candidate_index": candidate_index, + "n_candidates": n_candidates, + } + ) + + score = sum(bounded_scores) / len(bounded_scores) + score_unbounded = sum(raw_scores) / len(raw_scores) + message = ( + f"dataset={DATASET}; scope={evaluation_scope}; benchmarks={len(selected_benchmarks)}; " + f"mean_score={score:.6f}; mean_score_unbounded={score_unbounded:.6f}; " + "metric=MP-HPWL; baseline=1.2x relaxed MGO paper constants" + ) + metrics = { + "dataset": DATASET, + "evaluation_scope": evaluation_scope, + "benchmark_count": len(selected_benchmarks), + "full_suite_benchmark_count": len(BENCHMARKS), + "score_formula": "max(0, 100 * (baseline_hpwl - candidate_hpwl) / baseline_hpwl)", + "mean_candidate_hpwl": sum(item["candidate_hpwl"] for item in per_benchmark) + / len(per_benchmark), + "per_benchmark": per_benchmark, + } + return score, score_unbounded, message, metrics + except subprocess.TimeoutExpired: + return 0.0, 0.0, f"timed out after {TIMEOUT_SECONDS}s", {} + except Exception as exc: + return 0.0, 0.0, f"evaluation failed: {exc}", {"traceback": 
traceback.format_exc()} + + +def main(argv: list[str]) -> int: + if len(argv) != 2: + print("usage: evaluator.py /path/to/solution.py", file=sys.stderr) + return 1 + score, score_unbounded, message, metrics = evaluate(argv[1]) + print(message, file=sys.stderr) + if metrics: + print(json.dumps(metrics, indent=2), file=sys.stderr) + print(f"{score:.12f} {score_unbounded:.12f}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv)) diff --git a/2.0/problems/bboplace_iccad2015/readme b/2.0/problems/bboplace_iccad2015/readme new file mode 100644 index 00000000..e2e413b4 --- /dev/null +++ b/2.0/problems/bboplace_iccad2015/readme @@ -0,0 +1,85 @@ +BBOPlace ICCAD2015 +================== + +Write a Python solution that proposes macro placements for the BBOPlace MGO +formulation on the ICCAD2015 benchmark suite. The hidden judge evaluates your +placements with the original BBOPlace-Bench MP-HPWL evaluator. + +Runtime and resources +--------------------- + +Your solution runs in the main agent container with: + +- 8 CPU cores +- 16 GiB memory +- 8 GiB storage +- no GPU +- Python 3 with NumPy available +- internet access may be available during the trial, but the benchmark data is + hidden and is not available in the agent workspace + +The final verifier timeout is 10800 seconds. The judge also runs on CPU only. +Do not rely on CUDA, DREAMPlace, Ray, or GPU placement libraries for scoring; +the official metric path used here is MGO with MP-HPWL. + +Your file must define one of: + +- `solve(info)` +- `generate(info)` +- `run(info)` +- `CANDIDATES`, `CANDIDATE`, or `PLACEMENT` + +The recommended interface is `solve(info)`. The judge calls it once per +benchmark. 
`info` contains: + +- `benchmark`: one of `superblue1`, `superblue3`, `superblue4`, `superblue5`, + `superblue7`, `superblue10`, `superblue16`, `superblue18` +- `dim`: placement vector length +- `node_cnt`: number of macros +- `n_grid_x`, `n_grid_y`: MGO grid bounds +- `max_candidates_per_submission`: 16 +- `baseline_hpwl`: the baseline HPWL used for scoring + +Return either one placement vector of length `dim`, or a 2D list/array with up +to 16 placement candidates. For MGO, the first `node_cnt` entries are x-grid +coordinates in `[0, n_grid_x]`, and the remaining `node_cnt` entries are y-grid +coordinates in `[0, n_grid_y]`. + +Score +----- + +The objective is to minimize MP-HPWL. For each benchmark: + +`raw_score = 100 * (baseline_hpwl - candidate_hpwl) / baseline_hpwl` + +`score = max(0, raw_score)` + +The final score is the mean score over the eight ICCAD2015 benchmarks. The +unbounded score is the mean raw score before the zero clip. + +During iterative agent submissions, `/app/submit.sh` gives quick feedback on +`superblue1` only. The final verifier evaluates the full eight-benchmark suite. +Use the quick feedback to debug general placement logic, not as the complete +leaderboard score. + +The submit helper saves the best quick-feedback artifact it has seen. During +final verification, the verifier reruns both the current `/app/solution.py` and +that saved best iterative artifact on the full suite, then uses the better +full-suite score. A quick-feedback score is never used directly as the final +reward. + +The baseline constants are from the BBOPlace-Bench report, Table V, +`MGO + PSO` MP-HPWL. 
The paper reports values in units of `x10^5`; the judge +stores raw HPWL values relaxed by `1.2x`: + +- `superblue1`: `0.696e5` +- `superblue3`: `1.824e5` +- `superblue4`: `1.128e5` +- `superblue5`: `4.512e5` +- `superblue7`: `2.028e5` +- `superblue10`: `0.648e5` +- `superblue16`: `1.152e5` +- `superblue18`: `0.576e5` + +The benchmark data and evaluator source are only present in the judge image. +They are not available in the agent workspace. diff --git a/2.0/problems/bboplace_iccad2015/reference.py b/2.0/problems/bboplace_iccad2015/reference.py new file mode 100644 index 00000000..d52211d9 --- /dev/null +++ b/2.0/problems/bboplace_iccad2015/reference.py @@ -0,0 +1,7 @@ +"""Deterministic valid baseline for the BBOPlace ICCAD2015 task.""" + +from __future__ import annotations + + +def solve(info): + return [0.0] * int(info["dim"]) diff --git a/2.0/problems/bboplace_ispd2005/config.yaml b/2.0/problems/bboplace_ispd2005/config.yaml new file mode 100644 index 00000000..79d071b2 --- /dev/null +++ b/2.0/problems/bboplace_ispd2005/config.yaml @@ -0,0 +1,15 @@ +tag: optimization +runtime: + language: python + timeout_seconds: 10800 + environment: "Python solution returning BBOPlace MGO placement candidates; hidden ISPD2005 judge data" + apt_packages: + - python3-numpy + docker: + image: ubuntu:24.04 + judge_image: ghcr.io/frontiercs/frontiercs-bboplace-data:2026-06-ispd-iccad +environment: + cpus: 8 + memory_mb: 16384 + storage_mb: 8192 + build_timeout_seconds: 3600 diff --git a/2.0/problems/bboplace_ispd2005/evaluate.sh b/2.0/problems/bboplace_ispd2005/evaluate.sh new file mode 100755 index 00000000..23cd83b3 --- /dev/null +++ b/2.0/problems/bboplace_ispd2005/evaluate.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +SOLUTION="/work/execution_env/solution_env/solution.py" + +if [[ ! 
-f "$SOLUTION" ]]; then + echo "Error: Missing $SOLUTION" >&2 + exit 1 +fi + +python "$SCRIPT_DIR/evaluator.py" "$SOLUTION" diff --git a/2.0/problems/bboplace_ispd2005/evaluator.py b/2.0/problems/bboplace_ispd2005/evaluator.py new file mode 100644 index 00000000..6ee0c32d --- /dev/null +++ b/2.0/problems/bboplace_ispd2005/evaluator.py @@ -0,0 +1,475 @@ +"""Evaluator for the BBOPlace ISPD2005 Frontier-CS 2.0 problem.""" + +from __future__ import annotations + +import importlib.util +import json +import math +import os +import pickle +import pwd +import shutil +import subprocess +import sys +import tempfile +import traceback +from argparse import Namespace +from pathlib import Path +from types import ModuleType, SimpleNamespace +from typing import Any + + +DATASET = "ispd2005" +BENCHMARKS = ("adaptec1", "adaptec2", "adaptec3", "adaptec4", "bigblue1", "bigblue3") +QUICK_BENCHMARKS = (BENCHMARKS[0],) +MAX_CANDIDATES = 16 +TIMEOUT_SECONDS = int(os.environ.get("BBOPLACE_SOLUTION_TIMEOUT_SECONDS", "10800")) +BBOPLACE_ROOT = Path(os.environ.get("BBOPLACE_ROOT", "/opt/bboplace-bench")) +INF_HPWL_THRESHOLD = 1e15 + +# BBOPlace-Bench report, Table III, MGO + Vanilla-EA MP-HPWL. +# Reported unit is x10^5; constants below are raw HPWL values relaxed by 1.2x. 
+BASELINE_HPWL = { + "adaptec1": 6.96e5, + "adaptec2": 73.752e5, + "adaptec3": 67.356e5, + "adaptec4": 68.148e5, + "bigblue1": 2.76e5, + "bigblue3": 62.88e5, +} + +_EVALUATORS: dict[str, Any] = {} + + +def _protect_evaluator_source() -> None: + try: + evaluator_path = Path(__file__).resolve() + if str(evaluator_path).startswith(("/judge/", "/tests/")) and os.geteuid() == 0: + evaluator_path.chmod(0o600) + except Exception: + pass + + +_protect_evaluator_source() + + +def _solution_preexec(): + if os.name != "posix": + return None + try: + if os.geteuid() != 0: + return None + nobody = pwd.getpwnam("nobody") + except Exception: + return None + + def demote() -> None: + os.setgid(nobody.pw_gid) + os.setuid(nobody.pw_uid) + + return demote + + +def _ensure_runtime_paths() -> None: + if not BBOPLACE_ROOT.exists(): + raise RuntimeError( + f"BBOPlace runtime not found at {BBOPLACE_ROOT}; the judge image must include it" + ) + for rel in ("src", "config", f"benchmarks/{DATASET}"): + path = BBOPLACE_ROOT / rel + if not path.exists(): + raise RuntimeError(f"BBOPlace judge image is missing {path}") + for path in ( + BBOPLACE_ROOT, + BBOPLACE_ROOT / "src", + BBOPLACE_ROOT / "benchmarks", + BBOPLACE_ROOT / "thirdparty", + BBOPLACE_ROOT / "thirdparty" / "dreamplace", + ): + text = str(path) + if text not in sys.path: + sys.path.insert(0, text) + os.environ["PYTHONPATH"] = ":".join(sys.path) + + +def _install_runtime_shims() -> None: + if "ray" not in sys.modules: + ray = ModuleType("ray") + + class RemoteFunction: + def __init__(self, fn): + self.fn = fn + + def remote(self, *args, **kwargs): + return self.fn(*args, **kwargs) + + def remote(*args, **kwargs): + if args and callable(args[0]) and len(args) == 1 and not kwargs: + return RemoteFunction(args[0]) + + def decorator(fn): + return RemoteFunction(fn) + + return decorator + + ray.remote = remote # type: ignore[attr-defined] + ray.get = lambda value: value # type: ignore[attr-defined] + ray.init = lambda *args, **kwargs: 
None # type: ignore[attr-defined] + sys.modules["ray"] = ray + + if "matplotlib" not in sys.modules: + matplotlib = ModuleType("matplotlib") + pyplot = ModuleType("matplotlib.pyplot") + patches = ModuleType("matplotlib.patches") + pyplot.figure = lambda *args, **kwargs: SimpleNamespace( # type: ignore[attr-defined] + add_subplot=lambda *a, **k: SimpleNamespace( + axes=SimpleNamespace( + xaxis=SimpleNamespace(set_visible=lambda *_: None), + yaxis=SimpleNamespace(set_visible=lambda *_: None), + ), + add_patch=lambda *_args, **_kwargs: None, + ), + savefig=lambda *_args, **_kwargs: None, + ) + pyplot.close = lambda *args, **kwargs: None # type: ignore[attr-defined] + patches.Rectangle = lambda *args, **kwargs: object() # type: ignore[attr-defined] + sys.modules["matplotlib"] = matplotlib + sys.modules["matplotlib.pyplot"] = pyplot + sys.modules["matplotlib.patches"] = patches + + +def _load_bbo_evaluator_class(): + _ensure_runtime_paths() + _install_runtime_shims() + import yaml # type: ignore + from config.benchmark import ( # type: ignore + BENCHMARK_DIR, + ROOT_DIR, + benchmark_dict, + benchmark_n_macro_dict, + benchmark_type_dict, + ) + from src.placedb import PlaceDB # type: ignore + placer_package = ModuleType("src.placer") + placer_package.__path__ = [str(Path(ROOT_DIR) / "src" / "placer")] # type: ignore[attr-defined] + sys.modules.setdefault("src.placer", placer_package) + from src.placer.mgo_placer import MaskGuidedOptimizationPlacer # type: ignore + + class Evaluator: + def __init__(self, args: Namespace): + config_path = Path(ROOT_DIR) / "config" + file_config_dict: dict[str, Any] = {} + with (config_path / "default.yaml").open("r", encoding="utf-8") as f: + file_config_dict.update(yaml.load(f, Loader=yaml.FullLoader) or {}) + with (config_path / "placer" / "mgo.yaml").open("r", encoding="utf-8") as f: + file_config_dict.update(yaml.load(f, Loader=yaml.FullLoader) or {}) + + benchmark_base = None + for candidate_base, names in benchmark_dict.items(): + if 
args.benchmark in names: + benchmark_base = candidate_base + break + if benchmark_base is None: + raise RuntimeError(f"benchmark is not registered: {args.benchmark}") + + file_config_dict.update( + { + "ROOT_DIR": ROOT_DIR, + "SOURCE_DIR": str(Path(ROOT_DIR) / "src"), + "THIRDPARTY_DIR": str(Path(ROOT_DIR) / "thirdparty"), + "placer": "mgo", + "benchmark": args.benchmark, + "benchmark_base": benchmark_base, + "benchmark_path": str(Path(BENCHMARK_DIR) / benchmark_base / args.benchmark), + "benchmark_type": benchmark_type_dict[benchmark_base], + "n_macro": benchmark_n_macro_dict[benchmark_base], + "eval_gp_hpwl": False, + "n_cpu_max": 1, + "result_path": str( + Path(tempfile.gettempdir()) + / "frontier_bboplace_results" + / DATASET + / args.benchmark + ), + "unique_token": "frontier_cs_2_0", + } + ) + args.__dict__.update({k: v for k, v in file_config_dict.items() if k not in args.__dict__}) + Path(args.result_path).mkdir(parents=True, exist_ok=True) + self.args = args + self.placedb = PlaceDB(args=args) + self.placer = MaskGuidedOptimizationPlacer(args=args, placedb=self.placedb) + + @property + def n_dim(self): + return self.placer.placedb.node_cnt * 2 + + return Evaluator + + +def _make_args(benchmark: str) -> Namespace: + return Namespace( + placer="mgo", + benchmark=benchmark, + eval_gp_hpwl=False, + seed=1, + use_wandb=False, + error_redirect=False, + n_cpu_max=1, + gpu=0, + ) + + +def _ensure_evaluator(benchmark: str) -> Any: + if benchmark in _EVALUATORS: + return _EVALUATORS[benchmark] + if benchmark not in BENCHMARKS: + raise RuntimeError(f"unknown benchmark: {benchmark}") + Evaluator = _load_bbo_evaluator_class() + _EVALUATORS[benchmark] = Evaluator(_make_args(benchmark)) + return _EVALUATORS[benchmark] + + +def _benchmark_info(benchmark: str, evaluator: Any) -> dict[str, Any]: + placedb = evaluator.placer.placedb + return { + "dataset": DATASET, + "benchmark": benchmark, + "placer": "mgo", + "metric": "mp_hpwl", + "objective": "minimize", + "dim": 
int(evaluator.n_dim), + "node_cnt": int(placedb.node_cnt), + "net_cnt": int(getattr(placedb, "net_cnt", len(getattr(placedb, "net_info", {})))), + "canvas_width": float(placedb.canvas_width), + "canvas_height": float(placedb.canvas_height), + "n_grid_x": int(evaluator.args.n_grid_x), + "n_grid_y": int(evaluator.args.n_grid_y), + "bounds_kind": "mgo_repeated_grid", + "max_candidates_per_submission": MAX_CANDIDATES, + "baseline_hpwl": float(BASELINE_HPWL[benchmark]), + "baseline_source": "BBOPlace-Bench Table III, MGO + Vanilla-EA MP-HPWL, unit x10^5, relaxed by 1.2x", + } + + +def prepare() -> dict[str, Any]: + _ensure_runtime_paths() + return { + "dataset": DATASET, + "benchmarks": list(BENCHMARKS), + "quick_feedback_benchmarks": list(QUICK_BENCHMARKS), + "bboplace_root": str(BBOPLACE_ROOT), + "max_candidates_per_submission": MAX_CANDIDATES, + "load_mode": "lazy_per_benchmark", + } + + +def _selected_benchmarks() -> tuple[tuple[str, ...], str]: + role = os.environ.get("FRONTIER_SUBMISSION_ROLE", "agent") + if role == "final": + return BENCHMARKS, "full_suite" + return QUICK_BENCHMARKS, "quick_feedback" + + +def _run_solution(solution_path: str, info: dict[str, Any]) -> Any: + with tempfile.TemporaryDirectory(prefix=f"bboplace_{DATASET}_") as tmp: + tmp_path = Path(tmp) + isolated_solution_path = tmp_path / "solution.py" + runner_path = tmp_path / "runner.py" + info_path = tmp_path / "info.json" + result_path = tmp_path / "result.pkl" + shutil.copy2(solution_path, isolated_solution_path) + info_path.write_text(json.dumps(info), encoding="utf-8") + runner_path.write_text( + """ +import importlib.util +import json +import pickle +from pathlib import Path + +solution_path = __SOLUTION_PATH__ +info = json.loads(Path(__INFO_PATH__).read_text(encoding="utf-8")) +result_path = Path(__RESULT_PATH__) + + +def load_candidates(): + spec = importlib.util.spec_from_file_location("solution", solution_path) + if spec is None or spec.loader is None: + raise RuntimeError("could not 
import solution") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + for name in ("solve", "generate", "run"): + fn = getattr(module, name, None) + if callable(fn): + return fn(info) + + for name in ("CANDIDATES", "CANDIDATE", "PLACEMENT"): + value = getattr(module, name, None) + if value is not None: + return value + + raise RuntimeError("solution must define solve(info), generate(info), run(info), CANDIDATES, CANDIDATE, or PLACEMENT") + + +try: + with result_path.open("wb") as f: + pickle.dump({"candidates": load_candidates()}, f) +except Exception: + with result_path.open("wb") as f: + pickle.dump({"error": "solution failed while generating placement candidates"}, f) +""".replace("__SOLUTION_PATH__", repr(str(isolated_solution_path))) + .replace("__INFO_PATH__", repr(str(info_path))) + .replace("__RESULT_PATH__", repr(str(result_path))), + encoding="utf-8", + ) + preexec_fn = _solution_preexec() + if preexec_fn is not None: + nobody = pwd.getpwnam("nobody") + for path in (tmp_path, isolated_solution_path, runner_path, info_path): + os.chown(path, nobody.pw_uid, nobody.pw_gid) + os.chmod(tmp_path, 0o700) + + proc = subprocess.run( + [sys.executable, str(runner_path)], + capture_output=True, + text=True, + timeout=TIMEOUT_SECONDS, + preexec_fn=preexec_fn, + ) + if proc.returncode != 0: + raise RuntimeError(f"solution runner exited with code {proc.returncode}") + if not result_path.exists(): + raise RuntimeError("solution did not produce a result") + with result_path.open("rb") as f: + payload = pickle.load(f) + if "error" in payload: + raise RuntimeError(str(payload["error"])) + return payload["candidates"] + + +def _normalize_candidates(raw: Any, *, dim: int, node_cnt: int, n_grid_x: int, n_grid_y: int): + import numpy as np + + arr = np.asarray(raw, dtype=float) + if arr.ndim == 1: + if arr.size != dim: + raise ValueError(f"expected one candidate of length {dim}, got length {arr.size}") + arr = arr.reshape(1, dim) + elif 
arr.ndim == 2: + if arr.shape[1] != dim: + raise ValueError(f"expected candidates with dimension {dim}, got {arr.shape[1]}") + else: + raise ValueError("candidates must be a 1D vector or a 2D list/array") + + if arr.shape[0] < 1: + raise ValueError("at least one candidate is required") + if arr.shape[0] > MAX_CANDIDATES: + raise ValueError( + f"too many candidates: got {arr.shape[0]}, maximum is {MAX_CANDIDATES}" + ) + if not np.all(np.isfinite(arr)): + raise ValueError("all candidate coordinates must be finite") + x = arr[:, :node_cnt] + y = arr[:, node_cnt:] + if np.any(x < 0.0) or np.any(x > float(n_grid_x)): + raise ValueError(f"x-grid coordinates must be in [0, {n_grid_x}]") + if np.any(y < 0.0) or np.any(y > float(n_grid_y)): + raise ValueError(f"y-grid coordinates must be in [0, {n_grid_y}]") + return arr + + +def _evaluate_candidates(evaluator: Any, candidates: Any) -> tuple[float, float, int, int]: + import numpy as np + + hpwl_values, overlap_values, _macro_pos = evaluator.placer.evaluate(candidates) + hpwl_arr = np.asarray(hpwl_values, dtype=float).reshape(-1) + overlap_arr = np.asarray(overlap_values, dtype=float).reshape(-1) + if hpwl_arr.size != candidates.shape[0]: + raise RuntimeError("BBOPlace returned an unexpected number of HPWL values") + if not np.all(np.isfinite(hpwl_arr)): + raise RuntimeError("BBOPlace returned a non-finite HPWL") + best_index = int(np.argmin(hpwl_arr)) + if float(hpwl_arr[best_index]) >= INF_HPWL_THRESHOLD: + raise ValueError("BBOPlace could not legalize any submitted candidate") + overlap = float(overlap_arr[best_index]) if overlap_arr.size > best_index else math.nan + return float(hpwl_arr[best_index]), overlap, best_index, int(candidates.shape[0]) + + +def evaluate(solution_path: str) -> tuple[float, float, str, dict[str, Any]]: + try: + per_benchmark: list[dict[str, Any]] = [] + bounded_scores: list[float] = [] + raw_scores: list[float] = [] + + selected_benchmarks, evaluation_scope = _selected_benchmarks() + for 
benchmark in selected_benchmarks: + evaluator = _ensure_evaluator(benchmark) + info = _benchmark_info(benchmark, evaluator) + raw_candidates = _run_solution(solution_path, info) + candidates = _normalize_candidates( + raw_candidates, + dim=info["dim"], + node_cnt=info["node_cnt"], + n_grid_x=info["n_grid_x"], + n_grid_y=info["n_grid_y"], + ) + candidate_hpwl, overlap_rate, candidate_index, n_candidates = _evaluate_candidates( + evaluator, candidates + ) + baseline_hpwl = BASELINE_HPWL[benchmark] + raw_score = 100.0 * (baseline_hpwl - candidate_hpwl) / baseline_hpwl + bounded_score = max(0.0, raw_score) + bounded_scores.append(bounded_score) + raw_scores.append(raw_score) + per_benchmark.append( + { + "benchmark": benchmark, + "candidate_hpwl": candidate_hpwl, + "baseline_hpwl": baseline_hpwl, + "raw_score": raw_score, + "score": bounded_score, + "overlap_rate": overlap_rate, + "candidate_index": candidate_index, + "n_candidates": n_candidates, + } + ) + + score = sum(bounded_scores) / len(bounded_scores) + score_unbounded = sum(raw_scores) / len(raw_scores) + message = ( + f"dataset={DATASET}; scope={evaluation_scope}; benchmarks={len(selected_benchmarks)}; " + f"mean_score={score:.6f}; mean_score_unbounded={score_unbounded:.6f}; " + "metric=MP-HPWL; baseline=1.2x relaxed MGO paper constants" + ) + metrics = { + "dataset": DATASET, + "evaluation_scope": evaluation_scope, + "benchmark_count": len(selected_benchmarks), + "full_suite_benchmark_count": len(BENCHMARKS), + "score_formula": "max(0, 100 * (baseline_hpwl - candidate_hpwl) / baseline_hpwl)", + "mean_candidate_hpwl": sum(item["candidate_hpwl"] for item in per_benchmark) + / len(per_benchmark), + "per_benchmark": per_benchmark, + } + return score, score_unbounded, message, metrics + except subprocess.TimeoutExpired: + return 0.0, 0.0, f"timed out after {TIMEOUT_SECONDS}s", {} + except Exception as exc: + return 0.0, 0.0, f"evaluation failed: {exc}", {"traceback": traceback.format_exc()} + + +def main(argv: 
list[str]) -> int: + if len(argv) != 2: + print("usage: evaluator.py /path/to/solution.py", file=sys.stderr) + return 1 + score, score_unbounded, message, metrics = evaluate(argv[1]) + print(message, file=sys.stderr) + if metrics: + print(json.dumps(metrics, indent=2), file=sys.stderr) + print(f"{score:.12f} {score_unbounded:.12f}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv)) diff --git a/2.0/problems/bboplace_ispd2005/readme b/2.0/problems/bboplace_ispd2005/readme new file mode 100644 index 00000000..a022ec2b --- /dev/null +++ b/2.0/problems/bboplace_ispd2005/readme @@ -0,0 +1,83 @@ +BBOPlace ISPD2005 +================= + +Write a Python solution that proposes macro placements for the BBOPlace MGO +formulation on the ISPD2005 benchmark suite. The hidden judge evaluates your +placements with the original BBOPlace-Bench MP-HPWL evaluator. + +Runtime and resources +--------------------- + +Your solution runs in the main agent container with: + +- 8 CPU cores +- 16 GiB memory +- 8 GiB storage +- no GPU +- Python 3 with NumPy available +- internet access may be available during the trial, but the benchmark data is + hidden and is not available in the agent workspace + +The final verifier timeout is 10800 seconds. The judge also runs on CPU only. +Do not rely on CUDA, DREAMPlace, Ray, or GPU placement libraries for scoring; +the official metric path used here is MGO with MP-HPWL. + +Your file must define one of: + +- `solve(info)` +- `generate(info)` +- `run(info)` +- `CANDIDATES`, `CANDIDATE`, or `PLACEMENT` + +The recommended interface is `solve(info)`. The judge calls it once per +benchmark. 
`info` contains: + +- `benchmark`: one of `adaptec1`, `adaptec2`, `adaptec3`, `adaptec4`, + `bigblue1`, `bigblue3` +- `dim`: placement vector length +- `node_cnt`: number of macros +- `n_grid_x`, `n_grid_y`: MGO grid bounds +- `max_candidates_per_submission`: 16 +- `baseline_hpwl`: the baseline HPWL used for scoring + +Return either one placement vector of length `dim`, or a 2D list/array with up +to 16 placement candidates. For MGO, the first `node_cnt` entries are x-grid +coordinates in `[0, n_grid_x]`, and the remaining `node_cnt` entries are y-grid +coordinates in `[0, n_grid_y]`. + +Score +----- + +The objective is to minimize MP-HPWL. For each benchmark: + +`raw_score = 100 * (baseline_hpwl - candidate_hpwl) / baseline_hpwl` + +`score = max(0, raw_score)` + +The final score is the mean score over the six ISPD2005 benchmarks. The +unbounded score is the mean raw score before the zero clip. + +During iterative agent submissions, `/app/submit.sh` gives quick feedback on +`adaptec1` only. The final verifier evaluates the full six-benchmark suite. +Use the quick feedback to debug general placement logic, not as the complete +leaderboard score. + +The submit helper saves the best quick-feedback artifact it has seen. During +final verification, the verifier reruns both the current `/app/solution.py` and +that saved best iterative artifact on the full suite, then uses the better +full-suite score. A quick-feedback score is never used directly as the final +reward. + +The baseline constants are from the BBOPlace-Bench report, Table III, +`MGO + Vanilla-EA` MP-HPWL. The paper reports values in units of `x10^5`; the +judge stores raw HPWL values relaxed by `1.2x`: + +- `adaptec1`: `6.96e5` +- `adaptec2`: `73.752e5` +- `adaptec3`: `67.356e5` +- `adaptec4`: `68.148e5` +- `bigblue1`: `2.76e5` +- `bigblue3`: `62.88e5` + +The benchmark data and evaluator source are only present in the judge image. +They are not available in the agent workspace. 
diff --git a/2.0/problems/bboplace_ispd2005/reference.py b/2.0/problems/bboplace_ispd2005/reference.py new file mode 100644 index 00000000..529510b5 --- /dev/null +++ b/2.0/problems/bboplace_ispd2005/reference.py @@ -0,0 +1,7 @@ +"""Deterministic valid baseline for the BBOPlace ISPD2005 task.""" + +from __future__ import annotations + + +def solve(info): + return [0.0] * int(info["dim"]) diff --git a/README.md b/README.md index 8ba254e2..c23a9b0d 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Research Problems Algorithmic Problems - 2.0 Problems + 2.0 Problems

## News @@ -139,8 +139,8 @@ isolated from Frontier-CS's own `uv sync` environment. Frontier-CS 2.0 is agent-first: current 2.0 problems are meant to be run through Harbor-compatible agents rather than direct one-shot solution files. -Problem IDs are their problem directory names, such as `erdos_unit_distance` -and the small `erdos_demo`. +Problem IDs are their problem directory names, such as `erdos_unit_distance`, +the small `erdos_demo`, and BBOPlace variants such as `bboplace_ispd2005`. ```bash # List 2.0 problems @@ -151,6 +151,9 @@ uv run frontier harbor trial 2.0 erdos_unit_distance -a codex -m gpt-5.5 --json # Run the small N=10 demo task uv run frontier harbor trial 2.0 erdos_demo -a codex -m gpt-5.5 --json + +# Run a BBOPlace placement task +uv run frontier harbor trial 2.0 bboplace_ispd2005 -a codex -m gpt-5.5 --json ``` See [2.0/README.md](2.0/README.md) for the current 2.0 track. diff --git a/adapters/frontier-cs-2.0/README.md b/adapters/frontier-cs-2.0/README.md index 0aa08d3b..0988da70 100644 --- a/adapters/frontier-cs-2.0/README.md +++ b/adapters/frontier-cs-2.0/README.md @@ -54,6 +54,10 @@ once per trial; both iterative submissions and the final verifier score through that same sidecar. The final verifier writes a normalized reward in `/logs/verifier/reward.txt`. +Tasks may set `runtime.docker.judge_image` in `config.yaml` when the judge needs +a different image from the agent workspace, for example to keep hidden data and +heavy evaluator dependencies out of the main container. 
+ During the trial, the agent can call: ```bash diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py index 31461308..9f1dcc6b 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py @@ -3,6 +3,7 @@ import logging import json import shutil +import secrets from pathlib import Path from types import SimpleNamespace from typing import TYPE_CHECKING, Iterable @@ -64,6 +65,9 @@ def discover_problems(frontier_cs_root: Path) -> list[FrontierCS20Problem]: language=str(runtime.get("language", "python")), timeout_seconds=int(runtime.get("timeout_seconds", 10800)), docker_image=str(docker.get("image", "ubuntu:24.04")), + judge_docker_image=( + str(docker["judge_image"]) if "judge_image" in docker else None + ), config=config, ) ) @@ -125,8 +129,9 @@ def generate_task( task_paths.tests_dir.mkdir(parents=True, exist_ok=True) self._write_instruction(task_paths, problem) - self._write_environment(task_paths, problem) - self._write_tests(task_paths, problem) + verifier_token = secrets.token_urlsafe(32) + self._write_environment(task_paths, problem, verifier_token=verifier_token) + self._write_tests(task_paths, problem, verifier_token=verifier_token) self._write_solution(task_paths, problem) self._write_task_config(task_paths, problem) LOGGER.info(" [OK] %s", problem.problem_id) @@ -167,12 +172,19 @@ def _write_instruction(self, task_paths: "TaskPaths", problem: FrontierCS20Probl ) task_paths.instruction_path.write_text(instruction, encoding="utf-8") - def _write_environment(self, task_paths: "TaskPaths", problem: FrontierCS20Problem) -> None: + def _write_environment( + self, + task_paths: "TaskPaths", + problem: FrontierCS20Problem, + *, + verifier_token: str, + ) -> None: env_dir = task_paths.environment_dir dockerfile = (self.template_dir / "environment" / "Dockerfile").read_text( encoding="utf-8" ) image = self.docker_image or 
problem.docker_image + judge_image = self.docker_image or problem.judge_docker_image or image runtime = problem.config.get("runtime", {}) or {} apt_package_names = [ str(pkg) @@ -225,7 +237,7 @@ def _write_environment(self, task_paths: "TaskPaths", problem: FrontierCS20Probl str(pkg) for pkg in runtime.get("judge_apt_packages", []) or [] ) env_dir.joinpath("Dockerfile.judge").write_text( - judge_dockerfile.replace("{base_image}", image).replace( + judge_dockerfile.replace("{base_image}", judge_image).replace( "{judge_apt_packages_line}", f" {judge_apt_packages}" if judge_apt_packages else "", ).replace( @@ -244,9 +256,12 @@ def _write_environment(self, task_paths: "TaskPaths", problem: FrontierCS20Probl ), encoding="utf-8", ) - shutil.copy2( - self.template_dir / "environment" / "judge_server.py", - env_dir / "judge_server.py", + judge_server = ( + self.template_dir / "environment" / "judge_server.py" + ).read_text(encoding="utf-8") + (env_dir / "judge_server.py").write_text( + judge_server.replace("{verifier_token}", verifier_token), + encoding="utf-8", ) shutil.copy2( self.template_dir / "environment" / "submit.py", env_dir / "submit.py" @@ -267,11 +282,21 @@ def _write_submission_config(self, env_dir: Path, problem: FrontierCS20Problem) json.dumps(submission, indent=2), encoding="utf-8" ) - def _write_tests(self, task_paths: "TaskPaths", problem: FrontierCS20Problem) -> None: + def _write_tests( + self, + task_paths: "TaskPaths", + problem: FrontierCS20Problem, + *, + verifier_token: str, + ) -> None: tests_dir = task_paths.tests_dir shutil.copy2(self.template_dir / "tests" / "test.sh", tests_dir / "test.sh") - shutil.copy2( - self.template_dir / "tests" / "evaluate.py", tests_dir / "evaluate.py" + evaluate_py = (self.template_dir / "tests" / "evaluate.py").read_text( + encoding="utf-8" + ) + (tests_dir / "evaluate.py").write_text( + evaluate_py.replace("{verifier_token}", verifier_token), + encoding="utf-8", ) shutil.copy2(problem.problem_dir / "evaluator.py", 
tests_dir / "problem_evaluator.py") (tests_dir / "test.sh").chmod(0o755) diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py index 2647c9f7..43e92b7f 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py @@ -14,6 +14,7 @@ import time import traceback import threading +import secrets from datetime import datetime, timezone from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from pathlib import Path @@ -24,6 +25,7 @@ JUDGE_SUBMISSIONS_LOG = Path("/logs/judge/submissions.jsonl") MAX_SUBMISSION_BYTES = 30_000_000 MAX_ARCHIVE_BYTES = 20_000_000 +FINAL_ROLE_TOKEN = "{verifier_token}" def load_problem_evaluator(): @@ -129,12 +131,20 @@ def normalize_result(result: Any) -> tuple[float, float, str, dict[str, Any]]: return score, score_unbounded, message, metrics -def evaluate_path(solution_path: Path) -> dict[str, Any]: +def evaluate_path(solution_path: Path, *, submission_role: str = "agent") -> dict[str, Any]: if EVALUATOR is None: raise RuntimeError("problem evaluator is not loaded") - score, score_unbounded, message, metrics = normalize_result( - EVALUATOR.evaluate(str(solution_path)) - ) + previous_role = os.environ.get("FRONTIER_SUBMISSION_ROLE") + os.environ["FRONTIER_SUBMISSION_ROLE"] = submission_role + try: + score, score_unbounded, message, metrics = normalize_result( + EVALUATOR.evaluate(str(solution_path)) + ) + finally: + if previous_role is None: + os.environ.pop("FRONTIER_SUBMISSION_ROLE", None) + else: + os.environ["FRONTIER_SUBMISSION_ROLE"] = previous_role return { "status": "done", "score": float(score), @@ -144,11 +154,11 @@ def evaluate_path(solution_path: Path) -> dict[str, Any]: } -def evaluate_code(code: str) -> dict[str, Any]: +def evaluate_code(code: str, *, 
submission_role: str = "agent") -> dict[str, Any]: with tempfile.TemporaryDirectory(prefix="frontier_cs_2_0_submission_") as tmp: solution_path = Path(tmp) / "solution.py" solution_path.write_text(code, encoding="utf-8") - return evaluate_path(solution_path) + return evaluate_path(solution_path, submission_role=submission_role) def is_safe_tar_member(member: tarfile.TarInfo) -> bool: @@ -156,7 +166,7 @@ def is_safe_tar_member(member: tarfile.TarInfo) -> bool: return not path.is_absolute() and ".." not in path.parts -def evaluate_archive(archive_b64: str) -> dict[str, Any]: +def evaluate_archive(archive_b64: str, *, submission_role: str = "agent") -> dict[str, Any]: archive = base64.b64decode(archive_b64.encode("ascii"), validate=True) if len(archive) > MAX_ARCHIVE_BYTES: raise ValueError("submission archive too large") @@ -168,7 +178,7 @@ def evaluate_archive(archive_b64: str) -> dict[str, Any]: if not all(is_safe_tar_member(member) for member in members): raise ValueError("unsafe path in submission archive") tar.extractall(root) - return evaluate_path(root) + return evaluate_path(root, submission_role=submission_role) class JudgeHandler(BaseHTTPRequestHandler): @@ -227,7 +237,14 @@ def do_POST(self) -> None: try: payload = json.loads(self.rfile.read(content_length).decode("utf-8")) submission_uuid = str(payload.get("submission_uuid") or "") - submission_role = str(payload.get("submission_role") or "agent") + requested_role = str(payload.get("submission_role") or "agent") + role_token = self.headers.get("X-Frontier-CS-Role-Token", "") + if requested_role == "final": + if not secrets.compare_digest(role_token, FINAL_ROLE_TOKEN): + raise PermissionError("final evaluation role is verifier-only") + submission_role = "final" + else: + submission_role = "agent" submission_kind = payload.get("submission_kind", "file") if submission_kind == "directory": archive_b64 = payload.get("archive_b64") @@ -235,7 +252,7 @@ def do_POST(self) -> None: raise ValueError( "directory 
submission must include archive_b64" ) - result = evaluate_archive(archive_b64) + result = evaluate_archive(archive_b64, submission_role=submission_role) log_submission( { "submission_uuid": submission_uuid, @@ -251,7 +268,7 @@ def do_POST(self) -> None: raise ValueError( "file submission must include non-empty string field 'code'" ) - result = evaluate_code(code) + result = evaluate_code(code, submission_role=submission_role) log_submission( { "submission_uuid": submission_uuid, diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py index 2e0cc44c..dad595c5 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py @@ -20,6 +20,8 @@ SOLUTION_PATH = Path("/app/solution.py") SUBMISSION_CONFIG_PATH = Path("/app/submission_config.json") SUBMISSIONS_LOG = Path("/logs/agent/submissions.jsonl") +BEST_SUBMISSION_PAYLOAD = Path("/logs/agent/best_submission_payload.json") +BEST_SUBMISSION_META = Path("/logs/agent/best_submission_meta.json") JUDGE_URL = os.environ.get("JUDGE_URL", "http://judge:8082").rstrip("/") JUDGE_TIMEOUT_SECONDS = int(os.environ.get("JUDGE_TIMEOUT_SECONDS", "10800")) @@ -44,6 +46,32 @@ def log_record(record: dict) -> None: f.write(json.dumps(record, ensure_ascii=False) + "\n") +def save_best_submission(payload: dict, metadata: dict) -> None: + previous_key: tuple[float, float] | None = None + if BEST_SUBMISSION_META.exists(): + try: + previous = json.loads(BEST_SUBMISSION_META.read_text(encoding="utf-8")) + previous_score = float(previous.get("score_raw", 0.0)) + previous_unbounded = float( + previous.get("score_unbounded", previous_score) + ) + previous_key = (previous_score, previous_unbounded) + except Exception: + previous_key = None + + score_raw = float(metadata.get("score_raw", 0.0)) + score_unbounded = 
float(metadata.get("score_unbounded", score_raw)) + score_key = (score_raw, score_unbounded) + if previous_key is not None and score_key <= previous_key: + return + + BEST_SUBMISSION_PAYLOAD.parent.mkdir(parents=True, exist_ok=True) + BEST_SUBMISSION_PAYLOAD.write_text(json.dumps(payload), encoding="utf-8") + BEST_SUBMISSION_META.write_text( + json.dumps(metadata, indent=2, ensure_ascii=False), encoding="utf-8" + ) + + def wait_for_judge() -> None: deadline = time.time() + JUDGE_TIMEOUT_SECONDS last_error: Exception | None = None @@ -177,6 +205,7 @@ def main() -> int: judge_payload = { "submission_kind": "directory", "submission_uuid": sub_uuid, + "submission_role": "agent", "archive_b64": archive_b64, } else: @@ -198,6 +227,7 @@ def main() -> int: judge_payload = { "submission_kind": "file", "submission_uuid": sub_uuid, + "submission_role": "agent", "code": code, } @@ -226,6 +256,18 @@ def main() -> int: "metrics": metrics, } ) + save_best_submission( + judge_payload, + { + "submission_uuid": sub_uuid, + "ts": now_iso(), + "score_raw": score, + "score_unbounded": score_unbounded, + "elapsed_seconds": elapsed_seconds, + "detail": message, + "metrics": metrics, + }, + ) print(f"[submit] uuid={sub_uuid}") print( diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py index 2807a349..d1e6c895 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py @@ -25,8 +25,11 @@ VERIFIER_SUBMISSIONS_LOG = Path("/logs/verifier/submissions.jsonl") VERIFIER_JUDGE_READY_LOG = Path("/logs/verifier/judge_ready.json") EVALUATION_JSON = Path("/logs/verifier/evaluation_result.json") +BEST_AGENT_PAYLOAD = Path("/logs/agent/best_submission_payload.json") +BEST_AGENT_META = Path("/logs/agent/best_submission_meta.json") JUDGE_URL = os.environ.get("JUDGE_URL", 
"http://judge:8082").rstrip("/") JUDGE_TIMEOUT_SECONDS = int(os.environ.get("JUDGE_TIMEOUT_SECONDS", "10800")) +FINAL_ROLE_TOKEN = "{verifier_token}" def submission_reward(record: dict) -> float | None: @@ -36,6 +39,11 @@ def submission_reward(record: dict) -> float | None: return None +def result_score_key(record: dict) -> tuple[float, float]: + score = float(record.get("score", 0.0)) + return (score, float(record.get("score_unbounded", score))) + + def best_submission() -> dict | None: submissions_log = ( VERIFIER_SUBMISSIONS_LOG @@ -60,8 +68,10 @@ def best_submission() -> dict | None: continue if record.get("status") != "done": continue - best_reward = submission_reward(best) if best is not None else None - if best is None or best_reward is None or reward > best_reward: + metrics = record.get("metrics", {}) + if isinstance(metrics, dict) and metrics.get("evaluation_scope") == "quick_feedback": + continue + if best is None or result_score_key(record) > result_score_key(best): best = record return best @@ -185,10 +195,13 @@ def wait_for_judge() -> None: def post_json(url: str, payload: dict) -> dict: body = json.dumps(payload).encode("utf-8") + headers = {"Content-Type": "application/json"} + if payload.get("submission_role") == "final": + headers["X-Frontier-CS-Role-Token"] = FINAL_ROLE_TOKEN req = request.Request( url, data=body, - headers={"Content-Type": "application/json"}, + headers=headers, method="POST", ) try: @@ -221,6 +234,34 @@ def build_judge_payload(solution_path: Path, config: dict) -> dict: } +def load_best_agent_payload() -> dict | None: + if not BEST_AGENT_PAYLOAD.exists(): + return None + try: + payload = json.loads(BEST_AGENT_PAYLOAD.read_text(encoding="utf-8")) + except Exception as exc: + print(f"WARN: failed to read best agent payload: {exc}") + return None + if not isinstance(payload, dict): + return None + payload = dict(payload) + payload["submission_role"] = "final" + payload["submission_uuid"] = str(uuid.uuid4()) + return payload + + 
+def describe_best_agent_payload() -> str: + if not BEST_AGENT_META.exists(): + return "best iterative artifact" + try: + meta = json.loads(BEST_AGENT_META.read_text(encoding="utf-8")) + except Exception: + return "best iterative artifact" + if not isinstance(meta, dict): + return "best iterative artifact" + return f"best iterative artifact {meta.get('submission_uuid', '')}".strip() + + def evaluate_with_judge(payload: dict) -> dict: wait_for_judge() result = post_json(f"{JUDGE_URL}/evaluate", payload) @@ -304,15 +345,46 @@ def write_best_submission_reward(reason: str) -> bool: final_result = evaluate_with_judge(build_judge_payload(solution_path, config)) copy_judge_artifacts() - final_reward = float(final_result.get("score", 0.0)) / 100.0 - best_reward = submission_reward(best) if best is not None else None - if best_reward is not None and best_reward > final_reward: + final_key = result_score_key(final_result) + best_payload = load_best_agent_payload() + if best_payload is not None: + try: + best_result = evaluate_with_judge(best_payload) + copy_judge_artifacts() + if result_score_key(best_result) > final_key: + metrics = dict(best_result.get("metrics", {}) or {}) + metrics["used_best_agent_artifact"] = 1 + best_result["metrics"] = metrics + best_result["message"] = ( + f"Using {describe_best_agent_payload()} after full-suite rerun: " + f"{best_result.get('message', '')}" + ) + write_result(best_result) + return + except Exception as exc: + print(f"WARN: failed to rerun best iterative artifact: {exc}") + if best is not None and result_score_key(best) > final_key: write_best_submission_reward("final solution scored below best submission") return write_result(final_result) except Exception as exc: print(traceback.format_exc()) copy_judge_artifacts() + best_payload = load_best_agent_payload() + if best_payload is not None: + try: + best_result = evaluate_with_judge(best_payload) + metrics = dict(best_result.get("metrics", {}) or {}) + 
metrics["used_best_agent_artifact"] = 1 + best_result["metrics"] = metrics + best_result["message"] = ( + f"Using {describe_best_agent_payload()} after final evaluation failed: " + f"{best_result.get('message', '')}" + ) + write_result(best_result) + return + except Exception as best_exc: + print(f"WARN: failed to rerun best iterative artifact: {best_exc}") if write_best_submission_reward(f"final evaluation failed: {exc}"): return write_reward(0.0, f"Evaluation failed: {exc}") diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/utils.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/utils.py index 879f48a2..b995ca97 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/utils.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/utils.py @@ -20,6 +20,7 @@ class FrontierCS20Problem: language: str timeout_seconds: int docker_image: str + judge_docker_image: str | None config: dict[str, Any] diff --git a/docs/bboplace_2_0_integration_plan.md b/docs/bboplace_2_0_integration_plan.md new file mode 100644 index 00000000..ef3e56f2 --- /dev/null +++ b/docs/bboplace_2_0_integration_plan.md @@ -0,0 +1,67 @@ +BBOPlace Frontier-CS 2.0 Integration Plan +========================================= + +Current direction: + +- Use the unified `frontier-cs-2.0` Harbor adapter, not a standalone + BBOPlace adapter. +- Add two algorithmic suite tasks: `bboplace_ispd2005` and + `bboplace_iccad2015`. +- Keep the agent/main container separate from the judge/data container. +- During agent iteration, score only the first benchmark in each suite for fast + general-design feedback. During final verification, score the full suite. +- The judge accepts `final` evaluation role only when the hidden verifier sends + the generated role token. Agent submissions, including hand-written judge + requests, are treated as quick-feedback submissions. +- The submit helper stores the best quick-feedback artifact. 
The verifier + reruns that artifact on the full suite and compares it with the current final + solution, so quick scores are never used directly as final rewards. +- Prebuild a BBOPlace judge image with the original BBOPlace-Bench runtime and + extracted ISPD2005 plus ICCAD2015 data. The evaluator uses only MGO + + MP-HPWL, so the image does not need Ray, HPO, DREAMPlace, or GPU packages. + +Evaluation flow: + +```mermaid +flowchart TD + A["Agent edits /app/solution.py"] --> B["/app/submit.sh"] + B --> C["Unified Frontier-CS 2.0 submit client"] + C --> D["Judge service in separate data image"] + D --> G["Judge checks verifier-only final role token"] + G --> H["Select quick first benchmark for agent, full suite for final"] + H --> E["Load BBOPlace runtime and hidden benchmark data"] + E --> F["For each benchmark, call solution solve(info) as nobody"] + F --> I["Validate <=16 MGO candidates and coordinate bounds"] + I --> J["Run original BBOPlace MGO MP-HPWL evaluator"] + J --> K["Score against maintained MGO baseline constants"] + K --> L["Return mean score, unbounded score, and metrics"] +``` + +Scoring: + +`score = max(0, 100 * (baseline_hpwl - candidate_hpwl) / baseline_hpwl)` + +The final task score is the mean across the benchmarks in that variant. The +unbounded score is the mean before the zero clip. + +Baseline constants: + +- ISPD2005: BBOPlace-Bench report Table III, `MGO + Vanilla-EA`, MP-HPWL, + paper unit `x10^5`, relaxed by `1.2x` for scoring. +- ICCAD2015: BBOPlace-Bench report Table V, `MGO + PSO`, MP-HPWL, paper unit + `x10^5`, relaxed by `1.2x` for scoring. + +Data status: + +- ISPD2005 checked locally from the official ISPD tarballs. The extracted data + is about 967 MiB. +- ICCAD2015 checked locally from the BBOPlace-Bench Google Drive package. The + extracted data is about 2.2 GiB. +- Parser smoke tests succeeded for `adaptec1` and `superblue1`.
+ +Future extension: + +- Add direct-placement single-instance tasks where the model receives one + evaluator input file and directly outputs a placement. That variant is better + suited for agent fine-tuning and should be separate from these suite-style + algorithmic tasks. diff --git a/tools/bboplace/Dockerfile.data b/tools/bboplace/Dockerfile.data new file mode 100644 index 00000000..ace95e01 --- /dev/null +++ b/tools/bboplace/Dockerfile.data @@ -0,0 +1,41 @@ +# syntax=docker/dockerfile:1 +# +# Build context expectation: +# BBOPlace-Bench/ pinned checkout of lamda-bbo/BBOPlace-Bench +# benchmarks/ extracted ispd2005 and iccad2015 data +# +# The manifest in this directory records the exact repository commit, data +# URLs, checksums, and observed sizes. + +FROM ubuntu:24.04 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + python3 python3-numpy python3-yaml ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /opt/bboplace-bench + +COPY BBOPlace-Bench/config ./config +COPY BBOPlace-Bench/src ./src +COPY benchmarks ./benchmarks + +RUN python3 - <<'PY' +from pathlib import Path + +root = Path("/opt/bboplace-bench") +required = [ + root / "src" / "evaluator.py", + root / "config" / "benchmark.py", + root / "benchmarks" / "ispd2005" / "adaptec1", + root / "benchmarks" / "iccad2015" / "superblue1", +] +for path in required: + if not path.exists(): + raise SystemExit(f"missing required BBOPlace data image path: {path}") +PY + +RUN chmod -R go-rwx /opt/bboplace-bench/benchmarks && \ + chmod -R go-rwx /opt/bboplace-bench/src /opt/bboplace-bench/config diff --git a/tools/bboplace/README.md b/tools/bboplace/README.md new file mode 100644 index 00000000..033cbc34 --- /dev/null +++ b/tools/bboplace/README.md @@ -0,0 +1,45 @@ +BBOPlace data image +=================== + +The BBOPlace Frontier-CS 2.0 tasks use a separate judge image: + +`ghcr.io/frontiercs/frontiercs-bboplace-data:2026-06-ispd-iccad` + +The 
agent image stays small and does not contain the benchmark data. The judge +image must contain a pinned BBOPlace-Bench checkout plus extracted ISPD2005 and +ICCAD2015 benchmark data under `/opt/bboplace-bench`. + +Expected layout: + +```text +/opt/bboplace-bench/ + config/ + src/ + benchmarks/ + ispd2005/ + iccad2015/ +``` + +The data sources, SHA256 checksums, observed sizes, and scoring constants are +tracked in `data_manifest.json`. ISPD2005 archives contain gzipped inner files; +the final image must store the uncompressed `.aux`, `.nodes`, `.nets`, `.pl`, +`.scl`, and `.wts` files because the BBOPlace reader expects those filenames. + +The Frontier-CS evaluator uses only the BBOPlace MGO + MP-HPWL path. It avoids +the original BBOPlace `src.evaluator` import path because that path imports +Ray, HPO, DREAMPlace, and other dependencies that are not needed for this CPU +metric. The data image therefore only needs Python, NumPy, PyYAML, the +BBOPlace `src/` and `config/` trees, and the extracted benchmarks. + +The image should make benchmark files readable by the root judge process only. +Submitted solution code is run as `nobody` and receives only public benchmark +metadata through `solve(info)`. 
+ +Local validation commands: + +```bash +docker build -f tools/bboplace/Dockerfile.data -t frontiercs-bboplace-data:local /path/to/context +docker tag frontiercs-bboplace-data:local ghcr.io/frontiercs/frontiercs-bboplace-data:2026-06-ispd-iccad +python3 tools/bboplace/check_constants.py +python3 tools/bboplace/check_generated_tasks.py /path/to/generated/frontier-cs-2.0 +``` diff --git a/tools/bboplace/check_constants.py b/tools/bboplace/check_constants.py new file mode 100644 index 00000000..f6d37fef --- /dev/null +++ b/tools/bboplace/check_constants.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +"""Check BBOPlace evaluator constants against the data manifest.""" + +from __future__ import annotations + +import importlib.util +import json +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[2] +MANIFEST = ROOT / "tools" / "bboplace" / "data_manifest.json" +EVALUATORS = { + "ispd2005": ROOT / "2.0" / "problems" / "bboplace_ispd2005" / "evaluator.py", + "iccad2015": ROOT / "2.0" / "problems" / "bboplace_iccad2015" / "evaluator.py", +} + + +def load_module(path: Path): + spec = importlib.util.spec_from_file_location(path.stem, path) + if spec is None or spec.loader is None: + raise RuntimeError(f"could not import {path}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def main() -> int: + manifest = json.loads(MANIFEST.read_text(encoding="utf-8")) + for dataset, evaluator_path in EVALUATORS.items(): + module = load_module(evaluator_path) + expected_scored = manifest["datasets"][dataset]["benchmarks_scored"] + expected_constants = manifest["scoring_constants"][dataset]["baseline_hpwl"] + actual_scored = list(module.BENCHMARKS) + actual_constants = {key: float(value) for key, value in module.BASELINE_HPWL.items()} + if actual_scored != expected_scored: + raise SystemExit( + f"{dataset}: benchmark list mismatch: {actual_scored} != {expected_scored}" + ) + if actual_constants != expected_constants: + raise SystemExit( + f"{dataset}: 
baseline constants mismatch: {actual_constants} != {expected_constants}" + ) + print("BBOPlace constants match data_manifest.json") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/bboplace/check_generated_tasks.py b/tools/bboplace/check_generated_tasks.py new file mode 100644 index 00000000..3178d8cd --- /dev/null +++ b/tools/bboplace/check_generated_tasks.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +"""Validate generated BBOPlace Harbor tasks for the intended evaluation flow.""" + +from __future__ import annotations + +import sys +from pathlib import Path + + +EXPECTED_TASKS = ( + "frontier-cs-2-0-bboplace-ispd2005", + "frontier-cs-2-0-bboplace-iccad2015", +) +EXPECTED_JUDGE_IMAGE = "ghcr.io/frontiercs/frontiercs-bboplace-data:2026-06-ispd-iccad" + + +def require(condition: bool, message: str) -> None: + if not condition: + raise SystemExit(message) + + +def read(path: Path) -> str: + require(path.exists(), f"missing generated file: {path}") + return path.read_text(encoding="utf-8") + + +def check_task(task_dir: Path) -> None: + env_dir = task_dir / "environment" + tests_dir = task_dir / "tests" + + dockerfile = read(env_dir / "Dockerfile") + judge_dockerfile = read(env_dir / "Dockerfile.judge") + submit_py = read(env_dir / "submit.py") + judge_server = read(env_dir / "judge_server.py") + evaluate_py = read(tests_dir / "evaluate.py") + instruction = read(task_dir / "instruction.md") + + require("FROM ubuntu:24.04" in dockerfile, f"{task_dir.name}: main image changed") + require( + f"FROM {EXPECTED_JUDGE_IMAGE}" in judge_dockerfile, + f"{task_dir.name}: judge image is not the BBOPlace data image", + ) + require("{verifier_token}" not in judge_server, f"{task_dir.name}: judge token placeholder leaked") + require("{verifier_token}" not in evaluate_py, f"{task_dir.name}: test token placeholder leaked") + require("FINAL_ROLE_TOKEN" not in submit_py, f"{task_dir.name}: final token leaked to agent submit helper") + 
require("X-Frontier-CS-Role-Token" not in submit_py, f"{task_dir.name}: final token header leaked to agent") + require("submission_role\": \"agent\"" in submit_py, f"{task_dir.name}: submit helper does not mark agent role") + require("final evaluation role is verifier-only" in judge_server, f"{task_dir.name}: judge does not guard final role") + require("Runtime and resources" in instruction, f"{task_dir.name}: missing resource statement") + require("no GPU" in instruction, f"{task_dir.name}: missing no-GPU statement") + require( + "`max_candidates_per_submission`: 16" in instruction, + f"{task_dir.name}: candidate limit should be 16", + ) + require("quick-feedback score is never used directly" in instruction, f"{task_dir.name}: missing quick/full warning") + + +def main(argv: list[str]) -> int: + if len(argv) != 2: + print("usage: check_generated_tasks.py /path/to/generated/frontier-cs-2.0", file=sys.stderr) + return 2 + root = Path(argv[1]) + for task_name in EXPECTED_TASKS: + check_task(root / task_name) + print("Generated BBOPlace tasks match the expected Harbor flow") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv)) diff --git a/tools/bboplace/data_manifest.json b/tools/bboplace/data_manifest.json new file mode 100644 index 00000000..4ea51214 --- /dev/null +++ b/tools/bboplace/data_manifest.json @@ -0,0 +1,138 @@ +{ + "name": "frontiercs-bboplace-data", + "bboplace_bench_repository": { + "url": "https://github.com/lamda-bbo/BBOPlace-Bench", + "commit": "4a0dde451e40a3f368e501df6f027c442fcca02a" + }, + "image": { + "expected_registry_tag": "ghcr.io/frontiercs/frontiercs-bboplace-data:2026-06-ispd-iccad", + "bboplace_root": "/opt/bboplace-bench", + "contains": [ + "/opt/bboplace-bench/src", + "/opt/bboplace-bench/config", + "/opt/bboplace-bench/benchmarks/ispd2005", + "/opt/bboplace-bench/benchmarks/iccad2015" + ] + }, + "datasets": { + "ispd2005": { + "benchmarks_in_image": [ + "adaptec1", + "adaptec2", + "adaptec3", + "adaptec4", + 
"bigblue1", + "bigblue2", + "bigblue3", + "bigblue4" + ], + "benchmarks_scored": [ + "adaptec1", + "adaptec2", + "adaptec3", + "adaptec4", + "bigblue1", + "bigblue3" + ], + "extracted_size_observed": "967 MiB", + "archives": [ + { + "name": "adaptec1.tar.gz", + "url": "https://www.ispd.cc/contests/05/ispd05dp.tarballs/adaptec1.tar.gz", + "sha256": "b694dedfe15bffa7cb92dfbee0bc11906f5d334d211f51d82d0ac1effb6c0a08" + }, + { + "name": "adaptec2.tar.gz", + "url": "https://www.ispd.cc/contests/05/ispd05dp.tarballs/adaptec2.tar.gz", + "sha256": "e5a7bc0e343a97f3d9d3a1c871636a4b51da7f64ee71d2f04e7db295655a09a2" + }, + { + "name": "adaptec3.tar.gz", + "url": "https://www.ispd.cc/contests/05/ispd05dp.tarballs/adaptec3.tar.gz", + "sha256": "9ddc8f3040bd5d08609d2ff439484c3c8e3896edd9c73e5309477a2284f1edae" + }, + { + "name": "adaptec4.tar.gz", + "url": "https://www.ispd.cc/contests/05/ispd05dp.tarballs/adaptec4.tar.gz", + "sha256": "ca894bcf93ace5998dd393a6b6d5f240d3c695159cdc62ab055b8edf70ef46ab" + }, + { + "name": "bigblue1.tar.gz", + "url": "https://www.ispd.cc/contests/05/ispd05dp.tarballs/bigblue1.tar.gz", + "sha256": "d2a7a7df13242dc29f811dee8cbe4d8cd34bce6b502099ef176a3d5f7595a89d" + }, + { + "name": "bigblue2.tar.gz", + "url": "https://www.ispd.cc/contests/05/ispd05dp.tarballs/bigblue2.tar.gz", + "sha256": "3748e13367578b014424378fb441c89086c111770f835ef5b4afc3601e28316d" + }, + { + "name": "bigblue3.tar.gz", + "url": "https://www.ispd.cc/contests/05/ispd05dp.tarballs/bigblue3.tar.gz", + "sha256": "343d0c3a06eae195d92fded15dc8c8c6997db8dc76ff1da8d9c31e56bb927892" + }, + { + "name": "bigblue4.tar.gz", + "url": "https://www.ispd.cc/contests/05/ispd05dp.tarballs/bigblue4.tar.gz", + "sha256": "afd489f440d3323b643baab9082c8c9b40f350ef482d58e6ce52e431c66f6dba" + } + ] + }, + "iccad2015": { + "benchmarks_in_image": [ + "superblue1", + "superblue3", + "superblue4", + "superblue5", + "superblue7", + "superblue10", + "superblue16", + "superblue18" + ], + "benchmarks_scored": [ 
+ "superblue1", + "superblue3", + "superblue4", + "superblue5", + "superblue7", + "superblue10", + "superblue16", + "superblue18" + ], + "archive": { + "name": "iccad2015_benchmark.zip", + "source": "BBOPlace-Bench README Google Drive file 1JEC17FmL2cM8BEAewENvRyG6aWxH53mX", + "sha256": "295120fa9f4e46013b68f90ed29603827e9768b84844c461c576881baa8e627a" + }, + "archive_size_observed": "513 MiB", + "extracted_size_observed": "2.2 GiB" + } + }, + "scoring_constants": { + "unit": "raw HPWL; source tables report MP-HPWL in x10^5; scoring baselines are relaxed by 1.2x", + "ispd2005": { + "source": "BBOPlace-Bench report Table III, MGO + Vanilla-EA, relaxed by 1.2x", + "baseline_hpwl": { + "adaptec1": 696000.0, + "adaptec2": 7375200.0, + "adaptec3": 6735600.0, + "adaptec4": 6814800.0, + "bigblue1": 276000.0, + "bigblue3": 6288000.0 + } + }, + "iccad2015": { + "source": "BBOPlace-Bench report Table V, MGO + PSO, relaxed by 1.2x", + "baseline_hpwl": { + "superblue1": 69600.0, + "superblue3": 182400.0, + "superblue4": 112800.0, + "superblue5": 451200.0, + "superblue7": 202800.0, + "superblue10": 64800.0, + "superblue16": 115200.0, + "superblue18": 57600.0 + } + } + } +}