From f98ee58bca86446dd0a19240247755949782e7e5 Mon Sep 17 00:00:00 2001 From: Qiuyang Mang Date: Wed, 27 May 2026 08:15:08 -0700 Subject: [PATCH 1/3] feat: isolate Frontier-CS 2.0 evaluator --- 2.0/problems/erdos_unit_distance/evaluator.py | 98 ++++++++++++---- 2.0/problems/erdos_unit_distance/readme | 4 +- adapters/frontier-cs-2.0/README.md | 13 ++- .../src/frontier_cs_2_0/adapter.py | 23 +++- .../task-template/environment/Dockerfile | 2 +- .../environment/Dockerfile.judge | 13 +++ .../environment/docker-compose.yaml | 17 +++ .../task-template/environment/judge_server.py | 108 ++++++++++++++++++ .../task-template/environment/submit.py | 47 ++++++-- 9 files changed, 278 insertions(+), 47 deletions(-) create mode 100644 adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge create mode 100644 adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml create mode 100644 adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py diff --git a/2.0/problems/erdos_unit_distance/evaluator.py b/2.0/problems/erdos_unit_distance/evaluator.py index 0ee6c289..9be17a37 100644 --- a/2.0/problems/erdos_unit_distance/evaluator.py +++ b/2.0/problems/erdos_unit_distance/evaluator.py @@ -4,7 +4,10 @@ import importlib.util import math +import os import pickle +import pwd +import shutil import subprocess import sys import tempfile @@ -16,9 +19,40 @@ BASELINE_EDGES = N_POINTS TIMEOUT_SECONDS = 10800 UNIT_DISTANCE = 1.0 -DISTANCE_REL_TOL = 1e-7 -DISTANCE_ABS_TOL = 1e-9 -MIN_SEPARATION = 1e-6 +DISTANCE_REL_TOL = 1e-10 +DISTANCE_ABS_TOL = 1e-10 +MIN_SEPARATION = 1e-3 + + +def _protect_evaluator_source() -> None: + """Hide evaluator source from unprivileged submitted solutions in containers.""" + try: + evaluator_path = Path(__file__).resolve() + if str(evaluator_path).startswith(("/judge/", "/tests/")) and os.geteuid() == 0: + evaluator_path.chmod(0o600) + except Exception: + pass + + +_protect_evaluator_source() + + +def _solution_preexec(): + """Return a preexec_fn that runs submitted code as nobody when possible.""" + if os.name != "posix": + return None + try: + if os.geteuid() != 0: + return None + nobody = pwd.getpwnam("nobody") + except Exception: + return None + + def demote() -> None: + os.setgid(nobody.pw_gid) + os.setuid(nobody.pw_uid) + + return demote def _is_number(value: Any) -> bool: @@ -72,50 +106,76 @@ def _load_points(solution_path: str) -> Any: def _run_solution(solution_path: str) -> tuple[Any, str]: with tempfile.TemporaryDirectory(prefix="erdos_unit_distance_") as tmp: + tmp_path = Path(tmp) + isolated_solution_path = tmp_path / "solution.py" result_path = Path(tmp) / "result.pkl" runner_path = Path(tmp) / "runner.py" + shutil.copy2(solution_path, isolated_solution_path) runner_path.write_text( """ +import importlib.util import pickle -import traceback from pathlib import Path solution_path = __SOLUTION_PATH__ result_path = Path(__RESULT_PATH__) +n_points = __N_POINTS__ + + +def load_points(): + spec = importlib.util.spec_from_file_location("solution", solution_path) + if spec is None or spec.loader is None: + raise RuntimeError("could not import solution") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + for name in ("solve", "generate_points", "run"): + fn = getattr(module, name, None) + if callable(fn): + return fn(n_points) + + points = getattr(module, "POINTS", None) + if points is not None: + return points + + raise RuntimeError("solution must define solve(n), generate_points(n), run(n), or POINTS") try: - import importlib.util - spec = importlib.util.spec_from_file_location("evaluator", __EVALUATOR_PATH__) - evaluator = importlib.util.module_from_spec(spec) - spec.loader.exec_module(evaluator) - points = evaluator._load_points(solution_path) + points = load_points() with result_path.open("wb") as f: pickle.dump({"points": points}, f) -except Exception as exc: +except Exception: with result_path.open("wb") as f: - pickle.dump({"error": str(exc), "trace": traceback.format_exc()}, f) -""".replace("__SOLUTION_PATH__", repr(solution_path)) + pickle.dump({"error": "solution failed while generating points"}, f) +""".replace("__SOLUTION_PATH__", repr(str(isolated_solution_path))) .replace("__RESULT_PATH__", repr(str(result_path))) - .replace("__EVALUATOR_PATH__", repr(str(Path(__file__).resolve()))), + .replace("__N_POINTS__", repr(N_POINTS)), encoding="utf-8", ) + preexec_fn = _solution_preexec() + if preexec_fn is not None: + nobody = pwd.getpwnam("nobody") + os.chown(tmp, nobody.pw_uid, nobody.pw_gid) + os.chown(isolated_solution_path, nobody.pw_uid, nobody.pw_gid) + os.chown(runner_path, nobody.pw_uid, nobody.pw_gid) + os.chmod(tmp, 0o700 if preexec_fn is not None else 0o755) proc = subprocess.run( [sys.executable, str(runner_path)], capture_output=True, text=True, timeout=TIMEOUT_SECONDS, + preexec_fn=preexec_fn, ) - logs = (proc.stdout or "") + (proc.stderr or "") if proc.returncode != 0: - raise RuntimeError(f"solution runner exited with code {proc.returncode}\n{logs}") + raise RuntimeError(f"solution runner exited with code {proc.returncode}") if not result_path.exists(): raise RuntimeError("solution did not produce a result") with result_path.open("rb") as f: payload = pickle.load(f) if "error" in payload: - raise RuntimeError(payload["error"] + "\n" + payload.get("trace", "")) - return payload["points"], logs + raise RuntimeError("solution failed while generating points") + return payload["points"], "" def _validate_points(points: list[tuple[float, float]]) -> None: @@ -162,7 +222,7 @@ def _count_unit_distance_pairs(points: list[tuple[float, float]]) -> int: def evaluate(solution_path: str) -> tuple[float, float, str]: - raw_points, logs = _run_solution(solution_path) + raw_points, _ = _run_solution(solution_path) points = _to_points(raw_points) _validate_points(points) unit_pairs = _count_unit_distance_pairs(points) @@ -177,8 +237,6 @@ def evaluate(solution_path: str) -> tuple[float, float, str]: f"baseline={BASELINE_EDGES}; " f"score={score:.6f}; score_unbounded={score_unbounded:.6f}" ) - if logs: - message += "\n" + logs.strip() return score, score_unbounded, message diff --git a/2.0/problems/erdos_unit_distance/readme b/2.0/problems/erdos_unit_distance/readme index 095aff29..eaf04115 100644 --- a/2.0/problems/erdos_unit_distance/readme +++ b/2.0/problems/erdos_unit_distance/readme @@ -41,7 +41,7 @@ A solution is valid if: 1. It returns exactly 65536 points. 2. Every coordinate is a finite real number. -3. No two points are closer than `1e-6`. +3. No two points are closer than `1e-3`. The objective is translation-invariant. Very large coordinates are allowed as long as pairwise squared distances remain finite. @@ -49,7 +49,7 @@ long as pairwise squared distances remain finite. ## Objective For all unordered point pairs, count those whose squared Euclidean distance is -equal to `1` within a small floating-point tolerance. Let `M` be that count. +equal to `1` within a strict floating-point tolerance. Let `M` be that count. Maximize `M`. diff --git a/adapters/frontier-cs-2.0/README.md b/adapters/frontier-cs-2.0/README.md index 8c8e1ae7..302e9025 100644 --- a/adapters/frontier-cs-2.0/README.md +++ b/adapters/frontier-cs-2.0/README.md @@ -37,9 +37,9 @@ uv run harbor trial start -p datasets/frontier-cs-2.0/frontier-cs-2-0-erdos-unit ## Task Contract -The agent works in `/app` and must create `/app/solution.py`. The verifier runs -the original Frontier-CS `2.0` evaluator and writes a normalized reward in -`/logs/verifier/reward.txt`. +The agent works in `/app` and must create `/app/solution.py`. The final +verifier runs the original Frontier-CS `2.0` evaluator and writes a normalized +reward in `/logs/verifier/reward.txt`. During the trial, the agent can call: @@ -47,9 +47,10 @@ During the trial, the agent can call: bash /app/submit.sh ``` -This runs the same evaluator against the current `/app/solution.py`, prints the -score and feedback, and records each attempt in -`/logs/agent/submissions.jsonl`. The final verifier mirrors that log to +This submits the current `/app/solution.py` to a black-box judge service, +prints the score and feedback, and records each attempt in +`/logs/agent/submissions.jsonl`. The evaluator source is not mounted into the +agent workspace. The final verifier mirrors that log to `/logs/verifier/submissions.jsonl` for process-reward analysis. The reported reward is the maximum of the final `/app/solution.py` score and the best successful iterative submission, so a timed-out agent can keep its best diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py index 2539668d..bc0c4bb9 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py @@ -133,11 +133,11 @@ def generate_task( def _write_instruction(self, task_paths: "TaskPaths", problem: FrontierCS20Problem) -> None: instruction = ( "You are solving a Frontier-CS 2.0 open-ended optimization problem.\n\n" - "Create a Python solution at `/app/solution.py`. The verifier will run " - "the source Frontier-CS evaluator and convert its 0-100 score into a " - "Harbor reward in [0, 1]. You can call `bash /app/submit.sh` at any " - "time to grade the current solution with the same evaluator and get " - "feedback before the final verifier run.\n\n" + "Create a Python solution at `/app/solution.py`. You can call " + "`bash /app/submit.sh` at any time to grade the current solution " + "with the same black-box judge used by the final verifier and get " + "score feedback. The evaluator implementation is intentionally not " + "available in the agent workspace.\n\n" f"Problem id: `{problem.problem_id}`\n" f"Language: `{problem.language}`\n" f"Time limit: `{problem.timeout_seconds}s`\n\n" @@ -161,8 +161,19 @@ def _write_environment(self, task_paths: "TaskPaths", problem: FrontierCS20Probl src = problem.problem_dir / name if src.exists(): shutil.copy2(src, env_dir / name) + + judge_dockerfile = ( + self.template_dir / "environment" / "Dockerfile.judge" + ).read_text(encoding="utf-8") + env_dir.joinpath("Dockerfile.judge").write_text( + judge_dockerfile.replace("{base_image}", image), + encoding="utf-8", + ) + for name in ("docker-compose.yaml", "judge_server.py", "submit.py"): + shutil.copy2(self.template_dir / "environment" / name, env_dir / name) + # Kept in the build context for the judge image only; the main agent + # image's Dockerfile does not copy this into /app. shutil.copy2(problem.problem_dir / "evaluator.py", env_dir / "problem_evaluator.py") - shutil.copy2(self.template_dir / "environment" / "submit.py", env_dir / "submit.py") submit_sh = env_dir / "submit.sh" shutil.copy2(self.template_dir / "environment" / "submit.sh", submit_sh) submit_sh.chmod(0o755) diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile index 3ef7a17c..7873525c 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile @@ -23,5 +23,5 @@ ENV CLAUDE_CODE_MAX_OUTPUT_TOKENS=128000 WORKDIR /app -COPY readme config.yaml problem_evaluator.py submit.py submit.sh /app/ +COPY readme config.yaml submit.py submit.sh /app/ RUN chmod +x /app/submit.sh diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge new file mode 100644 index 00000000..6207d4fa --- /dev/null +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge @@ -0,0 +1,13 @@ +FROM {base_image} + +RUN apt-get update && \ + apt-get install -y --no-install-recommends python3 ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /judge + +COPY judge_server.py problem_evaluator.py /judge/ +RUN chmod 600 /judge/problem_evaluator.py + +EXPOSE 8082 +CMD ["python3", "/judge/judge_server.py"] diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml new file mode 100644 index 00000000..4e07cbdc --- /dev/null +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml @@ -0,0 +1,17 @@ +services: + main: + depends_on: + judge: + condition: service_started + environment: + JUDGE_URL: "http://judge:8082" + + judge: + build: + context: ${CONTEXT_DIR} + dockerfile: Dockerfile.judge + command: ["python3", "/judge/judge_server.py"] + expose: + - "8082" + environment: + PORT: "8082" diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py new file mode 100644 index 00000000..9b9ef1d1 --- /dev/null +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +"""Black-box Frontier-CS 2.0 judge service for Harbor agent submissions.""" + +from __future__ import annotations + +import importlib.util +import json +import os +import tempfile +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path +from typing import Any + +PROBLEM_EVALUATOR_PATH = Path("/judge/problem_evaluator.py") +MAX_SUBMISSION_BYTES = 2_000_000 + + +def load_problem_evaluator(): + spec = importlib.util.spec_from_file_location( + "frontier_cs_2_0_problem_evaluator", PROBLEM_EVALUATOR_PATH + ) + if spec is None or spec.loader is None: + raise RuntimeError(f"could not load evaluator from {PROBLEM_EVALUATOR_PATH}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +EVALUATOR = load_problem_evaluator() + + +def evaluate_code(code: str) -> dict[str, Any]: + with tempfile.TemporaryDirectory(prefix="frontier_cs_2_0_submission_") as tmp: + solution_path = Path(tmp) / "solution.py" + solution_path.write_text(code, encoding="utf-8") + score, score_unbounded, message = EVALUATOR.evaluate(str(solution_path)) + return { + "status": "done", + "score": float(score), + "score_unbounded": float(score_unbounded), + "message": message, + } + + +class JudgeHandler(BaseHTTPRequestHandler): + server_version = "FrontierCS20Judge/1.0" + + def _write_json(self, status: int, payload: dict[str, Any]) -> None: + body = json.dumps(payload, ensure_ascii=False).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def do_GET(self) -> None: + if self.path == "/health": + self._write_json(200, {"status": "ok"}) + return + self._write_json(404, {"status": "error", "error": "not found"}) + + def do_POST(self) -> None: + if self.path != "/evaluate": + self._write_json(404, {"status": "error", "error": "not found"}) + return + + try: + content_length = int(self.headers.get("Content-Length", "0")) + except ValueError: + self._write_json(400, {"status": "error", "error": "invalid content length"}) + return + + if content_length <= 0: + self._write_json(400, {"status": "error", "error": "empty request body"}) + return + if content_length > MAX_SUBMISSION_BYTES: + self._write_json(413, {"status": "error", "error": "submission too large"}) + return + + try: + payload = json.loads(self.rfile.read(content_length).decode("utf-8")) + code = payload.get("code") + if not isinstance(code, str) or not code.strip(): + raise ValueError("request JSON must include non-empty string field 'code'") + self._write_json(200, evaluate_code(code)) + except Exception: + self._write_json( + 200, + { + "status": "error", + "score": 0.0, + "score_unbounded": 0.0, + "message": "evaluation failed", + }, + ) + + def log_message(self, fmt: str, *args: object) -> None: + return + + +def main() -> None: + port = int(os.environ.get("PORT", "8082")) + server = ThreadingHTTPServer(("0.0.0.0", port), JudgeHandler) + server.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py index eeab3c93..0f89f1f3 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py @@ -3,8 +3,8 @@ from __future__ import annotations -import importlib.util import json +import os import sys import time import traceback @@ -12,9 +12,12 @@ from datetime import datetime, timezone from pathlib import Path +import requests + SOLUTION_PATH = Path("/app/solution.py") -PROBLEM_EVALUATOR_PATH = Path("/app/problem_evaluator.py") SUBMISSIONS_LOG = Path("/logs/agent/submissions.jsonl") +JUDGE_URL = os.environ.get("JUDGE_URL", "http://judge:8082").rstrip("/") +JUDGE_TIMEOUT_SECONDS = int(os.environ.get("JUDGE_TIMEOUT_SECONDS", "10800")) def now_iso() -> str: @@ -31,15 +34,36 @@ def log_record(record: dict) -> None: f.write(json.dumps(record, ensure_ascii=False) + "\n") -def load_problem_evaluator(): - spec = importlib.util.spec_from_file_location( - "frontier_cs_2_0_problem_evaluator", PROBLEM_EVALUATOR_PATH +def wait_for_judge() -> None: + deadline = time.time() + 60 + last_error: Exception | None = None + while time.time() < deadline: + try: + response = requests.get(f"{JUDGE_URL}/health", timeout=5) + if response.status_code == 200: + return + except Exception as exc: + last_error = exc + time.sleep(1) + raise RuntimeError(f"judge service is not ready at {JUDGE_URL}: {last_error}") + + +def evaluate_with_judge(code: str) -> tuple[float, float, str]: + wait_for_judge() + response = requests.post( + f"{JUDGE_URL}/evaluate", + json={"code": code}, + timeout=JUDGE_TIMEOUT_SECONDS, + ) + response.raise_for_status() + payload = response.json() + if payload.get("status") != "done": + raise RuntimeError(str(payload.get("message") or payload.get("error") or payload)) + return ( + float(payload.get("score", 0.0)), + float(payload.get("score_unbounded", payload.get("score", 0.0))), + str(payload.get("message", "")), ) - if spec is None or spec.loader is None: - raise RuntimeError(f"could not load evaluator from {PROBLEM_EVALUATOR_PATH}") - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module def main() -> int: @@ -88,8 +112,7 @@ def main() -> int: try: start = time.time() - evaluator = load_problem_evaluator() - score, score_unbounded, message = evaluator.evaluate(str(solution_path)) + score, score_unbounded, message = evaluate_with_judge(code) elapsed_seconds = time.time() - start reward = float(score) / 100.0 From ce02f7253bcc0c9e885abbdd828aedae06a057f5 Mon Sep 17 00:00:00 2001 From: Qiuyang Mang Date: Wed, 27 May 2026 07:40:47 -0700 Subject: [PATCH 2/3] feat: add Erdos demo task --- 2.0/README.md | 7 + 2.0/problems/erdos_demo/config.yaml | 7 + 2.0/problems/erdos_demo/evaluate.sh | 12 ++ 2.0/problems/erdos_demo/evaluator.py | 263 +++++++++++++++++++++++++++ 2.0/problems/erdos_demo/readme | 72 ++++++++ 2.0/problems/erdos_demo/reference.py | 16 ++ README.md | 8 +- adapters/frontier-cs-2.0/README.md | 11 ++ 8 files changed, 394 insertions(+), 2 deletions(-) create mode 100644 2.0/problems/erdos_demo/config.yaml create mode 100644 2.0/problems/erdos_demo/evaluate.sh create mode 100644 2.0/problems/erdos_demo/evaluator.py create mode 100644 2.0/problems/erdos_demo/readme create mode 100644 2.0/problems/erdos_demo/reference.py diff --git a/2.0/README.md b/2.0/README.md index 6e56e5e6..84d3ade0 100644 --- a/2.0/README.md +++ b/2.0/README.md @@ -12,3 +12,10 @@ that as many pairs as possible have distance exactly `1`. Its problem ID is `erdos_unit_distance`, matching the problem directory name. It is inspired by the planar unit distance problem highlighted by OpenAI's May 2026 unit-distance result. + +## Erdos Unit Distance Demo + +The demo variant uses the same interface and scoring rule with only `N = 10` +points. Its problem ID is `erdos_demo`. It is intended as a quick visual sanity +check for Harborized agent workflows before running the larger +`erdos_unit_distance` task. diff --git a/2.0/problems/erdos_demo/config.yaml b/2.0/problems/erdos_demo/config.yaml new file mode 100644 index 00000000..ba4256ac --- /dev/null +++ b/2.0/problems/erdos_demo/config.yaml @@ -0,0 +1,7 @@ +tag: geometry +runtime: + language: python + timeout_seconds: 300 + environment: "Python 3.11; no external packages required" + docker: + image: ubuntu:24.04 diff --git a/2.0/problems/erdos_demo/evaluate.sh b/2.0/problems/erdos_demo/evaluate.sh new file mode 100644 index 00000000..23cd83b3 --- /dev/null +++ b/2.0/problems/erdos_demo/evaluate.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +SOLUTION="/work/execution_env/solution_env/solution.py" + +if [[ ! -f "$SOLUTION" ]]; then + echo "Error: Missing $SOLUTION" >&2 + exit 1 +fi + +python "$SCRIPT_DIR/evaluator.py" "$SOLUTION" diff --git a/2.0/problems/erdos_demo/evaluator.py b/2.0/problems/erdos_demo/evaluator.py new file mode 100644 index 00000000..26c0feca --- /dev/null +++ b/2.0/problems/erdos_demo/evaluator.py @@ -0,0 +1,263 @@ +"""Evaluator for the Erdos unit distance demo problem.""" + +from __future__ import annotations + +import importlib.util +import math +import os +import pickle +import pwd +import shutil +import subprocess +import sys +import tempfile +import traceback +from pathlib import Path +from typing import Any + +N_POINTS = 10 +BASELINE_EDGES = N_POINTS +TIMEOUT_SECONDS = 300 +UNIT_DISTANCE = 1.0 +DISTANCE_REL_TOL = 1e-7 +DISTANCE_ABS_TOL = 1e-9 +MIN_SEPARATION = 1e-6 + + +def _protect_evaluator_source() -> None: + """Hide evaluator source from unprivileged submitted solutions in containers.""" + try: + evaluator_path = Path(__file__).resolve() + if str(evaluator_path).startswith(("/judge/", "/tests/")) and os.geteuid() == 0: + evaluator_path.chmod(0o600) + except Exception: + pass + + +_protect_evaluator_source() + + +def _solution_preexec(): + """Return a preexec_fn that runs submitted code as nobody when possible.""" + if os.name != "posix": + return None + try: + if os.geteuid() != 0: + return None + nobody = pwd.getpwnam("nobody") + except Exception: + return None + + def demote() -> None: + os.setgid(nobody.pw_gid) + os.setuid(nobody.pw_uid) + + return demote + + +def _is_number(value: Any) -> bool: + if isinstance(value, bool): + return False + try: + return math.isfinite(float(value)) + except Exception: + return False + + +def _to_points(raw: Any) -> list[tuple[float, float]]: + try: + values = raw.tolist() + except Exception: + values = list(raw) + + points: list[tuple[float, float]] = [] + for index, item in enumerate(values): + try: + pair = item.tolist() + except Exception: + pair = item + if not isinstance(pair, (list, tuple)) or len(pair) != 2: + raise ValueError(f"point {index} is not a 2D coordinate pair") + x, y = pair + if not _is_number(x) or not _is_number(y): + raise ValueError(f"point {index} has a non-finite coordinate") + points.append((float(x), float(y))) + return points + + +def _load_points(solution_path: str) -> Any: + spec = importlib.util.spec_from_file_location("solution", solution_path) + if spec is None or spec.loader is None: + raise RuntimeError(f"could not import solution from {solution_path}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + for name in ("solve", "generate_points", "run"): + fn = getattr(module, name, None) + if callable(fn): + return fn(N_POINTS) + + points = getattr(module, "POINTS", None) + if points is not None: + return points + + raise RuntimeError("solution must define solve(n), generate_points(n), run(n), or POINTS") + + +def _run_solution(solution_path: str) -> tuple[Any, str]: + with tempfile.TemporaryDirectory(prefix="erdos_unit_distance_") as tmp: + tmp_path = Path(tmp) + isolated_solution_path = tmp_path / "solution.py" + result_path = Path(tmp) / "result.pkl" + runner_path = Path(tmp) / "runner.py" + shutil.copy2(solution_path, isolated_solution_path) + runner_path.write_text( + """ +import importlib.util +import pickle +from pathlib import Path + +solution_path = __SOLUTION_PATH__ +result_path = Path(__RESULT_PATH__) +n_points = __N_POINTS__ + + +def load_points(): + spec = importlib.util.spec_from_file_location("solution", solution_path) + if spec is None or spec.loader is None: + raise RuntimeError("could not import solution") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + for name in ("solve", "generate_points", "run"): + fn = getattr(module, name, None) + if callable(fn): + return fn(n_points) + + points = getattr(module, "POINTS", None) + if points is not None: + return points + + raise RuntimeError("solution must define solve(n), generate_points(n), run(n), or POINTS") + +try: + points = load_points() + with result_path.open("wb") as f: + pickle.dump({"points": points}, f) +except Exception: + with result_path.open("wb") as f: + pickle.dump({"error": "solution failed while generating points"}, f) +""".replace("__SOLUTION_PATH__", repr(str(isolated_solution_path))) + .replace("__RESULT_PATH__", repr(str(result_path))) + .replace("__N_POINTS__", repr(N_POINTS)), + encoding="utf-8", + ) + preexec_fn = _solution_preexec() + if preexec_fn is not None: + nobody = pwd.getpwnam("nobody") + os.chown(tmp, nobody.pw_uid, nobody.pw_gid) + os.chown(isolated_solution_path, nobody.pw_uid, nobody.pw_gid) + os.chown(runner_path, nobody.pw_uid, nobody.pw_gid) + os.chmod(tmp, 0o700 if preexec_fn is not None else 0o755) + + proc = subprocess.run( + [sys.executable, str(runner_path)], + capture_output=True, + text=True, + timeout=TIMEOUT_SECONDS, + preexec_fn=preexec_fn, + ) + if proc.returncode != 0: + raise RuntimeError(f"solution runner exited with code {proc.returncode}") + if not result_path.exists(): + raise RuntimeError("solution did not produce a result") + with result_path.open("rb") as f: + payload = pickle.load(f) + if "error" in payload: + raise RuntimeError("solution failed while generating points") + return payload["points"], "" + + +def _validate_points(points: list[tuple[float, float]]) -> None: + if len(points) != N_POINTS: + raise ValueError(f"expected {N_POINTS} points, got {len(points)}") + + buckets: dict[tuple[int, int], list[tuple[float, float]]] = {} + min_sep2 = MIN_SEPARATION * MIN_SEPARATION + for index, (x, y) in enumerate(points): + if not math.isfinite(x) or not math.isfinite(y): + raise ValueError(f"point {index} has a non-finite coordinate") + key = (math.floor(x / MIN_SEPARATION), math.floor(y / MIN_SEPARATION)) + for dx in (-1, 0, 1): + for dy in (-1, 0, 1): + for px, py in buckets.get((key[0] + dx, key[1] + dy), ()): + sep2 = (x - px) * (x - px) + (y - py) * (y - py) + if sep2 < min_sep2: + raise ValueError( + f"point {index} is closer than {MIN_SEPARATION:g} to another point" + ) + buckets.setdefault(key, []).append((x, y)) + + +def _count_unit_distance_pairs(points: list[tuple[float, float]]) -> int: + buckets: dict[tuple[int, int], list[tuple[float, float]]] = {} + target2 = UNIT_DISTANCE * UNIT_DISTANCE + tol = max(DISTANCE_ABS_TOL, DISTANCE_REL_TOL * target2) + neighbor_radius = math.ceil((UNIT_DISTANCE + tol) / UNIT_DISTANCE) + 1 + unit_pairs = 0 + + for x, y in points: + key = (math.floor(x / UNIT_DISTANCE), math.floor(y / UNIT_DISTANCE)) + for dx in range(-neighbor_radius, neighbor_radius + 1): + for dy in range(-neighbor_radius, neighbor_radius + 1): + for px, py in buckets.get((key[0] + dx, key[1] + dy), ()): + d2 = (x - px) * (x - px) + (y - py) * (y - py) + if not math.isfinite(d2): + raise ValueError("pairwise distance overflowed") + if abs(d2 - target2) <= tol: + unit_pairs += 1 + buckets.setdefault(key, []).append((x, y)) + + return unit_pairs + + +def evaluate(solution_path: str) -> tuple[float, float, str]: + raw_points, _ = _run_solution(solution_path) + points = _to_points(raw_points) + _validate_points(points) + unit_pairs = _count_unit_distance_pairs(points) + + if unit_pairs <= 0: + score = 0.0 + else: + score = max(0.0, 100.0 * (unit_pairs - BASELINE_EDGES) / unit_pairs) + score_unbounded = score + message = ( + f"N={N_POINTS}; unit_pairs={unit_pairs}; unit_distance={UNIT_DISTANCE:.12g}; " + f"baseline={BASELINE_EDGES}; " + f"score={score:.6f}; score_unbounded={score_unbounded:.6f}" + ) + return score, score_unbounded, message + + +def main(argv: list[str]) -> int: + if len(argv) != 2: + print("usage: evaluator.py /path/to/solution.py", file=sys.stderr) + return 1 + try: + score, score_unbounded, message = evaluate(argv[1]) + print(message, file=sys.stderr) + print(f"{score:.12f} {score_unbounded:.12f}") + return 0 + except subprocess.TimeoutExpired: + print(f"timed out after {TIMEOUT_SECONDS}s", file=sys.stderr) + print("0.0 0.0") + return 0 + except Exception: + print(traceback.format_exc(), file=sys.stderr) + print("0.0 0.0") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv)) diff --git a/2.0/problems/erdos_demo/readme b/2.0/problems/erdos_demo/readme new file mode 100644 index 00000000..2f8715c2 --- /dev/null +++ b/2.0/problems/erdos_demo/readme @@ -0,0 +1,72 @@ +# Erdos Unit Distance Demo + +## Problem + +Place exactly `N = 10` distinct points in the Euclidean plane so that the +number of point pairs at Euclidean distance exactly `1` is as large as possible. + +This is a tiny, visually inspectable demo version of the planar unit distance +problem. If your construction naturally has a different common distance, scale +the coordinates before returning them. + +## Program Interface + +Submit a Python file defining one of the following: + +```python +def solve(n: int) -> list[tuple[float, float]]: + ... +``` + +or: + +```python +def generate_points(n: int) -> list[tuple[float, float]]: + ... +``` + +or: + +```python +POINTS = [(0.0, 0.0), (1.0, 0.0), ...] +``` + +The returned value must contain exactly 10 two-dimensional points. No stdin is +used. + +## Validity Constraints + +A solution is valid if: + +1. It returns exactly 10 points. +2. Every coordinate is a finite real number. +3. No two points are closer than `1e-6`. + +The objective is translation-invariant. Very large coordinates are allowed as +long as pairwise squared distances remain finite. + +## Objective + +For all unordered point pairs, count those whose squared Euclidean distance is +equal to `1` within a small floating-point tolerance. Let `M` be that count. + +Maximize `M`. + +## Scoring + +The score is naturally scaled to `[0, 100)`, without clipping against a fixed +target. Let: + +```text +baseline = N +X = M +``` + +If the point set is invalid, or if `X <= baseline`, the score is `0`. Otherwise: + +```text +score = 100 * (X - baseline) / X +``` + +This makes the simple `N`-pair baseline worth `0`. With only 10 points, the +problem is intended as a quick sanity check and visual demo for agent workflows. diff --git a/2.0/problems/erdos_demo/reference.py b/2.0/problems/erdos_demo/reference.py new file mode 100644 index 00000000..e268ede6 --- /dev/null +++ b/2.0/problems/erdos_demo/reference.py @@ -0,0 +1,16 @@ +"""Baseline solution: equally spaced points on a regular polygon.""" + +from __future__ import annotations + +import math + + +def solve(n: int): + radius = 1.0 / (2.0 * math.sin(math.pi / n)) + return [ + ( + radius * math.cos(2.0 * math.pi * i / n), + radius * math.sin(2.0 * math.pi * i / n), + ) + for i in range(n) + ] diff --git a/README.md b/README.md index dafb7de4..25c6300a 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Research Problems Algorithmic Problems - 2.0 Problems + 2.0 Problems

## News @@ -139,7 +139,8 @@ isolated from Frontier-CS's own `uv sync` environment. Frontier-CS 2.0 is agent-first: current 2.0 problems are meant to be run through Harbor-compatible agents rather than direct one-shot solution files. -Problem IDs are their problem directory names, such as `erdos_unit_distance`. +Problem IDs are their problem directory names, such as `erdos_unit_distance` +and the small `erdos_demo`. ```bash # List 2.0 problems @@ -147,6 +148,9 @@ frontier list 2.0 # Run a 2.0 task with an agent through the Harbor wrapper uv run frontier harbor trial 2.0 erdos_unit_distance -a codex -m gpt-5.5 --json + +# Run the small N=10 demo task +uv run frontier harbor trial 2.0 erdos_demo -a codex -m gpt-5.5 --json ``` See [2.0/README.md](2.0/README.md) for the current 2.0 track. diff --git a/adapters/frontier-cs-2.0/README.md b/adapters/frontier-cs-2.0/README.md index 302e9025..342e3b41 100644 --- a/adapters/frontier-cs-2.0/README.md +++ b/adapters/frontier-cs-2.0/README.md @@ -28,11 +28,22 @@ uv run frontier-cs-2-0 \ --overwrite ``` +Generate only the small Erdos demo task: + +```bash +uv run frontier-cs-2-0 \ + --source ../.. \ + --output-dir ../../datasets/frontier-cs-2.0 \ + --task-ids erdos_demo \ + --overwrite +``` + ## Run with Harbor ```bash uv run harbor run -p datasets/frontier-cs-2.0 uv run harbor trial start -p datasets/frontier-cs-2.0/frontier-cs-2-0-erdos-unit-distance +uv run harbor trial start -p datasets/frontier-cs-2.0/frontier-cs-2-0-erdos-demo ``` ## Task Contract From 243940ac9a210394ac3fe5a467699a3d67cda120 Mon Sep 17 00:00:00 2001 From: Qiuyang Mang Date: Wed, 27 May 2026 09:20:07 -0700 Subject: [PATCH 3/3] fix: regenerate default Harbor tasks before trials --- src/frontier_cs/cli.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/frontier_cs/cli.py b/src/frontier_cs/cli.py index fa25147f..044ac735 100644 --- a/src/frontier_cs/cli.py +++ b/src/frontier_cs/cli.py @@ -1399,29 +1399,33 @@ def run_harbor(args: argparse.Namespace) -> int: return 1 task_name = _harbor_task_name(args.track, args.problem_id) + using_default_task_path = args.task_path is None task_path = args.task_path dataset_dir = args.dataset_dir or _default_harbor_dataset_dir(args.track) if task_path is None: task_path = dataset_dir / task_name - if not task_path.exists(): - if task_path == dataset_dir / task_name and not args.no_generate: - _progress(f"Generating task {task_name}") - try: - _generate_harbor_task(args.track, args.problem_id, dataset_dir) - except RuntimeError as exc: - print(f"Error: {exc}", file=sys.stderr) - return 1 - if task_path.exists(): - pass - else: - print( - f"Error: Harbor task path not found: {task_path}\n" - "Generate Harbor tasks first, or pass --task-path / --dataset-dir.", - file=sys.stderr, - ) + should_generate = ( + using_default_task_path + and task_path == dataset_dir / task_name + and not args.no_generate + ) + if should_generate: + _progress(f"Generating task {task_name}") + try: + _generate_harbor_task(args.track, args.problem_id, dataset_dir) + except RuntimeError as exc: + print(f"Error: {exc}", file=sys.stderr) return 1 + if not task_path.exists(): + print( + f"Error: Harbor task path not found: {task_path}\n" + "Generate Harbor tasks first, or pass --task-path / --dataset-dir.", + file=sys.stderr, + ) + return 1 + trials_dir = args.trials_dir or _default_harbor_trials_dir() trials_dir.mkdir(parents=True, exist_ok=True)