From f98ee58bca86446dd0a19240247755949782e7e5 Mon Sep 17 00:00:00 2001
From: Qiuyang Mang
Date: Wed, 27 May 2026 08:15:08 -0700
Subject: [PATCH 1/3] feat: isolate Frontier-CS 2.0 evaluator
---
2.0/problems/erdos_unit_distance/evaluator.py | 98 ++++++++++++----
2.0/problems/erdos_unit_distance/readme | 4 +-
adapters/frontier-cs-2.0/README.md | 13 ++-
.../src/frontier_cs_2_0/adapter.py | 23 +++-
.../task-template/environment/Dockerfile | 2 +-
.../environment/Dockerfile.judge | 13 +++
.../environment/docker-compose.yaml | 17 +++
.../task-template/environment/judge_server.py | 108 ++++++++++++++++++
.../task-template/environment/submit.py | 47 ++++++--
9 files changed, 278 insertions(+), 47 deletions(-)
create mode 100644 adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge
create mode 100644 adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml
create mode 100644 adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py
diff --git a/2.0/problems/erdos_unit_distance/evaluator.py b/2.0/problems/erdos_unit_distance/evaluator.py
index 0ee6c289..9be17a37 100644
--- a/2.0/problems/erdos_unit_distance/evaluator.py
+++ b/2.0/problems/erdos_unit_distance/evaluator.py
@@ -4,7 +4,10 @@
import importlib.util
import math
+import os
import pickle
+import pwd
+import shutil
import subprocess
import sys
import tempfile
@@ -16,9 +19,40 @@
BASELINE_EDGES = N_POINTS
TIMEOUT_SECONDS = 10800
UNIT_DISTANCE = 1.0
-DISTANCE_REL_TOL = 1e-7
-DISTANCE_ABS_TOL = 1e-9
-MIN_SEPARATION = 1e-6
+DISTANCE_REL_TOL = 1e-10
+DISTANCE_ABS_TOL = 1e-10
+MIN_SEPARATION = 1e-3
+
+
+def _protect_evaluator_source() -> None:
+ """Hide evaluator source from unprivileged submitted solutions in containers."""
+ try:
+ evaluator_path = Path(__file__).resolve()
+ if str(evaluator_path).startswith(("/judge/", "/tests/")) and os.geteuid() == 0:
+ evaluator_path.chmod(0o600)
+ except Exception:
+ pass
+
+
+_protect_evaluator_source()
+
+
+def _solution_preexec():
+ """Return a preexec_fn that runs submitted code as nobody when possible."""
+ if os.name != "posix":
+ return None
+ try:
+ if os.geteuid() != 0:
+ return None
+ nobody = pwd.getpwnam("nobody")
+ except Exception:
+ return None
+
+ def demote() -> None:
+ os.setgid(nobody.pw_gid)
+ os.setuid(nobody.pw_uid)
+
+ return demote
def _is_number(value: Any) -> bool:
@@ -72,50 +106,76 @@ def _load_points(solution_path: str) -> Any:
def _run_solution(solution_path: str) -> tuple[Any, str]:
with tempfile.TemporaryDirectory(prefix="erdos_unit_distance_") as tmp:
+ tmp_path = Path(tmp)
+ isolated_solution_path = tmp_path / "solution.py"
result_path = Path(tmp) / "result.pkl"
runner_path = Path(tmp) / "runner.py"
+ shutil.copy2(solution_path, isolated_solution_path)
runner_path.write_text(
"""
+import importlib.util
import pickle
-import traceback
from pathlib import Path
solution_path = __SOLUTION_PATH__
result_path = Path(__RESULT_PATH__)
+n_points = __N_POINTS__
+
+
+def load_points():
+ spec = importlib.util.spec_from_file_location("solution", solution_path)
+ if spec is None or spec.loader is None:
+ raise RuntimeError("could not import solution")
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+
+ for name in ("solve", "generate_points", "run"):
+ fn = getattr(module, name, None)
+ if callable(fn):
+ return fn(n_points)
+
+ points = getattr(module, "POINTS", None)
+ if points is not None:
+ return points
+
+ raise RuntimeError("solution must define solve(n), generate_points(n), run(n), or POINTS")
try:
- import importlib.util
- spec = importlib.util.spec_from_file_location("evaluator", __EVALUATOR_PATH__)
- evaluator = importlib.util.module_from_spec(spec)
- spec.loader.exec_module(evaluator)
- points = evaluator._load_points(solution_path)
+ points = load_points()
with result_path.open("wb") as f:
pickle.dump({"points": points}, f)
-except Exception as exc:
+except Exception:
with result_path.open("wb") as f:
- pickle.dump({"error": str(exc), "trace": traceback.format_exc()}, f)
-""".replace("__SOLUTION_PATH__", repr(solution_path))
+ pickle.dump({"error": "solution failed while generating points"}, f)
+""".replace("__SOLUTION_PATH__", repr(str(isolated_solution_path)))
.replace("__RESULT_PATH__", repr(str(result_path)))
- .replace("__EVALUATOR_PATH__", repr(str(Path(__file__).resolve()))),
+ .replace("__N_POINTS__", repr(N_POINTS)),
encoding="utf-8",
)
+ preexec_fn = _solution_preexec()
+ if preexec_fn is not None:
+ nobody = pwd.getpwnam("nobody")
+ os.chown(tmp, nobody.pw_uid, nobody.pw_gid)
+ os.chown(isolated_solution_path, nobody.pw_uid, nobody.pw_gid)
+ os.chown(runner_path, nobody.pw_uid, nobody.pw_gid)
+ os.chmod(tmp, 0o700 if preexec_fn is not None else 0o755)
proc = subprocess.run(
[sys.executable, str(runner_path)],
capture_output=True,
text=True,
timeout=TIMEOUT_SECONDS,
+ preexec_fn=preexec_fn,
)
- logs = (proc.stdout or "") + (proc.stderr or "")
if proc.returncode != 0:
- raise RuntimeError(f"solution runner exited with code {proc.returncode}\n{logs}")
+ raise RuntimeError(f"solution runner exited with code {proc.returncode}")
if not result_path.exists():
raise RuntimeError("solution did not produce a result")
with result_path.open("rb") as f:
payload = pickle.load(f)
if "error" in payload:
- raise RuntimeError(payload["error"] + "\n" + payload.get("trace", ""))
- return payload["points"], logs
+ raise RuntimeError("solution failed while generating points")
+ return payload["points"], ""
def _validate_points(points: list[tuple[float, float]]) -> None:
@@ -162,7 +222,7 @@ def _count_unit_distance_pairs(points: list[tuple[float, float]]) -> int:
def evaluate(solution_path: str) -> tuple[float, float, str]:
- raw_points, logs = _run_solution(solution_path)
+ raw_points, _ = _run_solution(solution_path)
points = _to_points(raw_points)
_validate_points(points)
unit_pairs = _count_unit_distance_pairs(points)
@@ -177,8 +237,6 @@ def evaluate(solution_path: str) -> tuple[float, float, str]:
f"baseline={BASELINE_EDGES}; "
f"score={score:.6f}; score_unbounded={score_unbounded:.6f}"
)
- if logs:
- message += "\n" + logs.strip()
return score, score_unbounded, message
diff --git a/2.0/problems/erdos_unit_distance/readme b/2.0/problems/erdos_unit_distance/readme
index 095aff29..eaf04115 100644
--- a/2.0/problems/erdos_unit_distance/readme
+++ b/2.0/problems/erdos_unit_distance/readme
@@ -41,7 +41,7 @@ A solution is valid if:
1. It returns exactly 65536 points.
2. Every coordinate is a finite real number.
-3. No two points are closer than `1e-6`.
+3. No two points are closer than `1e-3`.
The objective is translation-invariant. Very large coordinates are allowed as
long as pairwise squared distances remain finite.
@@ -49,7 +49,7 @@ long as pairwise squared distances remain finite.
## Objective
For all unordered point pairs, count those whose squared Euclidean distance is
-equal to `1` within a small floating-point tolerance. Let `M` be that count.
+equal to `1` within a strict floating-point tolerance. Let `M` be that count.
Maximize `M`.
diff --git a/adapters/frontier-cs-2.0/README.md b/adapters/frontier-cs-2.0/README.md
index 8c8e1ae7..302e9025 100644
--- a/adapters/frontier-cs-2.0/README.md
+++ b/adapters/frontier-cs-2.0/README.md
@@ -37,9 +37,9 @@ uv run harbor trial start -p datasets/frontier-cs-2.0/frontier-cs-2-0-erdos-unit
## Task Contract
-The agent works in `/app` and must create `/app/solution.py`. The verifier runs
-the original Frontier-CS `2.0` evaluator and writes a normalized reward in
-`/logs/verifier/reward.txt`.
+The agent works in `/app` and must create `/app/solution.py`. The final
+verifier runs the original Frontier-CS `2.0` evaluator and writes a normalized
+reward in `/logs/verifier/reward.txt`.
During the trial, the agent can call:
@@ -47,9 +47,10 @@ During the trial, the agent can call:
bash /app/submit.sh
```
-This runs the same evaluator against the current `/app/solution.py`, prints the
-score and feedback, and records each attempt in
-`/logs/agent/submissions.jsonl`. The final verifier mirrors that log to
+This submits the current `/app/solution.py` to a black-box judge service,
+prints the score and feedback, and records each attempt in
+`/logs/agent/submissions.jsonl`. The evaluator source is not mounted into the
+agent workspace. The final verifier mirrors that log to
`/logs/verifier/submissions.jsonl` for process-reward analysis. The reported
reward is the maximum of the final `/app/solution.py` score and the best
successful iterative submission, so a timed-out agent can keep its best
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py
index 2539668d..bc0c4bb9 100644
--- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py
@@ -133,11 +133,11 @@ def generate_task(
def _write_instruction(self, task_paths: "TaskPaths", problem: FrontierCS20Problem) -> None:
instruction = (
"You are solving a Frontier-CS 2.0 open-ended optimization problem.\n\n"
- "Create a Python solution at `/app/solution.py`. The verifier will run "
- "the source Frontier-CS evaluator and convert its 0-100 score into a "
- "Harbor reward in [0, 1]. You can call `bash /app/submit.sh` at any "
- "time to grade the current solution with the same evaluator and get "
- "feedback before the final verifier run.\n\n"
+ "Create a Python solution at `/app/solution.py`. You can call "
+ "`bash /app/submit.sh` at any time to grade the current solution "
+ "with the same black-box judge used by the final verifier and get "
+ "score feedback. The evaluator implementation is intentionally not "
+ "available in the agent workspace.\n\n"
f"Problem id: `{problem.problem_id}`\n"
f"Language: `{problem.language}`\n"
f"Time limit: `{problem.timeout_seconds}s`\n\n"
@@ -161,8 +161,19 @@ def _write_environment(self, task_paths: "TaskPaths", problem: FrontierCS20Probl
src = problem.problem_dir / name
if src.exists():
shutil.copy2(src, env_dir / name)
+
+ judge_dockerfile = (
+ self.template_dir / "environment" / "Dockerfile.judge"
+ ).read_text(encoding="utf-8")
+ env_dir.joinpath("Dockerfile.judge").write_text(
+ judge_dockerfile.replace("{base_image}", image),
+ encoding="utf-8",
+ )
+ for name in ("docker-compose.yaml", "judge_server.py", "submit.py"):
+ shutil.copy2(self.template_dir / "environment" / name, env_dir / name)
+ # Kept in the build context for the judge image only; the main agent
+ # image's Dockerfile does not copy this into /app.
shutil.copy2(problem.problem_dir / "evaluator.py", env_dir / "problem_evaluator.py")
- shutil.copy2(self.template_dir / "environment" / "submit.py", env_dir / "submit.py")
submit_sh = env_dir / "submit.sh"
shutil.copy2(self.template_dir / "environment" / "submit.sh", submit_sh)
submit_sh.chmod(0o755)
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile
index 3ef7a17c..7873525c 100644
--- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile
@@ -23,5 +23,5 @@ ENV CLAUDE_CODE_MAX_OUTPUT_TOKENS=128000
WORKDIR /app
-COPY readme config.yaml problem_evaluator.py submit.py submit.sh /app/
+COPY readme config.yaml submit.py submit.sh /app/
RUN chmod +x /app/submit.sh
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge
new file mode 100644
index 00000000..6207d4fa
--- /dev/null
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge
@@ -0,0 +1,13 @@
+FROM {base_image}
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends python3 ca-certificates && \
+ rm -rf /var/lib/apt/lists/*
+
+WORKDIR /judge
+
+COPY judge_server.py problem_evaluator.py /judge/
+RUN chmod 600 /judge/problem_evaluator.py
+
+EXPOSE 8082
+CMD ["python3", "/judge/judge_server.py"]
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml
new file mode 100644
index 00000000..4e07cbdc
--- /dev/null
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml
@@ -0,0 +1,17 @@
+services:
+ main:
+ depends_on:
+ judge:
+ condition: service_started
+ environment:
+ JUDGE_URL: "http://judge:8082"
+
+ judge:
+ build:
+ context: ${CONTEXT_DIR}
+ dockerfile: Dockerfile.judge
+ command: ["python3", "/judge/judge_server.py"]
+ expose:
+ - "8082"
+ environment:
+ PORT: "8082"
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py
new file mode 100644
index 00000000..9b9ef1d1
--- /dev/null
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""Black-box Frontier-CS 2.0 judge service for Harbor agent submissions."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import os
+import tempfile
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from pathlib import Path
+from typing import Any
+
+PROBLEM_EVALUATOR_PATH = Path("/judge/problem_evaluator.py")
+MAX_SUBMISSION_BYTES = 2_000_000
+
+
+def load_problem_evaluator():
+ spec = importlib.util.spec_from_file_location(
+ "frontier_cs_2_0_problem_evaluator", PROBLEM_EVALUATOR_PATH
+ )
+ if spec is None or spec.loader is None:
+ raise RuntimeError(f"could not load evaluator from {PROBLEM_EVALUATOR_PATH}")
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+ return module
+
+
+EVALUATOR = load_problem_evaluator()
+
+
+def evaluate_code(code: str) -> dict[str, Any]:
+ with tempfile.TemporaryDirectory(prefix="frontier_cs_2_0_submission_") as tmp:
+ solution_path = Path(tmp) / "solution.py"
+ solution_path.write_text(code, encoding="utf-8")
+ score, score_unbounded, message = EVALUATOR.evaluate(str(solution_path))
+ return {
+ "status": "done",
+ "score": float(score),
+ "score_unbounded": float(score_unbounded),
+ "message": message,
+ }
+
+
+class JudgeHandler(BaseHTTPRequestHandler):
+ server_version = "FrontierCS20Judge/1.0"
+
+ def _write_json(self, status: int, payload: dict[str, Any]) -> None:
+ body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
+ self.send_response(status)
+ self.send_header("Content-Type", "application/json")
+ self.send_header("Content-Length", str(len(body)))
+ self.end_headers()
+ self.wfile.write(body)
+
+ def do_GET(self) -> None:
+ if self.path == "/health":
+ self._write_json(200, {"status": "ok"})
+ return
+ self._write_json(404, {"status": "error", "error": "not found"})
+
+ def do_POST(self) -> None:
+ if self.path != "/evaluate":
+ self._write_json(404, {"status": "error", "error": "not found"})
+ return
+
+ try:
+ content_length = int(self.headers.get("Content-Length", "0"))
+ except ValueError:
+ self._write_json(400, {"status": "error", "error": "invalid content length"})
+ return
+
+ if content_length <= 0:
+ self._write_json(400, {"status": "error", "error": "empty request body"})
+ return
+ if content_length > MAX_SUBMISSION_BYTES:
+ self._write_json(413, {"status": "error", "error": "submission too large"})
+ return
+
+ try:
+ payload = json.loads(self.rfile.read(content_length).decode("utf-8"))
+ code = payload.get("code")
+ if not isinstance(code, str) or not code.strip():
+ raise ValueError("request JSON must include non-empty string field 'code'")
+ self._write_json(200, evaluate_code(code))
+ except Exception:
+ self._write_json(
+ 200,
+ {
+ "status": "error",
+ "score": 0.0,
+ "score_unbounded": 0.0,
+ "message": "evaluation failed",
+ },
+ )
+
+ def log_message(self, fmt: str, *args: object) -> None:
+ return
+
+
+def main() -> None:
+ port = int(os.environ.get("PORT", "8082"))
+ server = ThreadingHTTPServer(("0.0.0.0", port), JudgeHandler)
+ server.serve_forever()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py
index eeab3c93..0f89f1f3 100644
--- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py
@@ -3,8 +3,8 @@
from __future__ import annotations
-import importlib.util
import json
+import os
import sys
import time
import traceback
@@ -12,9 +12,12 @@
from datetime import datetime, timezone
from pathlib import Path
+import requests
+
SOLUTION_PATH = Path("/app/solution.py")
-PROBLEM_EVALUATOR_PATH = Path("/app/problem_evaluator.py")
SUBMISSIONS_LOG = Path("/logs/agent/submissions.jsonl")
+JUDGE_URL = os.environ.get("JUDGE_URL", "http://judge:8082").rstrip("/")
+JUDGE_TIMEOUT_SECONDS = int(os.environ.get("JUDGE_TIMEOUT_SECONDS", "10800"))
def now_iso() -> str:
@@ -31,15 +34,36 @@ def log_record(record: dict) -> None:
f.write(json.dumps(record, ensure_ascii=False) + "\n")
-def load_problem_evaluator():
- spec = importlib.util.spec_from_file_location(
- "frontier_cs_2_0_problem_evaluator", PROBLEM_EVALUATOR_PATH
+def wait_for_judge() -> None:
+ deadline = time.time() + 60
+ last_error: Exception | None = None
+ while time.time() < deadline:
+ try:
+ response = requests.get(f"{JUDGE_URL}/health", timeout=5)
+ if response.status_code == 200:
+ return
+ except Exception as exc:
+ last_error = exc
+ time.sleep(1)
+ raise RuntimeError(f"judge service is not ready at {JUDGE_URL}: {last_error}")
+
+
+def evaluate_with_judge(code: str) -> tuple[float, float, str]:
+ wait_for_judge()
+ response = requests.post(
+ f"{JUDGE_URL}/evaluate",
+ json={"code": code},
+ timeout=JUDGE_TIMEOUT_SECONDS,
+ )
+ response.raise_for_status()
+ payload = response.json()
+ if payload.get("status") != "done":
+ raise RuntimeError(str(payload.get("message") or payload.get("error") or payload))
+ return (
+ float(payload.get("score", 0.0)),
+ float(payload.get("score_unbounded", payload.get("score", 0.0))),
+ str(payload.get("message", "")),
)
- if spec is None or spec.loader is None:
- raise RuntimeError(f"could not load evaluator from {PROBLEM_EVALUATOR_PATH}")
- module = importlib.util.module_from_spec(spec)
- spec.loader.exec_module(module)
- return module
def main() -> int:
@@ -88,8 +112,7 @@ def main() -> int:
try:
start = time.time()
- evaluator = load_problem_evaluator()
- score, score_unbounded, message = evaluator.evaluate(str(solution_path))
+ score, score_unbounded, message = evaluate_with_judge(code)
elapsed_seconds = time.time() - start
reward = float(score) / 100.0
From ce02f7253bcc0c9e885abbdd828aedae06a057f5 Mon Sep 17 00:00:00 2001
From: Qiuyang Mang
Date: Wed, 27 May 2026 07:40:47 -0700
Subject: [PATCH 2/3] feat: add Erdos demo task
---
2.0/README.md | 7 +
2.0/problems/erdos_demo/config.yaml | 7 +
2.0/problems/erdos_demo/evaluate.sh | 12 ++
2.0/problems/erdos_demo/evaluator.py | 263 +++++++++++++++++++++++++++
2.0/problems/erdos_demo/readme | 72 ++++++++
2.0/problems/erdos_demo/reference.py | 16 ++
README.md | 8 +-
adapters/frontier-cs-2.0/README.md | 11 ++
8 files changed, 394 insertions(+), 2 deletions(-)
create mode 100644 2.0/problems/erdos_demo/config.yaml
create mode 100644 2.0/problems/erdos_demo/evaluate.sh
create mode 100644 2.0/problems/erdos_demo/evaluator.py
create mode 100644 2.0/problems/erdos_demo/readme
create mode 100644 2.0/problems/erdos_demo/reference.py
diff --git a/2.0/README.md b/2.0/README.md
index 6e56e5e6..84d3ade0 100644
--- a/2.0/README.md
+++ b/2.0/README.md
@@ -12,3 +12,10 @@ that as many pairs as possible have distance exactly `1`. Its problem ID is
`erdos_unit_distance`, matching the problem directory name. It is inspired by
the planar unit distance problem highlighted by OpenAI's May 2026 unit-distance
result.
+
+## Erdos Unit Distance Demo
+
+The demo variant uses the same interface and scoring formula with only `N = 10`
+points, but keeps looser tolerances (`1e-6` minimum separation). Its problem ID
+is `erdos_demo`. It is a quick visual sanity check for Harborized agent
+workflows before running the larger `erdos_unit_distance` task.
diff --git a/2.0/problems/erdos_demo/config.yaml b/2.0/problems/erdos_demo/config.yaml
new file mode 100644
index 00000000..ba4256ac
--- /dev/null
+++ b/2.0/problems/erdos_demo/config.yaml
@@ -0,0 +1,7 @@
+tag: geometry
+runtime:
+ language: python
+ timeout_seconds: 300
+ environment: "Python 3.11; no external packages required"
+ docker:
+ image: ubuntu:24.04
diff --git a/2.0/problems/erdos_demo/evaluate.sh b/2.0/problems/erdos_demo/evaluate.sh
new file mode 100644
index 00000000..23cd83b3
--- /dev/null
+++ b/2.0/problems/erdos_demo/evaluate.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+SOLUTION="/work/execution_env/solution_env/solution.py"
+
+if [[ ! -f "$SOLUTION" ]]; then
+ echo "Error: Missing $SOLUTION" >&2
+ exit 1
+fi
+
+python "$SCRIPT_DIR/evaluator.py" "$SOLUTION"
diff --git a/2.0/problems/erdos_demo/evaluator.py b/2.0/problems/erdos_demo/evaluator.py
new file mode 100644
index 00000000..26c0feca
--- /dev/null
+++ b/2.0/problems/erdos_demo/evaluator.py
@@ -0,0 +1,263 @@
+"""Evaluator for the Erdos unit distance demo problem."""
+
+from __future__ import annotations
+
+import importlib.util
+import math
+import os
+import pickle
+import pwd
+import shutil
+import subprocess
+import sys
+import tempfile
+import traceback
+from pathlib import Path
+from typing import Any
+
+N_POINTS = 10
+BASELINE_EDGES = N_POINTS
+TIMEOUT_SECONDS = 300
+UNIT_DISTANCE = 1.0
+DISTANCE_REL_TOL = 1e-7
+DISTANCE_ABS_TOL = 1e-9
+MIN_SEPARATION = 1e-6
+
+
+def _protect_evaluator_source() -> None:
+ """Hide evaluator source from unprivileged submitted solutions in containers."""
+ try:
+ evaluator_path = Path(__file__).resolve()
+ if str(evaluator_path).startswith(("/judge/", "/tests/")) and os.geteuid() == 0:
+ evaluator_path.chmod(0o600)
+ except Exception:
+ pass
+
+
+_protect_evaluator_source()
+
+
+def _solution_preexec():
+ """Return a preexec_fn that runs submitted code as nobody when possible."""
+ if os.name != "posix":
+ return None
+ try:
+ if os.geteuid() != 0:
+ return None
+ nobody = pwd.getpwnam("nobody")
+ except Exception:
+ return None
+
+ def demote() -> None:
+ os.setgid(nobody.pw_gid)
+ os.setuid(nobody.pw_uid)
+
+ return demote
+
+
+def _is_number(value: Any) -> bool:
+ if isinstance(value, bool):
+ return False
+ try:
+ return math.isfinite(float(value))
+ except Exception:
+ return False
+
+
+def _to_points(raw: Any) -> list[tuple[float, float]]:
+ try:
+ values = raw.tolist()
+ except Exception:
+ values = list(raw)
+
+ points: list[tuple[float, float]] = []
+ for index, item in enumerate(values):
+ try:
+ pair = item.tolist()
+ except Exception:
+ pair = item
+ if not isinstance(pair, (list, tuple)) or len(pair) != 2:
+ raise ValueError(f"point {index} is not a 2D coordinate pair")
+ x, y = pair
+ if not _is_number(x) or not _is_number(y):
+ raise ValueError(f"point {index} has a non-finite coordinate")
+ points.append((float(x), float(y)))
+ return points
+
+
+def _load_points(solution_path: str) -> Any:
+ spec = importlib.util.spec_from_file_location("solution", solution_path)
+ if spec is None or spec.loader is None:
+ raise RuntimeError(f"could not import solution from {solution_path}")
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+
+ for name in ("solve", "generate_points", "run"):
+ fn = getattr(module, name, None)
+ if callable(fn):
+ return fn(N_POINTS)
+
+ points = getattr(module, "POINTS", None)
+ if points is not None:
+ return points
+
+ raise RuntimeError("solution must define solve(n), generate_points(n), run(n), or POINTS")
+
+
+def _run_solution(solution_path: str) -> tuple[Any, str]:
+    """Run the submission in a child process; return ``(points, logs)`` (logs stay empty)."""
+    import json  # function-scope import: the module header does not import json
+    with tempfile.TemporaryDirectory(prefix="erdos_demo_") as tmp:
+        isolated_solution_path = Path(tmp) / "solution.py"
+        result_path = Path(tmp) / "result.json"
+        runner_path = Path(tmp) / "runner.py"
+        shutil.copy2(solution_path, isolated_solution_path)
+        runner_path.write_text(
+            """
+import importlib.util
+import json
+
+solution_path = __SOLUTION_PATH__
+result_path = __RESULT_PATH__
+n_points = __N_POINTS__
+
+# Reduce numpy arrays/scalars and other non-JSON values to plain lists/floats.
+def to_jsonable(value):
+    if hasattr(value, "tolist"):
+        return value.tolist()
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return list(value)
+
+def load_points():
+    spec = importlib.util.spec_from_file_location("solution", solution_path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError("could not import solution")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    for name in ("solve", "generate_points", "run"):
+        fn = getattr(module, name, None)
+        if callable(fn):
+            return fn(n_points)
+    points = getattr(module, "POINTS", None)
+    if points is not None:
+        return points
+    raise RuntimeError("solution must define solve(n), generate_points(n), run(n), or POINTS")
+
+try:
+    text = json.dumps({"points": load_points()}, default=to_jsonable)
+except Exception:
+    text = json.dumps({"error": "solution failed while generating points"})
+with open(result_path, "w", encoding="utf-8") as f:
+    f.write(text)
+""".replace("__SOLUTION_PATH__", repr(str(isolated_solution_path)))
+            .replace("__RESULT_PATH__", repr(str(result_path)))
+            .replace("__N_POINTS__", repr(N_POINTS)),
+            encoding="utf-8",
+        )
+        preexec_fn = _solution_preexec()
+        if preexec_fn is not None:
+            nobody = pwd.getpwnam("nobody")
+            for owned in (tmp, isolated_solution_path, runner_path):
+                os.chown(owned, nobody.pw_uid, nobody.pw_gid)
+        os.chmod(tmp, 0o700 if preexec_fn is not None else 0o755)
+        proc = subprocess.run(
+            [sys.executable, str(runner_path)],
+            capture_output=True, text=True,
+            timeout=TIMEOUT_SECONDS, preexec_fn=preexec_fn,
+        )
+        if proc.returncode != 0:
+            raise RuntimeError(f"solution runner exited with code {proc.returncode}")
+        if not result_path.exists():
+            raise RuntimeError("solution did not produce a result")
+        # Parse JSON rather than unpickling: the untrusted child wrote this file.
+        payload = json.loads(result_path.read_text(encoding="utf-8"))
+        if not isinstance(payload, dict) or "points" not in payload:
+            raise RuntimeError("solution failed while generating points")
+        return payload["points"], ""
+
+
+def _validate_points(points: list[tuple[float, float]]) -> None:
+ if len(points) != N_POINTS:
+ raise ValueError(f"expected {N_POINTS} points, got {len(points)}")
+
+ buckets: dict[tuple[int, int], list[tuple[float, float]]] = {}
+ min_sep2 = MIN_SEPARATION * MIN_SEPARATION
+ for index, (x, y) in enumerate(points):
+ if not math.isfinite(x) or not math.isfinite(y):
+ raise ValueError(f"point {index} has a non-finite coordinate")
+ key = (math.floor(x / MIN_SEPARATION), math.floor(y / MIN_SEPARATION))
+ for dx in (-1, 0, 1):
+ for dy in (-1, 0, 1):
+ for px, py in buckets.get((key[0] + dx, key[1] + dy), ()):
+ sep2 = (x - px) * (x - px) + (y - py) * (y - py)
+ if sep2 < min_sep2:
+ raise ValueError(
+ f"point {index} is closer than {MIN_SEPARATION:g} to another point"
+ )
+ buckets.setdefault(key, []).append((x, y))
+
+
+def _count_unit_distance_pairs(points: list[tuple[float, float]]) -> int:
+ buckets: dict[tuple[int, int], list[tuple[float, float]]] = {}
+ target2 = UNIT_DISTANCE * UNIT_DISTANCE
+ tol = max(DISTANCE_ABS_TOL, DISTANCE_REL_TOL * target2)
+ neighbor_radius = math.ceil((UNIT_DISTANCE + tol) / UNIT_DISTANCE) + 1
+ unit_pairs = 0
+
+ for x, y in points:
+ key = (math.floor(x / UNIT_DISTANCE), math.floor(y / UNIT_DISTANCE))
+ for dx in range(-neighbor_radius, neighbor_radius + 1):
+ for dy in range(-neighbor_radius, neighbor_radius + 1):
+ for px, py in buckets.get((key[0] + dx, key[1] + dy), ()):
+ d2 = (x - px) * (x - px) + (y - py) * (y - py)
+ if not math.isfinite(d2):
+ raise ValueError("pairwise distance overflowed")
+ if abs(d2 - target2) <= tol:
+ unit_pairs += 1
+ buckets.setdefault(key, []).append((x, y))
+
+ return unit_pairs
+
+
+def evaluate(solution_path: str) -> tuple[float, float, str]:
+ raw_points, _ = _run_solution(solution_path)
+ points = _to_points(raw_points)
+ _validate_points(points)
+ unit_pairs = _count_unit_distance_pairs(points)
+
+ if unit_pairs <= 0:
+ score = 0.0
+ else:
+ score = max(0.0, 100.0 * (unit_pairs - BASELINE_EDGES) / unit_pairs)
+ score_unbounded = score
+ message = (
+ f"N={N_POINTS}; unit_pairs={unit_pairs}; unit_distance={UNIT_DISTANCE:.12g}; "
+ f"baseline={BASELINE_EDGES}; "
+ f"score={score:.6f}; score_unbounded={score_unbounded:.6f}"
+ )
+ return score, score_unbounded, message
+
+
+def main(argv: list[str]) -> int:
+ if len(argv) != 2:
+ print("usage: evaluator.py /path/to/solution.py", file=sys.stderr)
+ return 1
+ try:
+ score, score_unbounded, message = evaluate(argv[1])
+ print(message, file=sys.stderr)
+ print(f"{score:.12f} {score_unbounded:.12f}")
+ return 0
+ except subprocess.TimeoutExpired:
+ print(f"timed out after {TIMEOUT_SECONDS}s", file=sys.stderr)
+ print("0.0 0.0")
+ return 0
+ except Exception:
+ print(traceback.format_exc(), file=sys.stderr)
+ print("0.0 0.0")
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main(sys.argv))
diff --git a/2.0/problems/erdos_demo/readme b/2.0/problems/erdos_demo/readme
new file mode 100644
index 00000000..2f8715c2
--- /dev/null
+++ b/2.0/problems/erdos_demo/readme
@@ -0,0 +1,72 @@
+# Erdos Unit Distance Demo
+
+## Problem
+
+Place exactly `N = 10` distinct points in the Euclidean plane so that the
+number of point pairs at Euclidean distance exactly `1` is as large as possible.
+
+This is a tiny, visually inspectable demo version of the planar unit distance
+problem. If your construction naturally has a different common distance, scale
+the coordinates before returning them.
+
+## Program Interface
+
+Submit a Python file defining one of the following (`run(n)` is also accepted):
+
+```python
+def solve(n: int) -> list[tuple[float, float]]:
+ ...
+```
+
+or:
+
+```python
+def generate_points(n: int) -> list[tuple[float, float]]:
+ ...
+```
+
+or:
+
+```python
+POINTS = [(0.0, 0.0), (1.0, 0.0), ...]
+```
+
+The returned value must contain exactly 10 two-dimensional points. No stdin is
+used.
+
+## Validity Constraints
+
+A solution is valid if:
+
+1. It returns exactly 10 points.
+2. Every coordinate is a finite real number.
+3. No two points are closer than `1e-6`.
+
+The objective is translation-invariant. Very large coordinates are allowed as
+long as pairwise squared distances remain finite.
+
+## Objective
+
+For all unordered point pairs, count those whose squared Euclidean distance is
+equal to `1` within a small floating-point tolerance. Let `M` be that count.
+
+Maximize `M`.
+
+## Scoring
+
+The score is naturally scaled to `[0, 100)`, without clipping against a fixed
+target. Let:
+
+```text
+baseline = N
+X = M
+```
+
+If the point set is invalid, or if `X <= baseline`, the score is `0`. Otherwise:
+
+```text
+score = 100 * (X - baseline) / X
+```
+
+This makes the simple `N`-pair baseline worth `0`. With only 10 points, the
+problem is intended as a quick sanity check and visual demo for agent workflows.
diff --git a/2.0/problems/erdos_demo/reference.py b/2.0/problems/erdos_demo/reference.py
new file mode 100644
index 00000000..e268ede6
--- /dev/null
+++ b/2.0/problems/erdos_demo/reference.py
@@ -0,0 +1,16 @@
+"""Baseline solution: equally spaced points on a regular polygon."""
+
+from __future__ import annotations
+
+import math
+
+
+def solve(n: int):
+ radius = 1.0 / (2.0 * math.sin(math.pi / n))
+ return [
+ (
+ radius * math.cos(2.0 * math.pi * i / n),
+ radius * math.sin(2.0 * math.pi * i / n),
+ )
+ for i in range(n)
+ ]
diff --git a/README.md b/README.md
index dafb7de4..25c6300a 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@
-
+
## News
@@ -139,7 +139,8 @@ isolated from Frontier-CS's own `uv sync` environment.
Frontier-CS 2.0 is agent-first: current 2.0 problems are meant to be run
through Harbor-compatible agents rather than direct one-shot solution files.
-Problem IDs are their problem directory names, such as `erdos_unit_distance`.
+Problem IDs are their problem directory names, such as `erdos_unit_distance`
+and the small `erdos_demo`.
```bash
# List 2.0 problems
@@ -147,6 +148,9 @@ frontier list 2.0
# Run a 2.0 task with an agent through the Harbor wrapper
uv run frontier harbor trial 2.0 erdos_unit_distance -a codex -m gpt-5.5 --json
+
+# Run the small N=10 demo task
+uv run frontier harbor trial 2.0 erdos_demo -a codex -m gpt-5.5 --json
```
See [2.0/README.md](2.0/README.md) for the current 2.0 track.
diff --git a/adapters/frontier-cs-2.0/README.md b/adapters/frontier-cs-2.0/README.md
index 302e9025..342e3b41 100644
--- a/adapters/frontier-cs-2.0/README.md
+++ b/adapters/frontier-cs-2.0/README.md
@@ -28,11 +28,22 @@ uv run frontier-cs-2-0 \
--overwrite
```
+Generate only the small Erdos demo task:
+
+```bash
+uv run frontier-cs-2-0 \
+ --source ../.. \
+ --output-dir ../../datasets/frontier-cs-2.0 \
+ --task-ids erdos_demo \
+ --overwrite
+```
+
## Run with Harbor
```bash
uv run harbor run -p datasets/frontier-cs-2.0
uv run harbor trial start -p datasets/frontier-cs-2.0/frontier-cs-2-0-erdos-unit-distance
+uv run harbor trial start -p datasets/frontier-cs-2.0/frontier-cs-2-0-erdos-demo
```
## Task Contract
From 243940ac9a210394ac3fe5a467699a3d67cda120 Mon Sep 17 00:00:00 2001
From: Qiuyang Mang
Date: Wed, 27 May 2026 09:20:07 -0700
Subject: [PATCH 3/3] fix: regenerate default Harbor tasks before trials
---
src/frontier_cs/cli.py | 36 ++++++++++++++++++++----------------
1 file changed, 20 insertions(+), 16 deletions(-)
diff --git a/src/frontier_cs/cli.py b/src/frontier_cs/cli.py
index fa25147f..044ac735 100644
--- a/src/frontier_cs/cli.py
+++ b/src/frontier_cs/cli.py
@@ -1399,29 +1399,33 @@ def run_harbor(args: argparse.Namespace) -> int:
return 1
task_name = _harbor_task_name(args.track, args.problem_id)
+ using_default_task_path = args.task_path is None
task_path = args.task_path
dataset_dir = args.dataset_dir or _default_harbor_dataset_dir(args.track)
if task_path is None:
task_path = dataset_dir / task_name
- if not task_path.exists():
- if task_path == dataset_dir / task_name and not args.no_generate:
- _progress(f"Generating task {task_name}")
- try:
- _generate_harbor_task(args.track, args.problem_id, dataset_dir)
- except RuntimeError as exc:
- print(f"Error: {exc}", file=sys.stderr)
- return 1
- if task_path.exists():
- pass
- else:
- print(
- f"Error: Harbor task path not found: {task_path}\n"
- "Generate Harbor tasks first, or pass --task-path / --dataset-dir.",
- file=sys.stderr,
- )
+ should_generate = (
+ using_default_task_path
+ and task_path == dataset_dir / task_name
+ and not args.no_generate
+ )
+ if should_generate:
+ _progress(f"Generating task {task_name}")
+ try:
+ _generate_harbor_task(args.track, args.problem_id, dataset_dir)
+ except RuntimeError as exc:
+ print(f"Error: {exc}", file=sys.stderr)
return 1
+ if not task_path.exists():
+ print(
+ f"Error: Harbor task path not found: {task_path}\n"
+ "Generate Harbor tasks first, or pass --task-path / --dataset-dir.",
+ file=sys.stderr,
+ )
+ return 1
+
trials_dir = args.trials_dir or _default_harbor_trials_dir()
trials_dir.mkdir(parents=True, exist_ok=True)