From f98ee58bca86446dd0a19240247755949782e7e5 Mon Sep 17 00:00:00 2001
From: Qiuyang Mang <joyemang33@gmail.com>
Date: Wed, 27 May 2026 08:15:08 -0700
Subject: [PATCH 1/3] feat: isolate Frontier-CS 2.0 evaluator

---
 2.0/problems/erdos_unit_distance/evaluator.py |  98 ++++++++++++----
 2.0/problems/erdos_unit_distance/readme       |   4 +-
 adapters/frontier-cs-2.0/README.md            |  13 ++-
 .../src/frontier_cs_2_0/adapter.py            |  23 +++-
 .../task-template/environment/Dockerfile      |   2 +-
 .../environment/Dockerfile.judge              |  13 +++
 .../environment/docker-compose.yaml           |  17 +++
 .../task-template/environment/judge_server.py | 108 ++++++++++++++++++
 .../task-template/environment/submit.py       |  47 ++++++--
 9 files changed, 278 insertions(+), 47 deletions(-)
 create mode 100644 adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge
 create mode 100644 adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml
 create mode 100644 adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py

diff --git a/2.0/problems/erdos_unit_distance/evaluator.py b/2.0/problems/erdos_unit_distance/evaluator.py
index 0ee6c289..9be17a37 100644
--- a/2.0/problems/erdos_unit_distance/evaluator.py
+++ b/2.0/problems/erdos_unit_distance/evaluator.py
@@ -4,7 +4,10 @@
 
 import importlib.util
 import math
+import os
 import pickle
+import pwd
+import shutil
 import subprocess
 import sys
 import tempfile
@@ -16,9 +19,40 @@
 BASELINE_EDGES = N_POINTS
 TIMEOUT_SECONDS = 10800
 UNIT_DISTANCE = 1.0
-DISTANCE_REL_TOL = 1e-7
-DISTANCE_ABS_TOL = 1e-9
-MIN_SEPARATION = 1e-6
+DISTANCE_REL_TOL = 1e-10
+DISTANCE_ABS_TOL = 1e-10
+MIN_SEPARATION = 1e-3
+
+
+def _protect_evaluator_source() -> None:
+    """Hide evaluator source from unprivileged submitted solutions in containers."""
+    try:
+        evaluator_path = Path(__file__).resolve()
+        if str(evaluator_path).startswith(("/judge/", "/tests/")) and os.geteuid() == 0:
+            evaluator_path.chmod(0o600)
+    except Exception:
+        pass
+
+
+_protect_evaluator_source()
+
+
+def _solution_preexec():
+    """Return a preexec_fn that runs submitted code as nobody when possible."""
+    if os.name != "posix":
+        return None
+    try:
+        if os.geteuid() != 0:
+            return None
+        nobody = pwd.getpwnam("nobody")
+    except Exception:
+        return None
+
+    def demote() -> None:
+        os.setgid(nobody.pw_gid)
+        os.setuid(nobody.pw_uid)
+
+    return demote
 
 
 def _is_number(value: Any) -> bool:
@@ -72,50 +106,76 @@ def _load_points(solution_path: str) -> Any:
 
 def _run_solution(solution_path: str) -> tuple[Any, str]:
     with tempfile.TemporaryDirectory(prefix="erdos_unit_distance_") as tmp:
+        tmp_path = Path(tmp)
+        isolated_solution_path = tmp_path / "solution.py"
         result_path = Path(tmp) / "result.pkl"
         runner_path = Path(tmp) / "runner.py"
+        shutil.copy2(solution_path, isolated_solution_path)
         runner_path.write_text(
             """
+import importlib.util
 import pickle
-import traceback
 from pathlib import Path
 
 solution_path = __SOLUTION_PATH__
 result_path = Path(__RESULT_PATH__)
+n_points = __N_POINTS__
+
+
+def load_points():
+    spec = importlib.util.spec_from_file_location("solution", solution_path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError("could not import solution")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    for name in ("solve", "generate_points", "run"):
+        fn = getattr(module, name, None)
+        if callable(fn):
+            return fn(n_points)
+
+    points = getattr(module, "POINTS", None)
+    if points is not None:
+        return points
+
+    raise RuntimeError("solution must define solve(n), generate_points(n), run(n), or POINTS")
 
 try:
-    import importlib.util
-    spec = importlib.util.spec_from_file_location("evaluator", __EVALUATOR_PATH__)
-    evaluator = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(evaluator)
-    points = evaluator._load_points(solution_path)
+    points = load_points()
     with result_path.open("wb") as f:
         pickle.dump({"points": points}, f)
-except Exception as exc:
+except Exception:
     with result_path.open("wb") as f:
-        pickle.dump({"error": str(exc), "trace": traceback.format_exc()}, f)
-""".replace("__SOLUTION_PATH__", repr(solution_path))
+        pickle.dump({"error": "solution failed while generating points"}, f)
+""".replace("__SOLUTION_PATH__", repr(str(isolated_solution_path)))
             .replace("__RESULT_PATH__", repr(str(result_path)))
-            .replace("__EVALUATOR_PATH__", repr(str(Path(__file__).resolve()))),
+            .replace("__N_POINTS__", repr(N_POINTS)),
             encoding="utf-8",
         )
+        preexec_fn = _solution_preexec()
+        if preexec_fn is not None:
+            nobody = pwd.getpwnam("nobody")
+            os.chown(tmp, nobody.pw_uid, nobody.pw_gid)
+            os.chown(isolated_solution_path, nobody.pw_uid, nobody.pw_gid)
+            os.chown(runner_path, nobody.pw_uid, nobody.pw_gid)
+        os.chmod(tmp, 0o700 if preexec_fn is not None else 0o755)
 
         proc = subprocess.run(
             [sys.executable, str(runner_path)],
             capture_output=True,
             text=True,
             timeout=TIMEOUT_SECONDS,
+            preexec_fn=preexec_fn,
         )
-        logs = (proc.stdout or "") + (proc.stderr or "")
         if proc.returncode != 0:
-            raise RuntimeError(f"solution runner exited with code {proc.returncode}\n{logs}")
+            raise RuntimeError(f"solution runner exited with code {proc.returncode}")
         if not result_path.exists():
             raise RuntimeError("solution did not produce a result")
         with result_path.open("rb") as f:
             payload = pickle.load(f)
         if "error" in payload:
-            raise RuntimeError(payload["error"] + "\n" + payload.get("trace", ""))
-        return payload["points"], logs
+            raise RuntimeError("solution failed while generating points")
+        return payload["points"], ""
 
 
 def _validate_points(points: list[tuple[float, float]]) -> None:
@@ -162,7 +222,7 @@ def _count_unit_distance_pairs(points: list[tuple[float, float]]) -> int:
 
 
 def evaluate(solution_path: str) -> tuple[float, float, str]:
-    raw_points, logs = _run_solution(solution_path)
+    raw_points, _ = _run_solution(solution_path)
     points = _to_points(raw_points)
     _validate_points(points)
     unit_pairs = _count_unit_distance_pairs(points)
@@ -177,8 +237,6 @@ def evaluate(solution_path: str) -> tuple[float, float, str]:
         f"baseline={BASELINE_EDGES}; "
         f"score={score:.6f}; score_unbounded={score_unbounded:.6f}"
     )
-    if logs:
-        message += "\n" + logs.strip()
     return score, score_unbounded, message
 
 
diff --git a/2.0/problems/erdos_unit_distance/readme b/2.0/problems/erdos_unit_distance/readme
index 095aff29..eaf04115 100644
--- a/2.0/problems/erdos_unit_distance/readme
+++ b/2.0/problems/erdos_unit_distance/readme
@@ -41,7 +41,7 @@ A solution is valid if:
 
 1. It returns exactly 65536 points.
 2. Every coordinate is a finite real number.
-3. No two points are closer than `1e-6`.
+3. No two points are closer than `1e-3`.
 
 The objective is translation-invariant. Very large coordinates are allowed as
 long as pairwise squared distances remain finite.
@@ -49,7 +49,7 @@ long as pairwise squared distances remain finite.
 ## Objective
 
 For all unordered point pairs, count those whose squared Euclidean distance is
-equal to `1` within a small floating-point tolerance. Let `M` be that count.
+equal to `1` within a strict floating-point tolerance. Let `M` be that count.
 
 Maximize `M`.
 
diff --git a/adapters/frontier-cs-2.0/README.md b/adapters/frontier-cs-2.0/README.md
index 8c8e1ae7..302e9025 100644
--- a/adapters/frontier-cs-2.0/README.md
+++ b/adapters/frontier-cs-2.0/README.md
@@ -37,9 +37,9 @@ uv run harbor trial start -p datasets/frontier-cs-2.0/frontier-cs-2-0-erdos-unit
 
 ## Task Contract
 
-The agent works in `/app` and must create `/app/solution.py`. The verifier runs
-the original Frontier-CS `2.0` evaluator and writes a normalized reward in
-`/logs/verifier/reward.txt`.
+The agent works in `/app` and must create `/app/solution.py`. The final
+verifier runs the original Frontier-CS `2.0` evaluator and writes a normalized
+reward in `/logs/verifier/reward.txt`.
 
 During the trial, the agent can call:
 
@@ -47,9 +47,10 @@ During the trial, the agent can call:
 bash /app/submit.sh
 ```
 
-This runs the same evaluator against the current `/app/solution.py`, prints the
-score and feedback, and records each attempt in
-`/logs/agent/submissions.jsonl`. The final verifier mirrors that log to
+This submits the current `/app/solution.py` to a black-box judge service,
+prints the score and feedback, and records each attempt in
+`/logs/agent/submissions.jsonl`. The evaluator source is not copied into the
+agent image. The final verifier mirrors that log to
 `/logs/verifier/submissions.jsonl` for process-reward analysis. The reported
 reward is the maximum of the final `/app/solution.py` score and the best
 successful iterative submission, so a timed-out agent can keep its best
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py
index 2539668d..bc0c4bb9 100644
--- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/adapter.py
@@ -133,11 +133,11 @@ def generate_task(
     def _write_instruction(self, task_paths: "TaskPaths", problem: FrontierCS20Problem) -> None:
         instruction = (
             "You are solving a Frontier-CS 2.0 open-ended optimization problem.\n\n"
-            "Create a Python solution at `/app/solution.py`. The verifier will run "
-            "the source Frontier-CS evaluator and convert its 0-100 score into a "
-            "Harbor reward in [0, 1]. You can call `bash /app/submit.sh` at any "
-            "time to grade the current solution with the same evaluator and get "
-            "feedback before the final verifier run.\n\n"
+            "Create a Python solution at `/app/solution.py`. You can call "
+            "`bash /app/submit.sh` at any time to grade the current solution "
+            "with the same black-box judge used by the final verifier and get "
+            "score feedback. The evaluator implementation is intentionally not "
+            "available in the agent workspace.\n\n"
             f"Problem id: `{problem.problem_id}`\n"
             f"Language: `{problem.language}`\n"
             f"Time limit: `{problem.timeout_seconds}s`\n\n"
@@ -161,8 +161,19 @@ def _write_environment(self, task_paths: "TaskPaths", problem: FrontierCS20Probl
             src = problem.problem_dir / name
             if src.exists():
                 shutil.copy2(src, env_dir / name)
+
+        judge_dockerfile = (
+            self.template_dir / "environment" / "Dockerfile.judge"
+        ).read_text(encoding="utf-8")
+        env_dir.joinpath("Dockerfile.judge").write_text(
+            judge_dockerfile.replace("{base_image}", image),
+            encoding="utf-8",
+        )
+        for name in ("docker-compose.yaml", "judge_server.py", "submit.py"):
+            shutil.copy2(self.template_dir / "environment" / name, env_dir / name)
+        # Kept in the build context for the judge image only; the main agent
+        # image's Dockerfile does not copy this into /app.
         shutil.copy2(problem.problem_dir / "evaluator.py", env_dir / "problem_evaluator.py")
-        shutil.copy2(self.template_dir / "environment" / "submit.py", env_dir / "submit.py")
         submit_sh = env_dir / "submit.sh"
         shutil.copy2(self.template_dir / "environment" / "submit.sh", submit_sh)
         submit_sh.chmod(0o755)
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile
index 3ef7a17c..7873525c 100644
--- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile
@@ -23,5 +23,5 @@ ENV CLAUDE_CODE_MAX_OUTPUT_TOKENS=128000
 
 WORKDIR /app
 
-COPY readme config.yaml problem_evaluator.py submit.py submit.sh /app/
+COPY readme config.yaml submit.py submit.sh /app/
 RUN chmod +x /app/submit.sh
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge
new file mode 100644
index 00000000..6207d4fa
--- /dev/null
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile.judge
@@ -0,0 +1,13 @@
+FROM {base_image}
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends python3 ca-certificates && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /judge
+
+COPY judge_server.py problem_evaluator.py /judge/
+RUN chmod 600 /judge/problem_evaluator.py
+
+EXPOSE 8082
+CMD ["python3", "/judge/judge_server.py"]
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml
new file mode 100644
index 00000000..4e07cbdc
--- /dev/null
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/docker-compose.yaml
@@ -0,0 +1,17 @@
+services:
+  main:
+    depends_on:
+      judge:
+        condition: service_started
+    environment:
+      JUDGE_URL: "http://judge:8082"
+
+  judge:
+    build:
+      context: ${CONTEXT_DIR}
+      dockerfile: Dockerfile.judge
+    command: ["python3", "/judge/judge_server.py"]
+    expose:
+      - "8082"
+    environment:
+      PORT: "8082"
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py
new file mode 100644
index 00000000..9b9ef1d1
--- /dev/null
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""Black-box Frontier-CS 2.0 judge service for Harbor agent submissions."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import os
+import tempfile
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from pathlib import Path
+from typing import Any
+
+PROBLEM_EVALUATOR_PATH = Path("/judge/problem_evaluator.py")
+MAX_SUBMISSION_BYTES = 2_000_000
+
+
+def load_problem_evaluator():
+    spec = importlib.util.spec_from_file_location(
+        "frontier_cs_2_0_problem_evaluator", PROBLEM_EVALUATOR_PATH
+    )
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"could not load evaluator from {PROBLEM_EVALUATOR_PATH}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+EVALUATOR = load_problem_evaluator()
+
+
+def evaluate_code(code: str) -> dict[str, Any]:
+    with tempfile.TemporaryDirectory(prefix="frontier_cs_2_0_submission_") as tmp:
+        solution_path = Path(tmp) / "solution.py"
+        solution_path.write_text(code, encoding="utf-8")
+        score, score_unbounded, message = EVALUATOR.evaluate(str(solution_path))
+    return {
+        "status": "done",
+        "score": float(score),
+        "score_unbounded": float(score_unbounded),
+        "message": message,
+    }
+
+
+class JudgeHandler(BaseHTTPRequestHandler):
+    server_version = "FrontierCS20Judge/1.0"
+
+    def _write_json(self, status: int, payload: dict[str, Any]) -> None:
+        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
+        self.send_response(status)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def do_GET(self) -> None:
+        if self.path == "/health":
+            self._write_json(200, {"status": "ok"})
+            return
+        self._write_json(404, {"status": "error", "error": "not found"})
+
+    def do_POST(self) -> None:
+        if self.path != "/evaluate":
+            self._write_json(404, {"status": "error", "error": "not found"})
+            return
+
+        try:
+            content_length = int(self.headers.get("Content-Length", "0"))
+        except ValueError:
+            self._write_json(400, {"status": "error", "error": "invalid content length"})
+            return
+
+        if content_length <= 0:
+            self._write_json(400, {"status": "error", "error": "empty request body"})
+            return
+        if content_length > MAX_SUBMISSION_BYTES:
+            self._write_json(413, {"status": "error", "error": "submission too large"})
+            return
+
+        try:
+            payload = json.loads(self.rfile.read(content_length).decode("utf-8"))
+            code = payload.get("code")
+            if not isinstance(code, str) or not code.strip():
+                raise ValueError("request JSON must include non-empty string field 'code'")
+            self._write_json(200, evaluate_code(code))
+        except Exception:
+            self._write_json(
+                200,
+                {
+                    "status": "error",
+                    "score": 0.0,
+                    "score_unbounded": 0.0,
+                    "message": "evaluation failed",
+                },
+            )
+
+    def log_message(self, fmt: str, *args: object) -> None:
+        return
+
+
+def main() -> None:
+    port = int(os.environ.get("PORT", "8082"))
+    server = ThreadingHTTPServer(("0.0.0.0", port), JudgeHandler)
+    server.serve_forever()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py
index eeab3c93..0f89f1f3 100644
--- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py
+++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py
@@ -3,8 +3,8 @@
 
 from __future__ import annotations
 
-import importlib.util
 import json
+import os
 import sys
 import time
 import traceback
@@ -12,9 +12,12 @@
 from datetime import datetime, timezone
 from pathlib import Path
 
+import requests
+
 SOLUTION_PATH = Path("/app/solution.py")
-PROBLEM_EVALUATOR_PATH = Path("/app/problem_evaluator.py")
 SUBMISSIONS_LOG = Path("/logs/agent/submissions.jsonl")
+JUDGE_URL = os.environ.get("JUDGE_URL", "http://judge:8082").rstrip("/")
+JUDGE_TIMEOUT_SECONDS = int(os.environ.get("JUDGE_TIMEOUT_SECONDS", "10800"))
 
 
 def now_iso() -> str:
@@ -31,15 +34,36 @@ def log_record(record: dict) -> None:
         f.write(json.dumps(record, ensure_ascii=False) + "\n")
 
 
-def load_problem_evaluator():
-    spec = importlib.util.spec_from_file_location(
-        "frontier_cs_2_0_problem_evaluator", PROBLEM_EVALUATOR_PATH
+def wait_for_judge() -> None:
+    deadline = time.time() + 60
+    last_error: Exception | None = None
+    while time.time() < deadline:
+        try:
+            response = requests.get(f"{JUDGE_URL}/health", timeout=5)
+            if response.status_code == 200:
+                return
+        except Exception as exc:
+            last_error = exc
+        time.sleep(1)
+    raise RuntimeError(f"judge service is not ready at {JUDGE_URL}: {last_error}")
+
+
+def evaluate_with_judge(code: str) -> tuple[float, float, str]:
+    wait_for_judge()
+    response = requests.post(
+        f"{JUDGE_URL}/evaluate",
+        json={"code": code},
+        timeout=JUDGE_TIMEOUT_SECONDS,
+    )
+    response.raise_for_status()
+    payload = response.json()
+    if payload.get("status") != "done":
+        raise RuntimeError(str(payload.get("message") or payload.get("error") or payload))
+    return (
+        float(payload.get("score", 0.0)),
+        float(payload.get("score_unbounded", payload.get("score", 0.0))),
+        str(payload.get("message", "")),
     )
-    if spec is None or spec.loader is None:
-        raise RuntimeError(f"could not load evaluator from {PROBLEM_EVALUATOR_PATH}")
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module
 
 
 def main() -> int:
@@ -88,8 +112,7 @@ def main() -> int:
 
     try:
         start = time.time()
-        evaluator = load_problem_evaluator()
-        score, score_unbounded, message = evaluator.evaluate(str(solution_path))
+        score, score_unbounded, message = evaluate_with_judge(code)
         elapsed_seconds = time.time() - start
         reward = float(score) / 100.0
 

From ce02f7253bcc0c9e885abbdd828aedae06a057f5 Mon Sep 17 00:00:00 2001
From: Qiuyang Mang <joyemang33@gmail.com>
Date: Wed, 27 May 2026 07:40:47 -0700
Subject: [PATCH 2/3] feat: add Erdos demo task

---
 2.0/README.md                        |   7 +
 2.0/problems/erdos_demo/config.yaml  |   7 +
 2.0/problems/erdos_demo/evaluate.sh  |  12 ++
 2.0/problems/erdos_demo/evaluator.py | 263 +++++++++++++++++++++++++++
 2.0/problems/erdos_demo/readme       |  72 ++++++++
 2.0/problems/erdos_demo/reference.py |  16 ++
 README.md                            |   8 +-
 adapters/frontier-cs-2.0/README.md   |  11 ++
 8 files changed, 394 insertions(+), 2 deletions(-)
 create mode 100644 2.0/problems/erdos_demo/config.yaml
 create mode 100644 2.0/problems/erdos_demo/evaluate.sh
 create mode 100644 2.0/problems/erdos_demo/evaluator.py
 create mode 100644 2.0/problems/erdos_demo/readme
 create mode 100644 2.0/problems/erdos_demo/reference.py

diff --git a/2.0/README.md b/2.0/README.md
index 6e56e5e6..84d3ade0 100644
--- a/2.0/README.md
+++ b/2.0/README.md
@@ -12,3 +12,10 @@ that as many pairs as possible have distance exactly `1`. Its problem ID is
 `erdos_unit_distance`, matching the problem directory name. It is inspired by
 the planar unit distance problem highlighted by OpenAI's May 2026 unit-distance
 result.
+
+## Erdos Unit Distance Demo
+
+The demo variant uses the same interface and scoring rule with only `N = 10`
+points. Its problem ID is `erdos_demo`. It is intended as a quick visual sanity
+check for Harborized agent workflows before running the larger
+`erdos_unit_distance` task.
diff --git a/2.0/problems/erdos_demo/config.yaml b/2.0/problems/erdos_demo/config.yaml
new file mode 100644
index 00000000..ba4256ac
--- /dev/null
+++ b/2.0/problems/erdos_demo/config.yaml
@@ -0,0 +1,7 @@
+tag: geometry
+runtime:
+  language: python
+  timeout_seconds: 300
+  environment: "Python 3.11; no external packages required"
+  docker:
+    image: ubuntu:24.04
diff --git a/2.0/problems/erdos_demo/evaluate.sh b/2.0/problems/erdos_demo/evaluate.sh
new file mode 100644
index 00000000..23cd83b3
--- /dev/null
+++ b/2.0/problems/erdos_demo/evaluate.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+SOLUTION="/work/execution_env/solution_env/solution.py"
+
+if [[ ! -f "$SOLUTION" ]]; then
+  echo "Error: Missing $SOLUTION" >&2
+  exit 1
+fi
+
+python "$SCRIPT_DIR/evaluator.py" "$SOLUTION"
diff --git a/2.0/problems/erdos_demo/evaluator.py b/2.0/problems/erdos_demo/evaluator.py
new file mode 100644
index 00000000..26c0feca
--- /dev/null
+++ b/2.0/problems/erdos_demo/evaluator.py
@@ -0,0 +1,263 @@
+"""Evaluator for the Erdos unit distance demo problem."""
+
+from __future__ import annotations
+
+import importlib.util
+import math
+import os
+import pickle
+import pwd
+import shutil
+import subprocess
+import sys
+import tempfile
+import traceback
+from pathlib import Path
+from typing import Any
+
+N_POINTS = 10
+BASELINE_EDGES = N_POINTS
+TIMEOUT_SECONDS = 300
+UNIT_DISTANCE = 1.0
+DISTANCE_REL_TOL = 1e-7
+DISTANCE_ABS_TOL = 1e-9
+MIN_SEPARATION = 1e-6
+
+
+def _protect_evaluator_source() -> None:
+    """Hide evaluator source from unprivileged submitted solutions in containers."""
+    try:
+        evaluator_path = Path(__file__).resolve()
+        if str(evaluator_path).startswith(("/judge/", "/tests/")) and os.geteuid() == 0:
+            evaluator_path.chmod(0o600)
+    except Exception:
+        pass
+
+
+_protect_evaluator_source()
+
+
+def _solution_preexec():
+    """Return a preexec_fn that runs submitted code as nobody when possible."""
+    if os.name != "posix":
+        return None
+    try:
+        if os.geteuid() != 0:
+            return None
+        nobody = pwd.getpwnam("nobody")
+    except Exception:
+        return None
+
+    def demote() -> None:
+        os.setgid(nobody.pw_gid)
+        os.setuid(nobody.pw_uid)
+
+    return demote
+
+
+def _is_number(value: Any) -> bool:
+    if isinstance(value, bool):
+        return False
+    try:
+        return math.isfinite(float(value))
+    except Exception:
+        return False
+
+
+def _to_points(raw: Any) -> list[tuple[float, float]]:
+    try:
+        values = raw.tolist()
+    except Exception:
+        values = list(raw)
+
+    points: list[tuple[float, float]] = []
+    for index, item in enumerate(values):
+        try:
+            pair = item.tolist()
+        except Exception:
+            pair = item
+        if not isinstance(pair, (list, tuple)) or len(pair) != 2:
+            raise ValueError(f"point {index} is not a 2D coordinate pair")
+        x, y = pair
+        if not _is_number(x) or not _is_number(y):
+            raise ValueError(f"point {index} has a non-finite coordinate")
+        points.append((float(x), float(y)))
+    return points
+
+
+def _load_points(solution_path: str) -> Any:
+    spec = importlib.util.spec_from_file_location("solution", solution_path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"could not import solution from {solution_path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    for name in ("solve", "generate_points", "run"):
+        fn = getattr(module, name, None)
+        if callable(fn):
+            return fn(N_POINTS)
+
+    points = getattr(module, "POINTS", None)
+    if points is not None:
+        return points
+
+    raise RuntimeError("solution must define solve(n), generate_points(n), run(n), or POINTS")
+
+
+def _run_solution(solution_path: str) -> tuple[Any, str]:
+    with tempfile.TemporaryDirectory(prefix="erdos_unit_distance_") as tmp:
+        tmp_path = Path(tmp)
+        isolated_solution_path = tmp_path / "solution.py"
+        result_path = Path(tmp) / "result.pkl"
+        runner_path = Path(tmp) / "runner.py"
+        shutil.copy2(solution_path, isolated_solution_path)
+        runner_path.write_text(
+            """
+import importlib.util
+import pickle
+from pathlib import Path
+
+solution_path = __SOLUTION_PATH__
+result_path = Path(__RESULT_PATH__)
+n_points = __N_POINTS__
+
+
+def load_points():
+    spec = importlib.util.spec_from_file_location("solution", solution_path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError("could not import solution")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    for name in ("solve", "generate_points", "run"):
+        fn = getattr(module, name, None)
+        if callable(fn):
+            return fn(n_points)
+
+    points = getattr(module, "POINTS", None)
+    if points is not None:
+        return points
+
+    raise RuntimeError("solution must define solve(n), generate_points(n), run(n), or POINTS")
+
+try:
+    points = load_points()
+    with result_path.open("wb") as f:
+        pickle.dump({"points": points}, f)
+except Exception:
+    with result_path.open("wb") as f:
+        pickle.dump({"error": "solution failed while generating points"}, f)
+""".replace("__SOLUTION_PATH__", repr(str(isolated_solution_path)))
+            .replace("__RESULT_PATH__", repr(str(result_path)))
+            .replace("__N_POINTS__", repr(N_POINTS)),
+            encoding="utf-8",
+        )
+        preexec_fn = _solution_preexec()
+        if preexec_fn is not None:
+            nobody = pwd.getpwnam("nobody")
+            os.chown(tmp, nobody.pw_uid, nobody.pw_gid)
+            os.chown(isolated_solution_path, nobody.pw_uid, nobody.pw_gid)
+            os.chown(runner_path, nobody.pw_uid, nobody.pw_gid)
+        os.chmod(tmp, 0o700 if preexec_fn is not None else 0o755)
+
+        proc = subprocess.run(
+            [sys.executable, str(runner_path)],
+            capture_output=True,
+            text=True,
+            timeout=TIMEOUT_SECONDS,
+            preexec_fn=preexec_fn,
+        )
+        if proc.returncode != 0:
+            raise RuntimeError(f"solution runner exited with code {proc.returncode}")
+        if not result_path.exists():
+            raise RuntimeError("solution did not produce a result")
+        with result_path.open("rb") as f:
+            payload = pickle.load(f)
+        if "error" in payload:
+            raise RuntimeError("solution failed while generating points")
+        return payload["points"], ""
+
+
+def _validate_points(points: list[tuple[float, float]]) -> None:
+    if len(points) != N_POINTS:
+        raise ValueError(f"expected {N_POINTS} points, got {len(points)}")
+
+    buckets: dict[tuple[int, int], list[tuple[float, float]]] = {}
+    min_sep2 = MIN_SEPARATION * MIN_SEPARATION
+    for index, (x, y) in enumerate(points):
+        if not math.isfinite(x) or not math.isfinite(y):
+            raise ValueError(f"point {index} has a non-finite coordinate")
+        key = (math.floor(x / MIN_SEPARATION), math.floor(y / MIN_SEPARATION))
+        for dx in (-1, 0, 1):
+            for dy in (-1, 0, 1):
+                for px, py in buckets.get((key[0] + dx, key[1] + dy), ()):
+                    sep2 = (x - px) * (x - px) + (y - py) * (y - py)
+                    if sep2 < min_sep2:
+                        raise ValueError(
+                            f"point {index} is closer than {MIN_SEPARATION:g} to another point"
+                        )
+        buckets.setdefault(key, []).append((x, y))
+
+
+def _count_unit_distance_pairs(points: list[tuple[float, float]]) -> int:
+    buckets: dict[tuple[int, int], list[tuple[float, float]]] = {}
+    target2 = UNIT_DISTANCE * UNIT_DISTANCE
+    tol = max(DISTANCE_ABS_TOL, DISTANCE_REL_TOL * target2)
+    neighbor_radius = math.ceil((UNIT_DISTANCE + tol) / UNIT_DISTANCE) + 1
+    unit_pairs = 0
+
+    for x, y in points:
+        key = (math.floor(x / UNIT_DISTANCE), math.floor(y / UNIT_DISTANCE))
+        for dx in range(-neighbor_radius, neighbor_radius + 1):
+            for dy in range(-neighbor_radius, neighbor_radius + 1):
+                for px, py in buckets.get((key[0] + dx, key[1] + dy), ()):
+                    d2 = (x - px) * (x - px) + (y - py) * (y - py)
+                    if not math.isfinite(d2):
+                        raise ValueError("pairwise distance overflowed")
+                    if abs(d2 - target2) <= tol:
+                        unit_pairs += 1
+        buckets.setdefault(key, []).append((x, y))
+
+    return unit_pairs
+
+
+def evaluate(solution_path: str) -> tuple[float, float, str]:
+    raw_points, _ = _run_solution(solution_path)
+    points = _to_points(raw_points)
+    _validate_points(points)
+    unit_pairs = _count_unit_distance_pairs(points)
+
+    if unit_pairs <= 0:
+        score = 0.0
+    else:
+        score = max(0.0, 100.0 * (unit_pairs - BASELINE_EDGES) / unit_pairs)
+    score_unbounded = score
+    message = (
+        f"N={N_POINTS}; unit_pairs={unit_pairs}; unit_distance={UNIT_DISTANCE:.12g}; "
+        f"baseline={BASELINE_EDGES}; "
+        f"score={score:.6f}; score_unbounded={score_unbounded:.6f}"
+    )
+    return score, score_unbounded, message
+
+
+def main(argv: list[str]) -> int:
+    if len(argv) != 2:
+        print("usage: evaluator.py /path/to/solution.py", file=sys.stderr)
+        return 1
+    try:
+        score, score_unbounded, message = evaluate(argv[1])
+        print(message, file=sys.stderr)
+        print(f"{score:.12f} {score_unbounded:.12f}")
+        return 0
+    except subprocess.TimeoutExpired:
+        print(f"timed out after {TIMEOUT_SECONDS}s", file=sys.stderr)
+        print("0.0 0.0")
+        return 0
+    except Exception:
+        print(traceback.format_exc(), file=sys.stderr)
+        print("0.0 0.0")
+        return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv))
diff --git a/2.0/problems/erdos_demo/readme b/2.0/problems/erdos_demo/readme
new file mode 100644
index 00000000..2f8715c2
--- /dev/null
+++ b/2.0/problems/erdos_demo/readme
@@ -0,0 +1,72 @@
+# Erdos Unit Distance Demo
+
+## Problem
+
+Place exactly `N = 10` distinct points in the Euclidean plane so that the
+number of point pairs at Euclidean distance exactly `1` is as large as possible.
+
+This is a tiny, visually inspectable demo version of the planar unit distance
+problem. If your construction naturally has a different common distance, scale
+the coordinates before returning them.
+
+## Program Interface
+
+Submit a Python file defining one of the following:
+
+```python
+def solve(n: int) -> list[tuple[float, float]]:
+    ...
+```
+
+or:
+
+```python
+def generate_points(n: int) -> list[tuple[float, float]]:
+    ...
+```
+
+or:
+
+```python
+POINTS = [(0.0, 0.0), (1.0, 0.0), ...]
+```
+
+The returned value (or `POINTS`) must contain exactly 10 two-dimensional
+points. No stdin is used.
+
+## Validity Constraints
+
+A solution is valid if:
+
+1. It returns exactly 10 points.
+2. Every coordinate is a finite real number.
+3. No two points are closer than `1e-6`.
+
+The objective is translation-invariant. Very large coordinates are allowed as
+long as pairwise squared distances remain finite.
+
+## Objective
+
+For all unordered point pairs, count those whose squared Euclidean distance is
+equal to `1` within a small floating-point tolerance. Let `M` be that count.
+
+Maximize `M`.
+
+## Scoring
+
+The score is naturally scaled to `[0, 100)`, without clipping against a fixed
+target. Let:
+
+```text
+baseline = N
+X = M
+```
+
+If the point set is invalid, or if `X <= baseline`, the score is `0`. Otherwise:
+
+```text
+score = 100 * (X - baseline) / X
+```
+
+This makes the simple `N`-pair baseline worth `0`. With only 10 points, the
+problem is intended as a quick sanity check and visual demo for agent workflows.
diff --git a/2.0/problems/erdos_demo/reference.py b/2.0/problems/erdos_demo/reference.py
new file mode 100644
index 00000000..e268ede6
--- /dev/null
+++ b/2.0/problems/erdos_demo/reference.py
@@ -0,0 +1,16 @@
+"""Baseline solution: equally spaced points on a regular polygon."""
+
+from __future__ import annotations
+
+import math
+
+
+def solve(n: int):  # returns n (x, y) tuples: vertices of a regular n-gon centred at the origin
+    radius = 1.0 / (2.0 * math.sin(math.pi / n))  # circumradius for side length 1; n == 0 raises ZeroDivisionError
+    return [
+        (
+            radius * math.cos(2.0 * math.pi * i / n),  # vertex i sits at angle 2*pi*i/n
+            radius * math.sin(2.0 * math.pi * i / n),
+        )
+        for i in range(n)
+    ]
diff --git a/README.md b/README.md
index dafb7de4..25c6300a 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@
   </a>
   <img src="https://img.shields.io/badge/Research_Problems-68-blue" alt="Research Problems">
   <img src="https://img.shields.io/badge/Algorithmic_Problems-188-green" alt="Algorithmic Problems">
-  <img src="https://img.shields.io/badge/2.0_Problems-1-purple" alt="2.0 Problems">
+  <img src="https://img.shields.io/badge/2.0_Problems-2-purple" alt="2.0 Problems">
 </p>
 
 ## News
@@ -139,7 +139,8 @@ isolated from Frontier-CS's own `uv sync` environment.
 
 Frontier-CS 2.0 is agent-first: current 2.0 problems are meant to be run
 through Harbor-compatible agents rather than direct one-shot solution files.
-Problem IDs are their problem directory names, such as `erdos_unit_distance`.
+Problem IDs are their problem directory names, such as `erdos_unit_distance`
+and the small `erdos_demo`.
 
 ```bash
 # List 2.0 problems
@@ -147,6 +148,9 @@ frontier list 2.0
 
 # Run a 2.0 task with an agent through the Harbor wrapper
 uv run frontier harbor trial 2.0 erdos_unit_distance -a codex -m gpt-5.5 --json
+
+# Run the small N=10 demo task
+uv run frontier harbor trial 2.0 erdos_demo -a codex -m gpt-5.5 --json
 ```
 
 See [2.0/README.md](2.0/README.md) for the current 2.0 track.
diff --git a/adapters/frontier-cs-2.0/README.md b/adapters/frontier-cs-2.0/README.md
index 302e9025..342e3b41 100644
--- a/adapters/frontier-cs-2.0/README.md
+++ b/adapters/frontier-cs-2.0/README.md
@@ -28,11 +28,22 @@ uv run frontier-cs-2-0 \
   --overwrite
 ```
 
+Generate only the small Erdos demo task:
+
+```bash
+uv run frontier-cs-2-0 \
+  --source ../.. \
+  --output-dir ../../datasets/frontier-cs-2.0 \
+  --task-ids erdos_demo \
+  --overwrite
+```
+
 ## Run with Harbor
 
 ```bash
 uv run harbor run -p datasets/frontier-cs-2.0
 uv run harbor trial start -p datasets/frontier-cs-2.0/frontier-cs-2-0-erdos-unit-distance
+uv run harbor trial start -p datasets/frontier-cs-2.0/frontier-cs-2-0-erdos-demo
 ```
 
 ## Task Contract

From 243940ac9a210394ac3fe5a467699a3d67cda120 Mon Sep 17 00:00:00 2001
From: Qiuyang Mang <joyemang33@gmail.com>
Date: Wed, 27 May 2026 09:20:07 -0700
Subject: [PATCH 3/3] fix: regenerate default Harbor tasks before trials

---
 src/frontier_cs/cli.py | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/src/frontier_cs/cli.py b/src/frontier_cs/cli.py
index fa25147f..044ac735 100644
--- a/src/frontier_cs/cli.py
+++ b/src/frontier_cs/cli.py
@@ -1399,29 +1399,33 @@ def run_harbor(args: argparse.Namespace) -> int:
         return 1
 
     task_name = _harbor_task_name(args.track, args.problem_id)
+    using_default_task_path = args.task_path is None
     task_path = args.task_path
     dataset_dir = args.dataset_dir or _default_harbor_dataset_dir(args.track)
     if task_path is None:
         task_path = dataset_dir / task_name
 
-    if not task_path.exists():
-        if task_path == dataset_dir / task_name and not args.no_generate:
-            _progress(f"Generating task {task_name}")
-            try:
-                _generate_harbor_task(args.track, args.problem_id, dataset_dir)
-            except RuntimeError as exc:
-                print(f"Error: {exc}", file=sys.stderr)
-                return 1
-        if task_path.exists():
-            pass
-        else:
-            print(
-                f"Error: Harbor task path not found: {task_path}\n"
-                "Generate Harbor tasks first, or pass --task-path / --dataset-dir.",
-                file=sys.stderr,
-            )
+    should_generate = (
+        using_default_task_path
+        and task_path == dataset_dir / task_name
+        and not args.no_generate
+    )
+    if should_generate:
+        _progress(f"Generating task {task_name}")
+        try:
+            _generate_harbor_task(args.track, args.problem_id, dataset_dir)
+        except RuntimeError as exc:
+            print(f"Error: {exc}", file=sys.stderr)
             return 1
 
+    if not task_path.exists():
+        print(
+            f"Error: Harbor task path not found: {task_path}\n"
+            "Generate Harbor tasks first, or pass --task-path / --dataset-dir.",
+            file=sys.stderr,
+        )
+        return 1
+
     trials_dir = args.trials_dir or _default_harbor_trials_dir()
     trials_dir.mkdir(parents=True, exist_ok=True)