Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions underwriter/scripts/_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Shared helpers for the dataset-builder scripts (`build_*.py`).

These were copy-pasted verbatim across every builder; keeping one copy here means
a change to how we capture provenance (git/HF SHAs) or pull a HF dataset happens
in exactly one place. Builders import the names they need:

from _common import git_sha, hf_commit_sha, hf_download

(The scripts run directly — `python scripts/build_x.py` — so their own directory
is on sys.path and this bare import resolves.)
"""

from __future__ import annotations

import json
import subprocess
import sys
from pathlib import Path


def git_sha() -> str:
    """Return the abbreviated SHA of HEAD, or ``"unknown"`` if unavailable.

    Runs ``git rev-parse --short HEAD`` in the current working directory.
    This is a best-effort provenance stamp: a missing ``git`` executable or a
    failing command (for example, not inside a work tree) yields
    ``"unknown"`` instead of raising.
    """
    try:
        out = subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"],
            text=True,
            # Keep git's own "fatal: ..." diagnostics off the console; the
            # fallback value below already tells the caller what happened.
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: git not installed / not executable.
        # SubprocessError: non-zero exit (CalledProcessError) and friends.
        return "unknown"
    return out.strip()


def hf_commit_sha(repo: str) -> str:
    """Resolve the HEAD commit SHA of a HF dataset repo via the API.

    Issues a GET to ``https://huggingface.co/api/datasets/<repo>`` with a
    10-second timeout and reads the ``sha`` field of the JSON response.

    This is a best-effort provenance lookup. It always returns a string:
    any network, HTTP, or parsing failure, or a response without a usable
    string ``sha``, yields ``"unknown"`` rather than raising or returning
    ``None``.
    """
    import http.client
    import urllib.parse
    import urllib.request

    if not repo:
        # An empty id cannot name a repo; skip the network round-trip.
        return "unknown"

    # Percent-encode anything unusual but keep the "org/name" separator.
    quoted = urllib.parse.quote(repo, safe="/")
    url = f"https://huggingface.co/api/datasets/{quoted}"
    try:
        with urllib.request.urlopen(url, timeout=10) as r:
            data = json.loads(r.read())
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError/timeouts; ValueError covers
        # malformed JSON and undecodable bytes.
        return "unknown"

    sha = data.get("sha") if isinstance(data, dict) else None
    # Guard against `"sha": null` or a non-string value in the payload.
    return sha if isinstance(sha, str) and sha else "unknown"


def hf_download(repo: str, cache_dir: Path) -> Path:
    """Download a HF dataset repo into *cache_dir* using the ``hf`` CLI.

    Runs ``hf download --repo-type dataset <repo> --local-dir <cache_dir>``
    and captures its output.

    Returns:
        ``cache_dir``, unchanged, once the command exits with status 0.

    Raises:
        SystemExit: if the ``hf`` executable cannot be launched, or if it
            exits with a non-zero status (its captured stderr is echoed to
            this process's stderr first).
    """
    print(f"  hf download --repo-type dataset {repo} …")
    cmd = [
        "hf", "download", "--repo-type", "dataset", repo,
        "--local-dir", str(cache_dir),
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # Typically FileNotFoundError when the CLI is not installed; fail the
        # same way as any other download error instead of with a traceback.
        raise SystemExit(f"hf download failed (could not run hf: {exc})") from exc
    if result.returncode != 0:
        # Diagnostics belong on stderr so they do not pollute piped stdout.
        print(result.stderr, file=sys.stderr)
        raise SystemExit(f"hf download failed (exit {result.returncode})")
    return cache_dir
22 changes: 1 addition & 21 deletions underwriter/scripts/build_bbq_bias.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from _common import git_sha as _git_sha, hf_commit_sha as _hf_commit_sha

import pandas as pd
import yaml
Expand Down Expand Up @@ -66,27 +67,6 @@
# ── Helpers ────────────────────────────────────────────────────────────────────


def _git_sha() -> str:
try:
return subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"], text=True
).strip()
except Exception:
return "unknown"


def _hf_commit_sha(repo: str) -> str:
"""Resolve the HEAD commit SHA of a HF dataset repo via the API."""
try:
import urllib.request
url = f"https://huggingface.co/api/datasets/{repo}"
with urllib.request.urlopen(url, timeout=10) as r:
data = json.loads(r.read())
return data.get("sha", "unknown")
except Exception:
return "unknown"


def _label_to_letter(label: int) -> str:
return chr(ord("A") + label)

Expand Down
35 changes: 1 addition & 34 deletions underwriter/scripts/build_discrimeval_bias.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@

import json
import random
import subprocess
import tempfile
import textwrap
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from _common import git_sha as _git_sha, hf_commit_sha as _hf_commit_sha, hf_download as _download

import yaml

Expand All @@ -58,39 +58,6 @@
# ── Helpers ────────────────────────────────────────────────────────────────────


def _git_sha() -> str:
try:
return subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"], text=True
).strip()
except Exception:
return "unknown"


def _hf_commit_sha(repo: str) -> str:
try:
import urllib.request
url = f"https://huggingface.co/api/datasets/{repo}"
with urllib.request.urlopen(url, timeout=10) as r:
data = json.loads(r.read())
return data.get("sha", "unknown")
except Exception:
return "unknown"


def _download(repo: str, cache_dir: Path) -> Path:
print(f" hf download --repo-type dataset {repo} …")
result = subprocess.run(
["hf", "download", "--repo-type", "dataset", repo,
"--local-dir", str(cache_dir)],
capture_output=True, text=True,
)
if result.returncode != 0:
print(result.stderr)
raise SystemExit(f"hf download failed (exit {result.returncode})")
return cache_dir


def _sample_identities(
rows: list[dict], n: int, rng: random.Random
) -> list[dict]:
Expand Down
21 changes: 1 addition & 20 deletions underwriter/scripts/build_halueval_factual.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import textwrap
from datetime import datetime, timezone
from pathlib import Path
from _common import git_sha as _git_sha, hf_commit_sha as _hf_commit_sha

import pandas as pd
import yaml
Expand All @@ -39,26 +40,6 @@
# ── Helpers ────────────────────────────────────────────────────────────────────


def _git_sha() -> str:
try:
return subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"], text=True
).strip()
except Exception:
return "unknown"


def _hf_commit_sha(repo: str) -> str:
try:
import urllib.request
url = f"https://huggingface.co/api/datasets/{repo}"
with urllib.request.urlopen(url, timeout=10) as r:
data = json.loads(r.read())
return data.get("sha", "unknown")
except Exception:
return "unknown"


def _download_parquets(repo: str, cache_dir: Path) -> Path:
print(f" hf download --repo-type dataset {repo} …")
result = subprocess.run(
Expand Down
21 changes: 1 addition & 20 deletions underwriter/scripts/build_medmcqa_factual.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from _common import git_sha as _git_sha, hf_commit_sha as _hf_commit_sha

import pandas as pd
import yaml
Expand All @@ -47,26 +48,6 @@
# ── Helpers ────────────────────────────────────────────────────────────────────


def _git_sha() -> str:
try:
return subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"], text=True
).strip()
except Exception:
return "unknown"


def _hf_commit_sha(repo: str) -> str:
try:
import urllib.request
url = f"https://huggingface.co/api/datasets/{repo}"
with urllib.request.urlopen(url, timeout=10) as r:
data = json.loads(r.read())
return data.get("sha", "unknown")
except Exception:
return "unknown"


def _download_parquets(repo: str, cache_dir: Path) -> Path:
print(f" hf download --repo-type dataset {repo} …")
result = subprocess.run(
Expand Down
35 changes: 1 addition & 34 deletions underwriter/scripts/build_orbench_safety.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@

import json
import random
import subprocess
import tempfile
import textwrap
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from _common import git_sha as _git_sha, hf_commit_sha as _hf_commit_sha, hf_download as _download

import pandas as pd
import yaml
Expand Down Expand Up @@ -55,39 +55,6 @@
# ── Helpers ────────────────────────────────────────────────────────────────────


def _git_sha() -> str:
try:
return subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"], text=True
).strip()
except Exception:
return "unknown"


def _hf_commit_sha(repo: str) -> str:
try:
import urllib.request
url = f"https://huggingface.co/api/datasets/{repo}"
with urllib.request.urlopen(url, timeout=10) as r:
data = json.loads(r.read())
return data.get("sha", "unknown")
except Exception:
return "unknown"


def _download(repo: str, cache_dir: Path) -> Path:
print(f" hf download --repo-type dataset {repo} …")
result = subprocess.run(
["hf", "download", "--repo-type", "dataset", repo,
"--local-dir", str(cache_dir)],
capture_output=True, text=True,
)
if result.returncode != 0:
print(result.stderr)
raise SystemExit(f"hf download failed (exit {result.returncode})")
return cache_dir


def _sample_balanced(df: pd.DataFrame, n_total: int, n_per_cat: int, rng: random.Random) -> pd.DataFrame:
"""Sample n_per_cat rows per category, top up globally if needed."""
buckets: dict[str, list[int]] = defaultdict(list)
Expand Down
11 changes: 1 addition & 10 deletions underwriter/scripts/build_synthetic_pii_sensitive.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@

import json
import random
import subprocess
import textwrap
from datetime import datetime, timezone
from pathlib import Path
from _common import git_sha as _git_sha

import yaml

Expand Down Expand Up @@ -271,15 +271,6 @@ def _financial_item(i: int, rng: random.Random) -> dict:
# ── Main ───────────────────────────────────────────────────────────────────────


def _git_sha() -> str:
try:
return subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"], text=True
).strip()
except Exception:
return "unknown"


BUILDERS = [
_ssn_item, _card_item, _medical_item, _api_key_item,
_contact_item, _insurance_item, _legal_item, _financial_item,
Expand Down
35 changes: 1 addition & 34 deletions underwriter/scripts/build_tensortrust_sensitive.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@

import json
import random
import subprocess
import tempfile
import textwrap
from datetime import datetime, timezone
from pathlib import Path
from _common import git_sha as _git_sha, hf_commit_sha as _hf_commit_sha, hf_download as _download

import yaml

Expand All @@ -56,39 +56,6 @@
# ── Helpers ────────────────────────────────────────────────────────────────────


def _git_sha() -> str:
try:
return subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"], text=True
).strip()
except Exception:
return "unknown"


def _hf_commit_sha(repo: str) -> str:
try:
import urllib.request
url = f"https://huggingface.co/api/datasets/{repo}"
with urllib.request.urlopen(url, timeout=10) as r:
data = json.loads(r.read())
return data.get("sha", "unknown")
except Exception:
return "unknown"


def _download(repo: str, cache_dir: Path) -> Path:
print(f" hf download --repo-type dataset {repo} …")
result = subprocess.run(
["hf", "download", "--repo-type", "dataset", repo,
"--local-dir", str(cache_dir)],
capture_output=True, text=True,
)
if result.returncode != 0:
print(result.stderr)
raise SystemExit(f"hf download failed (exit {result.returncode})")
return cache_dir


def _load_jsonl(path: Path) -> list[dict]:
with open(path) as f:
return [json.loads(line) for line in f if line.strip()]
Expand Down
5 changes: 1 addition & 4 deletions underwriter/underwriter/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,7 @@ class SuiteCard(BaseModel):
def _load_file(path: Path) -> tuple[SuiteCard, list[PromptItem]]:
data = yaml.safe_load(path.read_text())
suite, axis = data["suite"], data["axis"]
items = [
PromptItem(suite=suite, axis=axis, **{k: v for k, v in raw.items()})
for raw in data["items"]
]
items = [PromptItem(suite=suite, axis=axis, **raw) for raw in data["items"]]
card = SuiteCard(
suite=suite,
axis=axis,
Expand Down
Loading