diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..c51dd50 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,45 @@ +name: CodeQL + +on: + push: + branches: [main] + pull_request: + branches: [main] + schedule: + - cron: '45 5 * * 0' + workflow_dispatch: + +permissions: {} + +jobs: + analyze: + name: Analyze Python + runs-on: ubuntu-latest + timeout-minutes: 15 + permissions: + contents: read + security-events: write + actions: read + + steps: + - name: Harden the runner + uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594 # v2.16.0 + with: + egress-policy: audit + + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Initialize CodeQL + uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + with: + languages: python + build-mode: none + queries: security-extended,security-and-quality + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + with: + category: "/language:python" diff --git a/.github/workflows/gitleaks.yml b/.github/workflows/gitleaks.yml new file mode 100644 index 0000000..b31f990 --- /dev/null +++ b/.github/workflows/gitleaks.yml @@ -0,0 +1,37 @@ +name: Gitleaks + +on: + pull_request: + branches: [main] + push: + branches: [main] + schedule: + - cron: '0 3 * * *' + workflow_dispatch: + +permissions: {} + +jobs: + gitleaks: + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + actions: read + + steps: + - name: Harden the runner + uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594 # v2.16.0 + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + fetch-depth: 0 + + - name: Run Gitleaks + uses: 
gitleaks/gitleaks-action@ff98106e4c7b2bc287b24eaf42907196329070c7 # v2.3.9 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml new file mode 100644 index 0000000..1f4e4ba --- /dev/null +++ b/.github/workflows/mypy.yml @@ -0,0 +1,38 @@ +name: Mypy + +on: + pull_request: + branches: [main] + push: + branches: [main] + workflow_dispatch: + +permissions: + contents: read + +jobs: + mypy: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Harden the runner + uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594 # v2.16.0 + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install mypy pyyaml types-PyYAML huggingface_hub + + - name: Run mypy + run: mypy --config-file=pyproject.toml diff --git a/.github/workflows/ossf-scorecard.yml b/.github/workflows/ossf-scorecard.yml new file mode 100644 index 0000000..cd18d33 --- /dev/null +++ b/.github/workflows/ossf-scorecard.yml @@ -0,0 +1,45 @@ +name: OSSF Scorecard + +on: + branch_protection_rule: + schedule: + - cron: '0 8 * * 1' + push: + branches: [main] + workflow_dispatch: + +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + security-events: write + id-token: write + contents: read + actions: read + + steps: + - name: Harden the runner + uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594 # v2.16.0 + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Run OSSF Scorecard + uses: 
ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 + with: + results_file: results.sarif + results_format: sarif + publish_results: true + + - name: Upload SARIF + uses: github/codeql-action/upload-sarif@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + with: + sarif_file: results.sarif + category: ossf-scorecard diff --git a/.github/workflows/osv-scanner.yml b/.github/workflows/osv-scanner.yml new file mode 100644 index 0000000..ca44021 --- /dev/null +++ b/.github/workflows/osv-scanner.yml @@ -0,0 +1,26 @@ +name: OSV-Scanner + +on: + pull_request: + branches: [main] + push: + branches: [main] + schedule: + - cron: '39 12 * * 1' + workflow_dispatch: + +permissions: {} + +jobs: + scan: + permissions: + security-events: write + contents: read + actions: read + # yamllint disable-line rule:line-length + uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@c5996e0193a3df57d695c1b8a1dec2a4c62e8730" # v2.3.3 + with: + scan-args: |- + -r + --skip-git + ./ diff --git a/.github/workflows/semgrep.yml b/.github/workflows/semgrep.yml new file mode 100644 index 0000000..719eef9 --- /dev/null +++ b/.github/workflows/semgrep.yml @@ -0,0 +1,60 @@ +name: Semgrep + +on: + pull_request: + branches: [main] + push: + branches: [main] + schedule: + - cron: '0 4 * * 1' + workflow_dispatch: + +permissions: {} + +jobs: + semgrep: + name: Semgrep scan + runs-on: ubuntu-latest + timeout-minutes: 15 + permissions: + contents: read + security-events: write + actions: read + + steps: + - name: Harden the runner + uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594 # v2.16.0 + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: '3.11' + + - name: Install Semgrep + run: pip install semgrep + 
+ - name: Run Semgrep + run: | + semgrep scan \ + --config p/default \ + --config p/python \ + --config p/security-audit \ + --config p/secrets \ + --config p/owasp-top-ten \ + --sarif --output=semgrep.sarif \ + --error + continue-on-error: true + + - name: Upload SARIF + if: always() + uses: github/codeql-action/upload-sarif@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + with: + sarif_file: semgrep.sarif + category: semgrep diff --git a/.github/workflows/zizmor.yml b/.github/workflows/zizmor.yml new file mode 100644 index 0000000..29721bb --- /dev/null +++ b/.github/workflows/zizmor.yml @@ -0,0 +1,45 @@ +name: Zizmor + +on: + pull_request: + paths: + - '.github/workflows/**' + push: + branches: [main] + paths: + - '.github/workflows/**' + schedule: + - cron: '0 9 * * 1' + workflow_dispatch: + +permissions: + contents: read + +jobs: + zizmor: + name: Lint workflows + runs-on: ubuntu-latest + timeout-minutes: 10 + permissions: + contents: read + security-events: write + actions: read + + steps: + - name: Harden Runner + uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594 # v2.16.0 + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Run zizmor + uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2 + with: + inputs: .github/workflows/ + min-severity: low + advanced-security: true + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.yamllint.yaml b/.yamllint.yaml index b8431cd..74a9ca2 100644 --- a/.yamllint.yaml +++ b/.yamllint.yaml @@ -2,13 +2,17 @@ extends: default rules: line-length: - max: 200 - level: warning + max: 120 + level: error document-start: disable truthy: check-keys: false + level: error comments: min-spaces-from-content: 1 indentation: spaces: 2 indent-sequences: consistent + empty-lines: + max: 2 + trailing-spaces: enable diff --git a/pyproject.toml b/pyproject.toml new file 
mode 100644 index 0000000..f07b3c0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,58 @@ +[project] +name = "ldr-benchmarks" +version = "0.0.0" +description = "Community benchmark scripts for Local Deep Research" +requires-python = ">=3.11" + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "UP", # pyupgrade + "SIM", # flake8-simplify + "S", # flake8-bandit (security) + "C4", # flake8-comprehensions + "RUF", # ruff-specific + "PTH", # use pathlib + "PL", # pylint + "TRY", # tryceratops +] +ignore = [ + "PLR0913", # too many arguments + "PLR2004", # magic value comparison + "TRY003", # long exception messages + "S603", # subprocess-without-shell-equals-true (we use list args) + "S607", # start-process-with-partial-path (git is on PATH) +] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["S101", "PLR2004"] # asserts and magic values OK in tests +"scripts/validate_yamls.py" = ["PLR0912", "PLR0915"] # validator naturally branchy + +[tool.mypy] +python_version = "3.11" +files = ["scripts"] +warn_unused_configs = true +warn_redundant_casts = true +warn_unused_ignores = true +no_implicit_optional = true +check_untyped_defs = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +disallow_untyped_decorators = true +warn_return_any = true + +[[tool.mypy.overrides]] +module = "yaml" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "huggingface_hub" +ignore_missing_imports = true diff --git a/results/simpleqa/source-based/serper/qwen3-4b_2026-02-18.yaml b/results/simpleqa/source-based/serper/qwen3-4b_2026-02-18.yaml index 9a5088a..dff23a3 100644 --- a/results/simpleqa/source-based/serper/qwen3-4b_2026-02-18.yaml +++ b/results/simpleqa/source-based/serper/qwen3-4b_2026-02-18.yaml @@ -36,7 +36,7 @@ configuration: temperature: 0.7 max_tokens: 30000 # Current setting at download time 
local_provider_context_window_size: 4096 # Current setting at download time - context_window_unrestricted: Yes # Current setting at download time + context_window_unrestricted: true # Current setting at download time # Versions versions: @@ -46,8 +46,8 @@ versions: # Test Details test_details: date_tested: 2026-02-18 - rate_limiting_issues: no # yes/no - search_failures: no # number of failed searches, if any + rate_limiting_issues: false # true/false + search_failures: 0 # number of failed searches, if any # Notes notes: | diff --git a/scripts/build_leaderboards.py b/scripts/build_leaderboards.py index fd90b03..a66307a 100644 --- a/scripts/build_leaderboards.py +++ b/scripts/build_leaderboards.py @@ -21,7 +21,9 @@ import re import subprocess import sys +from collections.abc import Iterator from pathlib import Path +from typing import Any, TypedDict try: import yaml @@ -29,6 +31,12 @@ sys.exit("Missing dependency: pip install pyyaml") +class BenchmarkEntry(TypedDict): + canonical_id: str + path_slug: str + accepted_names: set[str] + + def infer_contributor_from_git(path: Path) -> str: """Return the author name of the commit that first added `path` to git, or an empty string if git is unavailable or the file is not tracked. @@ -68,7 +76,7 @@ def infer_contributor_from_git(path: Path) -> str: # Whitelist mirroring LDR's DatasetRegistry. Must stay in sync with # scripts/validate_yamls.py. Keeping it duplicated rather than importing # so each script stays runnable standalone.
-BENCHMARKS = [ +BENCHMARKS: list[BenchmarkEntry] = [ { "canonical_id": "simpleqa", "path_slug": "simpleqa", @@ -132,11 +140,11 @@ def lookup_benchmark_slug(name: str) -> str: ] -def parse_accuracy(raw) -> tuple[float | None, int | None, int | None]: +def parse_accuracy(raw: object) -> tuple[float | None, int | None, int | None]: """Return (percent, correct, total) from fields like '91.2% (182/200)'.""" if raw is None: return None, None, None - if isinstance(raw, (int, float)): + if isinstance(raw, int | float): return float(raw), None, None s = str(raw).strip() pct_match = re.search(r"([\d.]+)\s*%", s) @@ -149,7 +157,9 @@ def parse_accuracy(raw) -> tuple[float | None, int | None, int | None]: return pct, correct, total -def iter_strategy_blocks(results_block: dict): +def iter_strategy_blocks( + results_block: dict[str, Any], +) -> Iterator[tuple[str, dict[str, Any]]]: """Yield (strategy_key, strategy_dict) for every strategy in results.""" for key, value in results_block.items(): if key in RESERVED_RESULT_KEYS: @@ -158,7 +168,7 @@ def iter_strategy_blocks(results_block: dict): yield key, value -def rows_from_yaml(path: Path) -> list[dict]: +def rows_from_yaml(path: Path) -> list[dict[str, Any]]: try: data = yaml.safe_load(path.read_text(encoding="utf-8")) except Exception as e: @@ -200,7 +210,7 @@ def rows_from_yaml(path: Path) -> list[dict]: contributor = "" contributor_source = "" - rows = [] + rows: list[dict[str, Any]] = [] for strategy_key, strategy_block in iter_strategy_blocks(results_block): pct, correct, total = parse_accuracy(strategy_block.get("accuracy")) rows.append({ @@ -230,7 +240,10 @@ def rows_from_yaml(path: Path) -> list[dict]: "date_tested": date_tested, "contributor": contributor, "contributor_source": contributor_source, - "notes": (data.get("notes", "") or "").strip().splitlines()[0] if data.get("notes") else "", + "notes": ( + (data.get("notes", "") or "").strip().splitlines()[0] + if data.get("notes") else "" + ), "source_file": 
str(path.as_posix()), }) if not rows: @@ -242,12 +255,13 @@ def slugify(name: str) -> str: return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-") or "unknown" -def write_csv(path: Path, rows: list[dict]) -> None: +def write_csv(path: Path, rows: list[dict[str, Any]]) -> None: path.parent.mkdir(parents=True, exist_ok=True) - def sort_key(r): + + def sort_key(r: dict[str, Any]) -> tuple[float, str, str]: pct = r["accuracy_pct"] pct_val = -float(pct) if pct not in ("", None) else 1.0 - return (pct_val, str(r.get("date_tested", "")), r.get("model", "")) + return (pct_val, str(r.get("date_tested", "")), str(r.get("model", ""))) rows_sorted = sorted(rows, key=sort_key) with path.open("w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=COLUMNS) @@ -272,8 +286,8 @@ def main() -> int: write_csv(args.out_dir / "all.csv", []) return 0 - all_rows: list[dict] = [] - per_dataset: dict[str, list[dict]] = {} + all_rows: list[dict[str, Any]] = [] + per_dataset: dict[str, list[dict[str, Any]]] = {} for path in yaml_files: for row in rows_from_yaml(path): all_rows.append(row) diff --git a/scripts/validate_yamls.py b/scripts/validate_yamls.py index 9b355fa..c214195 100644 --- a/scripts/validate_yamls.py +++ b/scripts/validate_yamls.py @@ -23,6 +23,7 @@ import re import sys from pathlib import Path +from typing import TypedDict try: import yaml @@ -30,13 +31,20 @@ sys.exit("Missing dependency: pip install pyyaml") +class BenchmarkEntry(TypedDict): + canonical_id: str + path_slug: str + accepted_names: set[str] + restricted: bool + + # Whitelist of supported benchmarks. Mirrors LDR's DatasetRegistry # (src/local_deep_research/benchmarks/datasets/__init__.py). To add a # new benchmark here, add its canonical_id to match LDR's registry, # list the display names contributors may write in `results.dataset:`, # set the path_slug used in results/{slug}/..., and set restricted=True # if per-question examples must not be shared publicly. 
-BENCHMARKS = [ +BENCHMARKS: list[BenchmarkEntry] = [ { "canonical_id": "simpleqa", "path_slug": "simpleqa", @@ -57,22 +65,15 @@ }, ] -# Derived lookup tables. -_BENCHMARK_BY_NAME: dict[str, dict] = {} -for _b in BENCHMARKS: - for _n in _b["accepted_names"]: - _BENCHMARK_BY_NAME[_n] = _b - _BENCHMARK_BY_NAME[_b["path_slug"]] = _b - -def lookup_benchmark(name: str) -> dict | None: +def lookup_benchmark(name: str) -> BenchmarkEntry | None: """Look up a benchmark whitelist entry by any accepted form of the name.""" if not name: return None key = re.sub(r"[^a-z0-9]+", "", str(name).lower()) # Try each entry's normalized accepted names + path_slug. for b in BENCHMARKS: - candidates = set(b["accepted_names"]) | {b["path_slug"], b["canonical_id"]} + candidates = b["accepted_names"] | {b["path_slug"], b["canonical_id"]} if any(re.sub(r"[^a-z0-9]+", "", c.lower()) == key for c in candidates): return b return None @@ -95,10 +96,10 @@ def slugify(name: str) -> str: return re.sub(r"[^a-z0-9]+", "-", str(name).lower()).strip("-") -def parse_accuracy(raw) -> bool: +def parse_accuracy(raw: object) -> bool: if raw is None: return False - if isinstance(raw, (int, float)): + if isinstance(raw, int | float): return True return bool(re.search(r"[\d.]+", str(raw))) @@ -118,9 +119,9 @@ def check_file(path: Path, results_root: Path) -> list[str]: try: data = yaml.safe_load(text) except yaml.YAMLError as e: - return errors + [f"invalid YAML: {e}"] + return [*errors, f"invalid YAML: {e}"] if not isinstance(data, dict): - return errors + ["top-level YAML must be a mapping"] + return [*errors, "top-level YAML must be a mapping"] for key in REQUIRED_TOP_LEVEL: if key not in data: @@ -143,8 +144,8 @@ def check_file(path: Path, results_root: Path) -> list[str]: f"BENCHMARKS in scripts/validate_yamls.py." 
) - strategy_blocks = [ - k for k, v in results_block.items() + strategy_blocks: list[str] = [ + str(k) for k, v in results_block.items() if k not in RESERVED_RESULT_KEYS and isinstance(v, dict) ] if not strategy_blocks: @@ -155,7 +156,10 @@ def check_file(path: Path, results_root: Path) -> list[str]: if "accuracy" not in block: errors.append(f"results.{strat}.accuracy is missing") elif not parse_accuracy(block.get("accuracy")): - errors.append(f"results.{strat}.accuracy could not be parsed: {block.get('accuracy')!r}") + errors.append( + f"results.{strat}.accuracy could not be parsed: " + f"{block.get('accuracy')!r}" + ) # Path convention: results/{dataset}/{strategy}/{search_engine}/{file}.yaml try: @@ -196,13 +200,12 @@ def check_file(path: Path, results_root: Path) -> list[str]: ) # Restricted benchmarks must not contain examples - if benchmark_entry and benchmark_entry["restricted"]: - if "examples" in data: - errors.append( - f"dataset '{benchmark_entry['canonical_id']}' is restricted — " - f"per-question examples are not allowed for this benchmark. " - f"Remove the 'examples:' block before submitting." - ) + if benchmark_entry and benchmark_entry["restricted"] and "examples" in data: + errors.append( + f"dataset '{benchmark_entry['canonical_id']}' is restricted — " + f"per-question examples are not allowed for this benchmark. " + f"Remove the 'examples:' block before submitting." + ) return errors @@ -220,7 +223,10 @@ def main() -> int: if not args.results_dir.exists(): print(f"results dir not found: {args.results_dir}", file=sys.stderr) return 1 - yaml_files = sorted(args.results_dir.rglob("*.yaml")) + sorted(args.results_dir.rglob("*.yml")) + yaml_files = ( + sorted(args.results_dir.rglob("*.yaml")) + + sorted(args.results_dir.rglob("*.yml")) + ) if not yaml_files: print("no YAML files to validate")