diff --git a/.gitignore b/.gitignore index 77e347cf5..536877328 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ bert.pt.json work runs *.parquet +!data/titaneval/*.parquet *.json cache/ fastspeech_output diff --git a/TASK_NAMING.md b/TASK_NAMING.md index 1c20f44a1..1915e6bf6 100644 --- a/TASK_NAMING.md +++ b/TASK_NAMING.md @@ -1,292 +1,651 @@ # Task Naming Conventions -Every task name ends with an explicit metric-type suffix. There are no bare/default names. +Every task name ends with an explicit metric-type suffix and a shot count: `{task}:{suffix}|{n_shot}`. -## Evaluation Suites +## Task Overview -### Table 1 — BPB Prepare Suite +**53 unique English tasks** (`:cf`/`:mcf`/`:gen` variants of the same task count as one): -Tasks used to measure BPB during pre-training. All tasks run in logprob mode (`generation_size=-1`). CF tasks also produce `acc` and `acc_norm` in the same pass. +| Category | # | Task names | +|---|---|---| +| Code BPB (§3) | 3 | `humaneval`, `mbpp`, `mt_mbpp` | +| Math (§3+§4) | 2 | `math` (BPB+gen), `math_500` | +| CoT Reasoning (§4) | 5 | `gsm8k`, `gsm_plus`, `gsm_symbolic`, `bigbench_hard`, `agieval_eng_em` | +| English MC QA (§5) | 31 | `mmlu`, `mmlu_pro`, `arc`, `commonsenseqa`, `siqa`, `piqa`, `sciq`, `hellaswag`, `winogrande`, `swag`, `openbookqa`, `qasc`, `boolq`, `med_mcqa`, `med_qa`, `pubmedqa`, `headqa`, `gpqa`, `jeopardy_mc`, `truthfulqa`, `cybermetric`, `secqa`, `mascqa`, `formationeval`, `teleqna`, `labbench`, `preflight`, `chembench`, `esgenius`, `xfinbench`, `geobench` | +| English GenQA (§6) | 10 | `coqa`, `drop`, `jeopardy`, `natural_questions`, `squad`, `squad_v2`, `triviaqa`, `popqa`, `wikifact`, `simpleqa` | +| Lambada & Basic Skills (§7) | 2 | `lambada`, `basic_skills` | +| **Total** | **53** | | +--- + +--- + +## 1. 
Suffix Reference + +| Suffix | `generation_size` | Metrics reported | Description | +|--------|-------------------|-----------------|-------------| +| `:cf` | `-1` | `acc`, `acc_norm`, `target_bpb`* | Logprob on full answer texts; BPB merged in | +| `:mcf` | `-1` | `acc`, `acc_norm` | Logprob on label tokens only (`A`, `B`, …) | +| `:mcf_em` | `1` | `em` | Greedy-decode the label token, exact match | +| `:gen` | task-specific | `f1`, `em` (normalized) | Generate free text; scored with normalized F1 + EM | +| `:bpb` | `-1` | `target_bpb` | Standalone BPB; used for code, math, or decoupled from `:gen` / `:cf` | + +\*BPB merged into `:cf` applies to **English MC QA tasks (§5) only**. For lambada and basic_skills (§7), BPB uses a different prompt and is a separate `:bpb` config — `:cf` for those tasks does **not** include BPB. + +**Gen tasks**: `:gen` and `:bpb` are separate configs (different prompts). EM and F1 for `:gen` use +`harness_triviaqa_normalizer` (lowercase + remove punctuation) on both gold and prediction. +Exception: `drop:gen` uses `Metrics.drop` (span/number/date-aware normalization). + +--- + +## 2. How to Run (CLI) + +```bash +# Single task +lighteval litellm config.yaml "arc:challenge:cf|5" + +# All variants of one task +lighteval litellm config.yaml "arc:challenge|5" + +# All subsets of one task + one metric +lighteval litellm config.yaml "mmlu:cf|5" # 57 subsets +lighteval litellm config.yaml "wikifact:gen|5" # 81 relation subsets + +# All subsets × all metrics +lighteval litellm config.yaml "mmlu|5" # 57 × 3 = 171 tasks +lighteval litellm config.yaml "arc|5" # 2 × 3 = 6 tasks + +# Multilingual (requires --load-multilingual flag in runner) +lighteval litellm config.yaml "global_mmlu:cf|5" +lighteval litellm config.yaml "mlmm_arc:deu:mcf|5" ``` -# Code + +--- + +## 3. Code & Math BPB Tasks + +BPB over the gold continuation only (`generation_size=-1`). No accuracy metric. 
+ +| Task | Dataset | Eval | FS | ICL | +|---|---|---|---|---| +| `humaneval:{lang}:bpb` | `openai/openai_humaneval` | test | — | 3 | +| `mbpp:bpb` | `google-research-datasets/mbpp` (sanitized) | test | — | 3 | +| `mt_mbpp:{lang}:bpb` (17) | `allenai/multilingual_mbpp` | test | — | 3 | +| `math:{subset}:bpb` (7) | `EleutherAI/hendrycks_math` | test | — | 4 | + +**MT-MBPP languages (17):** `bash`, `c`, `cpp`, `csharp`, `go`, `haskell`, `java`, `javascript`, +`matlab`, `php`, `python`, `r`, `ruby`, `rust`, `scala`, `swift`, `typescript`. + +**Math subsets (7):** `algebra`, `counting_and_probability`, `geometry`, `intermediate_algebra`, +`number_theory`, `prealgebra`, `precalculus`. + +```bash humaneval:bpb|3 mbpp:bpb|3 -mt_mbpp:bpb|3 # expands to all 17 language subtasks - -# Math -math:algebra:bpb|4 -math:counting_and_probability:bpb|4 -math:geometry:bpb|4 -math:intermediate_algebra:bpb|4 -math:number_theory:bpb|4 -math:prealgebra:bpb|4 -math:precalculus:bpb|4 - -# QA — CF (BPB merged in) -arc:easy:cf|5 -arc:challenge:cf|5 -mmlu:cf|5 # expands to all 57 subsets -commonsenseqa:cf|5 -hellaswag:cf|5 -winogrande:cf|5 -siqa:cf|5 -piqa:cf|5 -sciq:cf|5 -basic_skills:cf|5 # expands to all 6 subsets -lambada:cf # 0-shot -med_mcqa:cf|5 - -# QA — standalone BPB (no fixed answer choices) -coqa:bpb # 0-shot -drop:bpb|5 -jeopardy:bpb|5 -natural_questions:bpb|5 -squad:bpb|5 +mt_mbpp:bpb|3 # all 17 languages +mt_mbpp:python:bpb|3 # single language +math:bpb|4 # all 7 subsets +math:algebra:bpb|4 # single subset ``` --- -### Table 2 — Full Evaluation Suite +## 4. Math & CoT Reasoning Tasks -#### Math (CoT generation) +All tasks in this section generate free text and score with extractive match metrics. -``` -gsm8k|8 +| Task | Dataset | Few-shot split | Rec. 
ICL | gen_size | Metric | +|---|---|---|---|---|---| +| `math:{subset}:gen` (7) | `EleutherAI/hendrycks_math` | `train` | 4 | 1024 | `expr_gold_metric` | +| `math_500` | `HuggingFaceH4/MATH-500` | `test`¹ | 4 | 1024 | `expr_gold_metric` | +| `gsm8k` | `openai/gsm8k` | `train` | 8 | 512 | `expr_gold_metric` | +| `gsm_plus` | `qintongli/GSM-Plus` | `testmini` | 8 | 512 | `expr_gold_metric` | +| `gsm_symbolic:{main,p1,p2}` | `apple/GSM-Symbolic` | `test`¹ | 8 | 512 | `expr_gold_metric` | +| `bigbench_hard:{subset}` (27) | `lukaemon/bbh` | `train` | 3 | 1024 | `bbh_cot_exact_match` | +| `agieval_eng_em:{subset}` (7) | `lighteval/agi_eval_en` | `dev` | 0 | 512 | `gpqa_instruct_metric` | + +¹ Test-only datasets (no train split): `math_500` (`HuggingFaceH4/MATH-500`), `gsm_symbolic` (`apple/GSM-Symbolic`). Few-shot examples are drawn from the test pool via random sampling (potential leakage). For leakage-free math few-shot, prefer `math:gen|4` (draws from `hendrycks_math` train) and `gsm8k|8` (draws from GSM8K train). + +> **4k context note:** `gsm8k|8` ≈ 3–4k tokens of context (borderline); `math:gen|4` ≈ 4k+ (too long). Use `gsm8k|4` and `math:gen|1` for 4k-ctx models. + +**`expr_gold_metric`** — extracts mathematical expressions / LaTeX (including `\boxed{}`) from model output; scores with symbolic equivalence. + +**`bbh_cot_exact_match`** — extracts text after "the answer is" from CoT output; exact match. + +**`gpqa_instruct_metric`** (AGIEval) — extracts letter choice (A–E) from CoT output. 
+ +```bash +math:gen|4 # all 7 subsets, 4-shot from train +math:algebra:gen|4 # single subset +math_500|4 # 4-shot drawn from test pool (see ¹) +gsm8k|8 # 8-shot from train (standard) +gsm_plus|8 # 8-shot from testmini gsm_symbolic:main|8 gsm_symbolic:p1|8 gsm_symbolic:p2|8 -math:algebra:gen|4 -math:counting_and_probability:gen|4 -math:geometry:gen|4 -math:intermediate_algebra:gen|4 -math:number_theory:gen|4 -math:prealgebra:gen|4 -math:precalculus:gen|4 -math_500 +bigbench_hard|3 # all 27 subsets +bigbench_hard:boolean_expressions|3 +agieval_eng_em|0 # all 7 subsets +agieval_eng_em:aqua_rat|0 # single subset ``` -#### STEM QA + Non-STEM QA (MC) +**Math subsets (7):** `algebra`, `counting_and_probability`, `geometry`, `intermediate_algebra`, +`number_theory`, `prealgebra`, `precalculus`. +**AGIEval (English) subsets (7):** `aqua_rat`, `logiqa-en`, `lsat-ar`, `lsat-lr`, `lsat-rc`, `sat-en`, `sat-math`. + +--- + +## 5. English MC QA Tasks + +All tasks in this section expose three variants: +- `:cf|N` → `acc`, `acc_norm`, `target_bpb` +- `:mcf|N` → `acc`, `acc_norm` +- `:mcf_em|N` → `em` + +Exceptions are noted per task. + +### MMLU + +| Dataset | Eval | FS | ICL | +|---|---|---|---| +| `lighteval/mmlu` | test | dev | 5 | + +57 subjects across STEM, Humanities, Social Sciences, Other. 
+ +```bash +mmlu:cf|5 # all 57 subsets +mmlu:mcf|5 +mmlu:mcf_em|5 +mmlu:abstract_algebra:cf|5 # single subject +mmlu|5 # all 57 × 3 variants ``` -arc:easy:mcf|5 -arc:challenge:mcf|5 -mmlu:mcf|5 # expands to all 57 subsets (STEM + Humanities + Social Sci + Other) -med_mcqa:mcf|5 -med_qa:mcf|5 -sciq:mcf|5 + +### MMLU-Pro + +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `TIGER-Lab/MMLU-Pro` | test | validation | 5 | up to 10 options | + +```bash +mmlu_pro:cf|5 +mmlu_pro:mcf|5 +mmlu_pro:mcf_em|5 +mmlu_pro:cot|5 # chain-of-thought + extractive match (separate config) +``` + +### ARC + +| Dataset | Eval | FS | ICL | +|---|---|---|---| +| `allenai/ai2_arc` | test | train | 5 | + +```bash +arc:cf|5 # both easy + challenge +arc:mcf|5 +arc:mcf_em|5 +arc:easy:cf|5 # single subset +arc:challenge:mcf_em|5 +``` + +### CommonsenseQA + +| Dataset | Eval | FS | ICL | +|---|---|---|---| +| `tau/commonsense_qa` | validation | train | 5 | + +```bash +commonsenseqa:cf|5 commonsenseqa:mcf|5 -piqa:mcf|5 +commonsenseqa:mcf_em|5 +``` + +### SIQA + +| Dataset | Eval | FS | ICL | +|---|---|---|---| +| `lighteval/siqa` | validation | train | 5 | + +```bash +siqa:cf|5 siqa:mcf|5 -jeopardy_mc:mcf|5 +siqa:mcf_em|5 ``` -#### GenQA / RC (completion/generation) +### PIQA + +| Dataset | Eval | FS | ICL | +|---|---|---|---| +| `lighteval/piqa` | validation | train | 5 | +```bash +piqa:cf|5 +piqa:mcf|5 +piqa:mcf_em|5 ``` -hellaswag:cf|5 # RC per-char norm -winogrande:cf|5 # RC unnormalized -lambada:cf # RC per-char norm, 0-shot -basic_skills:cf|5 # RC per-token norm, expands to all 6 subsets -drop:gen|5 -jeopardy:gen|5 -natural_questions:gen|5 -squad:gen|5 -coqa:gen # 0-shot + +### SciQ + +| Dataset | Eval | FS | ICL | +|---|---|---|---| +| `allenai/sciq` | test | train | 5 | + +```bash +sciq:cf|5 +sciq:mcf|5 +sciq:mcf_em|5 ``` -#### Held-out Suite +### HellaSwag + +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `Rowan/hellaswag` | validation | train | 5 | sentence 
completion | +```bash +hellaswag:cf|5 +hellaswag:mcf|5 +hellaswag:mcf_em|5 ``` -mmlu_pro:mcf|5 -bigbench_hard|3 # expands to all 27 BBH subsets + +### WinoGrande + +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `allenai/winogrande` (xl) | validation | train | 5 | cloze / pronoun resolution | + +```bash +winogrande:cf|5 +winogrande:mcf|5 +winogrande:mcf_em|5 +winogrande:bpb|5 # OLMO-style partial evaluation BPB (separate config) ``` ---- +### SWAG + +| Dataset | Eval | FS | ICL | +|---|---|---|---| +| `allenai/swag` (regular) | validation | train | 5 | -## Suffix Reference +```bash +swag:cf|5 +swag:mcf|5 +swag:mcf_em|5 +``` -| Suffix | Metric type | `generation_size` | Metrics reported | Description | -| --------- | --------------------------- | ------------------- | ------------------------------- | ---------------------------------------------------------------------------- | -| `:mcf_em` | Greedy generation (MC only) | `1`–`5` | `exact_match` | MC tasks only: generate label token, compare with EM | -| `:cf` | Completion formulation | `-1` (logprob only) | `acc`, `acc_norm`, `target_bpb` | Score full answer text via log p(choice\|context); BPB merged in for MC tasks | -| `:mcf` | Multiple-choice formulation | `-1` (logprob only) | `acc`, `acc_norm` | Score label tokens only (`A`, `B`, …) | -| `:bpb` | Bits-per-byte (standalone) | `-1` (logprob only) | `target_bpb` | Used only where CF is not applicable (MATH, free-form GenQA) | -| `:gen` | Greedy generation + F1/EM | `50`–`1024` | `f1`, `em` (task-specific) | Actual text generation; answer scored with F1 or extractive match | +### OpenBookQA -**Note on BPB for multiple-choice tasks**: BPB is **merged into `:cf`** for all MC tasks — running `:cf` produces `{acc, acc_norm, target_bpb}` in one pass. There are no standalone `:bpb` tasks for ARC, MMLU, HellaSwag, etc. 
+| Dataset | Eval | FS | ICL | +|---|---|---|---| +| `allenai/openbookqa` (main) | test | train | 5 | -## How to reference tasks in CLI +```bash +openbookqa:cf|5 +openbookqa:mcf|5 +openbookqa:mcf_em|5 +``` -Task names follow the pattern `::` (or `:` for single-subset tasks). +### QASC -Use the prefix before the first `:` as a superset to run all subsets at once: +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `allenai/qasc` | validation | train | 5 | 8-choice, requires two facts | +```bash +qasc:cf|5 +qasc:mcf|5 +qasc:mcf_em|5 ``` -# Single task -lighteval litellm config.yaml "arc:challenge:cf|5" -# All ARC variants with CF metric (2 subsets × 1 metric) -lighteval litellm config.yaml "arc:cf|5" +### BoolQ -# All metrics for one ARC subset -lighteval litellm config.yaml "arc:challenge|5" +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `lighteval/boolq_helm` | validation | train | 5 | binary yes/no | -# All MMLU subsets for a given metric (57 subsets) -lighteval litellm config.yaml "mmlu:cf|5" -lighteval litellm config.yaml "mmlu:mcf_em|5" -lighteval litellm config.yaml "mmlu:mcf|5" +```bash +boolq:cf|5 +boolq:mcf|5 +boolq:mcf_em|5 +``` -# All metrics for one MMLU subset -lighteval litellm config.yaml "mmlu:abstract_algebra|5" +### MedMCQA -# All MMLU tasks (all subsets × all metrics) -lighteval litellm config.yaml "mmlu|5" +| Dataset | Eval | FS | ICL | +|---|---|---|---| +| `lighteval/med_mcqa` | validation | train | 5 | -# All MT-MBPP language subtasks -lighteval litellm config.yaml "mt_mbpp:bpb|3" +```bash +med_mcqa:cf|5 +med_mcqa:mcf|5 +med_mcqa:mcf_em|5 ``` -## Task inventory (CLAUDE.md tasks) +### MedQA (USMLE) -### Code BPB tasks +| Dataset | Eval | FS | ICL | +|---|---|---|---| +| `bigbio/med_qa` (med_qa_en_source) | test | train | 5 | -| Task | Dataset | Subset | Eval split | ICL | Metric | -|------|---------|--------|------------|-----|--------| -| `humaneval:bpb` | `openai/openai_humaneval` | default | test | 3 | 
`target_bpb` | -| `mbpp:bpb` | `google-research-datasets/mbpp` | sanitized | test | 3 | `target_bpb` | -| `mt_mbpp:{lang}:bpb` (17) | `allenai/multilingual_mbpp` | `{lang}` | test | 3 | `target_bpb` | +```bash +med_qa:cf|5 +med_qa:mcf|5 +med_qa:mcf_em|5 +``` -**MT-MBPP superset**: `mt_mbpp:bpb|3` expands to all 17 language subtasks. +### PubMedQA -**17 languages**: `bash`, `c`, `cpp`, `csharp`, `go`, `haskell`, `java`, `javascript`, `matlab`, `php`, `python`, `r`, `ruby`, `rust`, `scala`, `swift`, `typescript`. +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `qiaojin/PubMedQA` (pqa_labeled) | train | train | 5 | 3-choice: yes/no/maybe | -### Math +```bash +pubmedqa:cf|5 +pubmedqa:mcf|5 +pubmedqa:mcf_em|5 +``` -| Task | Dataset | Eval split | ICL | Metric | -|------|---------|------------|-----|--------| -| `math:{subset}:bpb` (7) | `EleutherAI/hendrycks_math` | test | 4 | `target_bpb` | -| `math:{subset}:gen` (7) | `EleutherAI/hendrycks_math` | test | 4 | `expr_gold_metric` | +### HeadQA -Subsets: `algebra`, `counting_and_probability`, `geometry`, `intermediate_algebra`, `number_theory`, `prealgebra`, `precalculus`. +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `lighteval/headqa_harness` | test | train | 5 | en + es subsets | -### MMLU +```bash +headqa:en:cf|5 +headqa:es:cf|5 +headqa:cf|5 # both subsets +headqa:mcf|5 +headqa:mcf_em|5 +``` + +### GPQA (Diamond) + +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `Idavidrein/gpqa` (gpqa_diamond) | train | train | 0 | gated; choices shuffled by question hash | + +```bash +gpqa:diamond:cf|0 +gpqa:diamond:mcf|0 +gpqa:diamond:mcf_em|0 +gpqa:diamond|0 # all 3 variants +``` + +Note: `gpqa:diamond` (instruct CoT, `gpqa_instruct_pass_at_k`) and `gpqa:main` / `gpqa:extended` +(instruct reasoning) are separate configs for instruction-tuned evaluation. 
-| Superset | Expands to | # tasks | -|----------|------------|---------| -| `mmlu` | all subsets × all metrics | 171 (57×3) | -| `mmlu:cf` | `mmlu:{subset}:cf` for all 57 subsets | 57 | -| `mmlu:mcf_em` | `mmlu:{subset}:mcf_em` for all 57 subsets | 57 | -| `mmlu:mcf` | `mmlu:{subset}:mcf` for all 57 subsets | 57 | -| `mmlu_redux:cf` | all redux subsets with CF | 57 | - -Dataset: `lighteval/mmlu`. Each `:cf` task reports `{acc, acc_norm, target_bpb}`. - -### Multiple-choice QA tasks - -| Task | Dataset | Eval split | ICL | `:cf` metrics | `:mcf` metrics | -|------|---------|------------|-----|---------------|----------------| -| `arc:challenge` / `arc:easy` | `allenai/ai2_arc` | test | 5 | acc, acc_norm, bpb | acc, acc_norm | -| `commonsenseqa` | `tau/commonsense_qa` | validation | 5 | acc, acc_norm, bpb | acc, acc_norm | -| `hellaswag` | `Rowan/hellaswag` | validation | 5 | acc, acc_norm, bpb | acc, acc_norm | -| `winogrande` | `allenai/winogrande` (xl) | validation | 5 | acc, acc_norm, bpb | acc, acc_norm | -| `siqa` | `lighteval/siqa` | validation | 5 | acc, acc_norm, bpb | acc, acc_norm | -| `piqa` | `lighteval/piqa` | validation | 5 | acc, acc_norm, bpb | acc, acc_norm | -| `sciq` | `allenai/sciq` | test | 5 | acc, acc_norm, bpb | acc, acc_norm | -| `med_mcqa` | `lighteval/med_mcqa` | validation | 5 | acc, acc_norm, bpb | acc, acc_norm | -| `openbookqa` | `allenai/openbookqa` (main) | test | 5 | acc, acc_norm, bpb | acc, acc_norm | -| `jeopardy_mc:cf` | `allenai/jeopardy_mc` | test | 0 | acc, acc_norm, bpb | — | -| `jeopardy_mc:mcf` | `allenai/jeopardy_mc` | test | 0 | — | acc, acc_norm | - -Note: `siqa` and `piqa` use `lighteval/*` wrapper repos (same data as `allenai/social_i_qa` / `ybisk/piqa`); both sources require the script fallback in `download_dataset_worker`. +### Jeopardy MC + +OLMo Gen2MC — dedicated MC dataset derived from Jeopardy. For the generative form see `jeopardy` in §6. 
+ +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `allenai/jeopardy_mc` | test | — | 0 | CF + MCF only (no mcf_em) | + +```bash +jeopardy_mc:cf|0 +jeopardy_mc:mcf|0 +``` ### TruthfulQA (MC2) -| Task | Dataset | Eval split | ICL | Metric | -|------|---------|------------|-----|--------| -| `truthfulqa:mc2:cf` | `truthfulqa/truthful_qa` (multiple_choice) | validation | 0 (built-in primer) | `truthfulqa_mc2` (single score, logprob) | - -MC2 = normalized probability mass on the set of true answers. Reuses the exact `truthfulqa:mc` prompt + computation but emits only mc2; the two-key `truthfulqa:mc` (`truthfulqa_mc1` + `truthfulqa_mc2`) is also available. - -### Single-answer completion tasks (CF only) - -| Task | Dataset | Eval split | ICL | Metrics | -|------|---------|------------|-----|---------| -| `lambada:cf` | `EleutherAI/lambada_openai` | test | 0 | acc_norm (char), bpb | -| `basic_skills:{subset}:cf` (6) | `allenai/basic-skills` | validation | 5 | acc_norm (token), bpb | - -**Basic Skills subsets**: `arithmetic`, `string_operations`, `coding`, `logical_reasoning`, `common_knowledge`, `pattern`. - -**Lambada cloze variants** (perplexity only): - -| Task | Dataset | Prompt | -|------|---------|--------| -| `lambada:standard_cloze` | `cimec/lambada` | `{context} ____. ->` | -| `lambada:openai_cloze` | `EleutherAI/lambada_openai` | `{context} ____. 
->` | - -### Free-form GenQA tasks - -| Task | Dataset | Eval split | ICL | `:bpb` | `:gen` gen_size | `:gen` metrics | -|------|---------|------------|-----|--------|-----------------|----------------| -| `coqa` | `EleutherAI/coqa` (parquet) | validation | 0 | `target_bpb` | 50 | f1, em | -| `drop` | `lighteval/drop_harness` | validation | 5 | `target_bpb` | 100 | em, f1 (DROP) | -| `jeopardy` | `soldni/jeopardy` (mosaicml_gauntlet, 2117) | train | 5 | `target_bpb` | 50 | f1, em | -| `natural_questions` | `google-research-datasets/nq_open` | validation | 5 | `target_bpb` | 50 | f1, em | -| `squad` | `allenai/squad` (v1.1) | validation | 5 | `target_bpb` | 50 | f1, em | -| `squad_v2` | `rajpurkar/squad_v2` (answerable-only) | validation | 5 | `target_bpb` | 200 | f1, em | -| `triviaqa` | `mandarjoshi/trivia_qa` (rc.nocontext) | validation | 5 | `target_bpb` | 20 | f1, em | -| `popqa` | `akariasai/PopQA` | test | 5 | `target_bpb` | 8 | f1, em | -| `wikifact:{subset}` (81) | `lighteval/wikifact` | test | 5 | `target_bpb` | 8 | f1, em | - -Prompt formats: -- **CoQA**: `Passage: {story}\n\nFinal question:\n\nQuestion: {q}\nAnswer:` — stop `["\n\n"]` -- **DROP**: `Passage: {passage}\nQuestion: {question}\nAnswer:` — stop `["\n"]` -- **Jeopardy**: `Category: {cat}\nQuestion: {q}\nAnswer:` — stop `["\n\n", "Question:", "Category:"]` -- **NaturalQs**: `Question: {question}\nAnswer:` — stop `["Question:", "Q:", "\n\n"]` -- **SQuAD**: `Title: {title}\n\nBackground: {context}\n\nQuestion: {question}\n\nAnswer:` — stop `["Title:", "\n\n"]` -- **SQuAD v2**: QA template (answerable-only via `hf_filter`) — stop `["\n", "Question:", "question:"]` -- **TriviaQA**: `Question: {question}\nAnswer:` — stop `["\n", ".", ","]`; gold = canonical value + aliases -- **PopQA**: `{question} ` — stop `["\n"]`; gold = `possible_answers` aliases -- **WikiFact**: `{question} ` — stop `["\n"]`; 81 relation subsets — superset `wikifact:gen` / `wikifact:bpb` - -### CoT generation tasks - -| Task | 
Dataset | ICL | gen_size | Metric | Subsets | -|------|---------|-----|----------|--------|---------| -| `gsm8k` | `openai/gsm8k` | 8 | 512 | `expr_gold_metric` | — | -| `gsm_symbolic:{main,p1,p2}` | `apple/GSM-Symbolic` | 8 | 512 | `expr_gold_metric` | 3 | -| `math_500` | `HuggingFaceH4/MATH-500` | 0 | 1024 | `expr_gold_metric` | — | -| `bigbench_hard:{subset}` (27) | `lukaemon/bbh` | 3 | 1024 | em (after extraction) | 27 | -| `agieval_eng_em:{subset}` (7) | `lighteval/agi_eval_en` | 0 | 512 | `gpqa_instruct` (CoT, extractive) | 7 | - -**AGIEval (English) subsets**: `aqua_rat`, `logiqa-en`, `lsat-ar`, `lsat-lr`, `lsat-rc`, `sat-en`, `sat-math`. +| Dataset | Eval | FS | ICL | Metric | Notes | +|---|---|---|---|---|---| +| `truthfulqa/truthful_qa` (multiple_choice) | validation | — | 0 | `truthfulqa_mc2` | single score; built-in 5-QA primer | ---- +MC2 = normalized probability mass on the set of true answers. Higher is better. +The two-key variant `truthfulqa:mc` (reports both `truthfulqa_mc1` + `truthfulqa_mc2`) is also available. + +```bash +truthfulqa:mc2:cf|0 +``` -## Metric definitions +### CyberMetric + SecQA -**`:mcf_em`** — greedy decode (temperature=0), compare output to gold with exact_match. +| Task | Dataset | Eval | FS | ICL | +|---|---|---|---|---| +| `cybermetric` | `tihanyin/CyberMetric` | train | train | 0 | +| `secqa:v1` / `secqa:v2` | `zefang-liu/secqa` | test | test | 0 | -**`:cf`** (completion formulation) — score full candidate answer text: +```bash +cybermetric:cf|0 +cybermetric:mcf|0 +cybermetric:mcf_em|0 +secqa:v1:cf|0 +secqa:v2:cf|0 +secqa:cf|0 # both versions ``` -score_i = log p(answer_i | prompt) + +### MaScQA + +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `heegyu/mascqa` | test | test | 0 | choices embedded in question text | + +```bash +mascqa:cf|0 +mascqa:mcf|0 +mascqa:mcf_em|0 ``` -Prediction = argmax. Normalizations: per-char (`LogProbCharNorm`) → `acc_norm`. 
BPB also computed from gold choice logprob in the same pass. -**`:mcf`** (multiple-choice formulation) — prompt shows labeled options, score only the label token: +### FormationEval + +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `AlmazErmilov/FormationEval` | test | test | 0 | petroleum engineering | + +```bash +formationeval:cf|0 +formationeval:mcf|0 +formationeval:mcf_em|0 ``` -score_i = log p(" A" | prompt) # or " B", " C", " D" + +### TeleQnA + +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `netop/TeleQnA` | test | test | 0 | **gated** — must be pre-cached | + +```bash +teleqna:cf|0 +teleqna:mcf|0 +teleqna:mcf_em|0 ``` -Prediction = argmax label score. Reports `acc` and `acc_norm`. -**`:bpb`** (bits-per-byte, standalone) — no choice ranking, gold continuation only: +### LAB-Bench (TableQA) + +| Dataset | Eval | FS | ICL | Notes | +|---|---|---|---|---| +| `futurehouse/LAB-Bench` (TableQA) | train | train | 0 | biology; tables provided as images (text-only variant) | + +```bash +labbench:cf|0 +labbench:mcf|0 +labbench:mcf_em|0 ``` -BPB = -log2 p(gold | prompt) / bytes_utf8(gold) + +### TitanEval English Domain Tasks + +Loaded from local parquet (`data/titaneval/`). Source: TitanEval-MCQ benchmark suite. +Each task has its own file: `tasks/{task}.py` (e.g., `tasks/preflight.py`). +All expose `:cf` (acc + acc_norm_char + **BPB merged**), `:mcf` (acc + acc_norm_char), `:mcf_em` (exact match, greedy decode). + +> **Few-shot note:** These tasks have **test split only** — no dedicated few-shot split exists. +> The config sets `few_shots_split="test"`, `few_shots_select="random_sampling"` to allow +> CLI-level overrides, but running with `|N` (N > 0) draws examples from the test set itself +> (leakage risk). 
**Recommended: use `|0` (0-shot) for all TitanEval tasks.** + +| Task | Domain | Rows | ICL | Notes | +|---|---|---|---|---| +| `preflight` | Aviation safety | 300 | 0 | | +| `chembench` | Chemistry (analytical/organic/physical) | 2,542 | 0 | | +| `esgenius` | ESG / sustainability | 1,136 | 0 | | +| `xfinbench` | Finance (cross-lingual, EN subset) | 588 | 0 | 4 rows filtered (missing choices) | +| `geobench` | Geoscience | 1,390 | 0 | | + +```bash +preflight:cf|0 +preflight:mcf|0 +preflight:mcf_em|0 +chembench:cf|0 +chembench:mcf|0 +esgenius:cf|0 +xfinbench:cf|0 +geobench:cf|0 +``` + +--- + +## 6. English GenQA Tasks + +All tasks in this section expose two variants: +- `:gen|N` → `f1`, `em` (normalized: lowercase + remove punctuation on both gold and prediction) +- `:bpb|N` → `target_bpb` (decoupled: same query, scores only the first gold continuation) + +Exception: `drop:gen` uses `Metrics.drop` (span/number/date-aware normalization), not the standard normalized EM/F1. + +**OLMo Gen2MC note:** OLMo's Base Main Suite reformulates DROP, CoQA, SQuAD, NaturalQs, and Jeopardy as MC (Gen2MC, §A.4.2). Our codebase implements **gen-only** variants for all of these except Jeopardy — which has a dedicated MC dataset as `jeopardy_mc` (§5). 
+ +| Task | Dataset | Eval | FS | ICL | gen_size | Notes | +|---|---|---|---|---|---|---| +| `coqa` | `EleutherAI/coqa` | validation | eval | 0 | 50 | OLMo Gen2MC | +| `drop` | `lighteval/drop_harness` | validation | train | 5 | 100 | OLMo Gen2MC; uses `Metrics.drop` | +| `jeopardy` | `soldni/jeopardy` | train | train | 5 | 50 | OLMo Gen2MC; MC form → `jeopardy_mc` (§5) | +| `natural_questions` | `google-research-datasets/nq_open` | validation | train | 5 | 50 | OLMo Gen2MC | +| `squad` | `allenai/squad` (v1.1) | validation | train | 5 | 50 | OLMo Gen2MC | +| `squad_v2` | `rajpurkar/squad_v2` (answerable-only) | validation | train | 5 | 200 | OLMo Gen2MC | +| `triviaqa` | `mandarjoshi/trivia_qa` (rc.nocontext) | validation | train | 5 | 20 | | +| `popqa` | `akariasai/PopQA` | test | test | 5 | 8 | | +| `wikifact:{subset}` (81) | `lighteval/wikifact` | test | test | 5 | 8 | | +| `simpleqa` | `lighteval/SimpleQA` | test | few_shot | 0 | 50 | | + +```bash +# CoQA (0-shot conversation QA) +coqa:gen +coqa:bpb + +# DROP (discrete reasoning) +drop:gen|5 +drop:bpb|5 + +# Jeopardy (gen form; MC form → jeopardy_mc in §5) +jeopardy:gen|5 +jeopardy:bpb|5 + +# NaturalQuestions +natural_questions:gen|5 +natural_questions:bpb|5 + +# SQuAD v1.1 +squad:gen|5 +squad:bpb|5 + +# SQuAD v2 (unanswerable questions excluded via hf_filter) +squad_v2:gen|5 +squad_v2:bpb|5 + +# TriviaQA +triviaqa:gen|5 +triviaqa:bpb|5 + +# PopQA +popqa:gen|5 +popqa:bpb|5 + +# WikiFact (81 relation subsets) +wikifact:gen|5 # all 81 subsets +wikifact:bpb|5 # all 81 subsets +wikifact:author:gen|5 # single subset + +# SimpleQA +simpleqa:gen|0 +simpleqa:bpb|0 ``` -Lower is better. Used for MATH and free-form GenQA. For MC tasks, BPB is reported inside `:cf`. 
-**`:gen`** (greedy generation) — autoregressively decode up to `generation_size` tokens (temperature=0): +**Prompt formats:** +- CoQA: `Passage: {story}\n\nFinal question:\n\nQuestion: {q}\nAnswer:` — stop `["\n\n"]` +- DROP: `Passage: {passage}\nQuestion: {question}\nAnswer:` — stop `["\n\n", "Passage:", "Question:"]` +- Jeopardy: `Category: {cat}\nQuestion: {q}\nAnswer:` — stop `["\n\n", "Question:", "Category:"]` +- NaturalQs: `Question: {question}\nAnswer:` — stop `["Question:", "Q:", "\n\n"]` +- SQuAD: `Title: {title}\n\nBackground: {context}\n\nQuestion: {question}\n\nAnswer:` — stop `["Title:", "\n\n"]` +- SQuAD v2: QA template (same prompt as SQuAD) — stop `["\n", "Question:", "question:"]` +- TriviaQA: `Question: {question}\nAnswer:` — stop `["\n", ".", ","]`; all aliases as gold +- PopQA: `{question} ` — stop `["\n"]`; `possible_answers` list as gold +- WikiFact: `{question} ` — stop `["\n"]`; `references` list as gold +- SimpleQA: `Question: {question}\nAnswer:` — stop `["\n"]` + +--- + +## 7. Lambada & Basic Skills + +These tasks use rank-choice or cloze formulations. BPB is **decoupled** (separate `:bpb` config with a different prompt) — it is **not** merged into `:cf`. + +### Lambada + +| Dataset | Eval | ICL | Config | Metrics | +|---|---|---|---|---| +| `cimec/lambada` | test | 0 | `lambada:cf` | `acc_norm` (char-norm) | +| | | | `lambada:bpb` | `target_bpb` (decoupled) | +| | | | `lambada:standard_cloze` | `target_perplexity` | +| `EleutherAI/lambada_openai` | test | 0 | `lambada:openai_cloze` | `target_perplexity` | + +`lambada:cf` uses a distractor format (gold last word vs 3 sampled distractors, scored by char-norm logprob). +`lambada:bpb` scores the full passage continuation directly. + +```bash +lambada:cf # rank-choice, acc_norm +lambada:bpb # BPB decoupled ``` -y = argmax_v p(v | prompt, y_<t) […] +> **Few-shot note:** These tasks have **test split only** — no dedicated few-shot split exists. 
+> The config sets `few_shots_split="test"`, `few_shots_select="random_sampling"` to allow +> CLI-level overrides, but running with `|N` (N > 0) draws examples from the test set itself +> (leakage risk). **Recommended: use `|0` (0-shot) for all TitanEval tasks.** + +**Chinese (zh) — uses `Language.CHINESE` prompt template:** + +| Task | Domain | Rows | ICL | +|---|---|---|---| +| `camb` | Civil aviation maintenance | 7,969 | 0 | +| `jecqa` | Chinese law exam | 1,998 | 0 | +| `lexeval` | Legal evaluation | 10,920 | 0 | +| `aecbench` | Architectural / civil engineering | 6,386 | 0 | + +**French (fr) — uses `Language.FRENCH` prompt template:** + +| Task | Domain | Rows | ICL | +|---|---|---|---| +| `frenchmedmcqa` | French medical licensing exam | 3,105 | 0 | +| `mediqal` | French medical QA | 27,634 | 0 | + +**Arabic (ar) — uses `Language.ARABIC` prompt template:** + +| Task | Domain | Rows | ICL | +|---|---|---|---| +| `arabicmmlu` | Arabic MMLU | 14,455 | 0 | +| `arastem` | Arabic STEM | 10,819 | 0 | + +```bash +# Requires --load-multilingual; 0-shot recommended (test-only split) +camb:cf|0 +camb:mcf|0 +camb:mcf_em|0 +jecqa:cf|0 +lexeval:cf|0 +aecbench:cf|0 +frenchmedmcqa:cf|0 +mediqal:cf|0 +arabicmmlu:cf|0 +arastem:cf|0 +``` + +--- + ## Two-level averaging ### Level 1 — per-language average (automatic) @@ -540,6 +651,37 @@ xquad:bpb|5 # XQuAD, all 12 langs, BPB multi_wiki_qa:gen|0 # MultiWikiQA, all 54 slices (incl. 
eng) multi_wiki_qa:bpb|0 # MultiWikiQA, all 54 slices, BPB +# --- Multilingual / single-language MMLU-style --- +cmmlu:cf|5 # Chinese MMLU, all 67 subjects, CF (acc+norm+bpb) +cmmlu:mcf|5 # Chinese MMLU, all 67 subjects, MCF +cmmlu:mcf_em|5 # Chinese MMLU, all 67 subjects, greedy +cmmlu:agronomy:cf|5 # single subject + +turkishmmlu:cf|5 # Turkish MMLU, all 9 subjects, CF +turkishmmlu:mcf|5 +turkishmmlu:mcf_em|5 +turkishmmlu:biology:cf|5 # single subject + +exams:cf|0 # EXAMS multilingual, all 16 langs, CF +exams:mcf|0 +exams:mcf_em|0 +exams:fra:cf|0 # single language + +medexpqa:cf|0 # MedExpQA, all 4 langs (spa/fra/ita/eng), CF +medexpqa:mcf|0 +medexpqa:mcf_em|0 +medexpqa:spa:cf|0 # single language + +agrieval:cf|5 # AgriEval Chinese agriculture, CF +agrieval:mcf|5 +agrieval:mcf_em|5 +crop:cf|5 # CROP Chinese crop science, CF +crop:mcf|5 +crop:mcf_em|5 +fineval:cf|5 # FinEval Chinese finance, CF +fineval:mcf|5 +fineval:mcf_em|5 + # --- Translation --- wmt24pp|0 # WMT24++, all English-centric language slices wmt24pp:de_DE|0 # WMT24++, German slice (both directions) diff --git a/data/titaneval/aecbench.parquet b/data/titaneval/aecbench.parquet new file mode 100644 index 000000000..34fa55c56 --- /dev/null +++ b/data/titaneval/aecbench.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f957409cf2d768a39490558386d9057b011d05dc9c1a4d25eee379e6d96a2570 +size 626336 diff --git a/data/titaneval/arabicmmlu.parquet b/data/titaneval/arabicmmlu.parquet new file mode 100644 index 000000000..b03ccb8b4 --- /dev/null +++ b/data/titaneval/arabicmmlu.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4031ff341db38d4d0418b9fe86f4297cbb9b2ed69b7ce029fed4cf5222eaa98 +size 1682400 diff --git a/data/titaneval/arastem.parquet b/data/titaneval/arastem.parquet new file mode 100644 index 000000000..041d7922e --- /dev/null +++ b/data/titaneval/arastem.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:25b950241f8d13a63af185242104e19d5ceefe0a5a1c08016c271bb9df45526b +size 1108408 diff --git a/data/titaneval/camb.parquet b/data/titaneval/camb.parquet new file mode 100644 index 000000000..d612c4469 --- /dev/null +++ b/data/titaneval/camb.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a97e5dbfa8d1f4ded38acc60503b478fc2317d934a5447eebe078438500919c7 +size 779540 diff --git a/data/titaneval/chembench.parquet b/data/titaneval/chembench.parquet new file mode 100644 index 000000000..cb171ad10 --- /dev/null +++ b/data/titaneval/chembench.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc79af658739be361525b5a2aaecef42a077a6380f79872f30ba88b5b5ed4eab +size 344067 diff --git a/data/titaneval/cybermetric.parquet b/data/titaneval/cybermetric.parquet new file mode 100644 index 000000000..e5d3300a6 --- /dev/null +++ b/data/titaneval/cybermetric.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44c6ccd81a038464b03649993e9813296b343e709f69cad4721a4e75b587b4f8 +size 1547418 diff --git a/data/titaneval/esgenius.parquet b/data/titaneval/esgenius.parquet new file mode 100644 index 000000000..8bcfc5b0a --- /dev/null +++ b/data/titaneval/esgenius.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:048be850ec42630d9c1ac98597c0d1222a2def9e12b1ab3828726e5ed93da465 +size 2156197 diff --git a/data/titaneval/frenchmedmcqa.parquet b/data/titaneval/frenchmedmcqa.parquet new file mode 100644 index 000000000..c9efcd7b2 --- /dev/null +++ b/data/titaneval/frenchmedmcqa.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8347b970c914118f10293c6255d09ff10f3c5f6983f6b7af2aec4651f9ef0f8e +size 692547 diff --git a/data/titaneval/geobench.parquet b/data/titaneval/geobench.parquet new file mode 100644 index 000000000..9c8c97e37 --- /dev/null +++ b/data/titaneval/geobench.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b0ff5c41b5f1947413a632e11ea060f6a8dcf77fb37ba3fb12aefd177b32b114 +size 240640 diff --git a/data/titaneval/headqa.parquet b/data/titaneval/headqa.parquet new file mode 100644 index 000000000..d9fb4202e --- /dev/null +++ b/data/titaneval/headqa.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4809ec8ed0fa1b17f36542486d8593af6ee85d92212be40218c98653ee7a4687 +size 3129133 diff --git a/data/titaneval/jecqa.parquet b/data/titaneval/jecqa.parquet new file mode 100644 index 000000000..009bd4f5a --- /dev/null +++ b/data/titaneval/jecqa.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b158d449d187dcf58a3da32e1918f438cc77c32f9fbccad482d0c2c69bc48ae +size 1126051 diff --git a/data/titaneval/lexeval.parquet b/data/titaneval/lexeval.parquet new file mode 100644 index 000000000..3adf48b2e --- /dev/null +++ b/data/titaneval/lexeval.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35a3243bc682ccc3d7a01f6a5c3625e1ed08dc82223f518fca3bd6b6f8c43aec +size 8383762 diff --git a/data/titaneval/med_qa.bak b/data/titaneval/med_qa.bak new file mode 100644 index 000000000..f55bf4313 Binary files /dev/null and b/data/titaneval/med_qa.bak differ diff --git a/data/titaneval/med_qa.parquet b/data/titaneval/med_qa.parquet new file mode 100644 index 000000000..625931986 --- /dev/null +++ b/data/titaneval/med_qa.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b57906cb75851cec4346d1264778c8cc0607a5bebafb3c4997caf1646d9520c9 +size 6094281 diff --git a/data/titaneval/mediqal.parquet b/data/titaneval/mediqal.parquet new file mode 100644 index 000000000..7cea2c620 --- /dev/null +++ b/data/titaneval/mediqal.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:332eb15d841ad60540f0738a6a1136d9133d552f9b7dd5c8e95fa369d837eab7 +size 9999318 diff --git a/data/titaneval/preflight.parquet b/data/titaneval/preflight.parquet new file mode 100644 index 
000000000..48243a469 --- /dev/null +++ b/data/titaneval/preflight.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3e45d42693af51110fe40a4401e2b0325b95127050810c9bac1c56bc866ea25 +size 61282 diff --git a/data/titaneval/xfinbench.parquet b/data/titaneval/xfinbench.parquet new file mode 100644 index 000000000..b4bcae7d5 --- /dev/null +++ b/data/titaneval/xfinbench.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0a3a93b9dfa3c5f1b0f248ecb11107fb51fd985447250e360961ea1ab704995 +size 131528 diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index b707b4fdd..dc7dc9e80 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -63,6 +63,7 @@ ) from lighteval.metrics.normalizations import ( bigbench_normalizer, + harness_triviaqa_normalizer, remove_braces, remove_braces_and_strip, ) @@ -339,6 +340,20 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) + # Normalized EM for open-domain gen QA (lowercase + remove punctuation on both gold and pred). + # Use for free-text fact-retrieval tasks (TriviaQA, WikiFact, PopQA, SQuAD, etc.) + # instead of exact_match, which is case-sensitive and punctuation-sensitive. + qa_em = SampleLevelMetric( + metric_name="em", + sample_level_fn=ExactMatches( + normalize_gold=harness_triviaqa_normalizer, + normalize_pred=harness_triviaqa_normalizer, + strip_strings=True, + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) expr_gold_metric = SampleLevelMetric( metric_name="extractive_match", sample_level_fn=MultilingualExtractiveMatchMetric( @@ -462,6 +477,18 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) + # Normalized F1 for open-domain gen QA (lowercase + remove punctuation on both gold and pred). 
+ qa_f1 = SampleLevelMetric( + metric_name="f1", + sample_level_fn=F1_score( + normalize_gold=harness_triviaqa_normalizer, + normalize_pred=harness_triviaqa_normalizer, + strip_strings=True, + ), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=np.mean, + higher_is_better=True, + ) f1_score_macro = CorpusLevelMetric( metric_name="f1", sample_level_fn=GenerativePreparator(), diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 370e3a249..8764c41f3 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -295,6 +295,133 @@ def _build_split(split: str) -> Dataset: ) +def _load_titaneval_dataset(task_name: str) -> DatasetDict: + """Load a titaneval task from the local parquet copy in data/titaneval/. + + If the parquet is missing for known HF-derived benchmarks (med_qa, headqa, + cybermetric), attempts to auto-download and cache before raising. + """ + data_dir = Path(__file__).parent.parent.parent.parent / "data" / "titaneval" + parquet_path = data_dir / f"{task_name}.parquet" + + if not parquet_path.exists(): + _try_cache_titaneval(task_name, parquet_path) + + if not parquet_path.exists(): + raise FileNotFoundError( + f"titaneval_local: parquet not found at {parquet_path}. " + "Copy it from titaneval-mcq/benchmarks/ into data/titaneval/ " + "or run scripts/cache_broken_hf_to_titaneval.py." 
+ ) + return load_dataset("parquet", data_files={"test": str(parquet_path)}) + + +def _try_cache_titaneval(task_name: str, target: Path) -> None: + """Auto-download known HF datasets if the titaneval parquet doesn't exist.""" + import json + import zipfile + + import pandas as pd + from huggingface_hub import hf_hub_download + + COLUMNS = ["benchmark", "domain", "question", "choices", "answer_index", "answer_text"] + + def _save(rows, name): + target.parent.mkdir(parents=True, exist_ok=True) + pd.DataFrame(rows, columns=COLUMNS).to_parquet(target, index=False) + + try: + if task_name == "med_qa": + zip_path = hf_hub_download("bigbio/med_qa", "data_clean.zip", repo_type="dataset") + rows = [] + with zipfile.ZipFile(zip_path, "r") as zf: + for arc in [ + "data_clean/questions/US/4_options/phrases_no_exclude_train.jsonl", + "data_clean/questions/US/4_options/phrases_no_exclude_dev.jsonl", + "data_clean/questions/US/4_options/phrases_no_exclude_test.jsonl", + ]: + with zf.open(arc) as fh: + for line in fh.read().decode("utf-8").strip().split("\n"): + obj = json.loads(line) + opts = obj["options"] + labels = sorted(opts.keys()) + rows.append({ + "benchmark": "med_qa", + "domain": obj.get("meta_info", ""), + "question": obj["question"], + "choices": [opts[k] for k in labels], + "answer_index": labels.index(obj["answer_idx"]), + "answer_text": opts[obj["answer_idx"]], + }) + _save(rows, "med_qa") + logger.info(f"Auto-cached {len(rows)} rows → {target}") + + elif task_name == "headqa": + rows = [] + for lang in ("en", "es"): + data_files = { + s: f"https://huggingface.co/datasets/EleutherAI/headqa/resolve/main/{lang}/{s}.parquet" + for s in ("train", "validation", "test") + } + ds = load_dataset("parquet", data_files=data_files) + for split_ds in ds.values(): + for item in split_ds: + answers = item["answers"] + choices = [a["atext"] for a in answers] + aid = item["ra"] + idx = next((i for i, a in enumerate(answers) if a["aid"] == aid), -1) + if idx < 0: + continue + 
rows.append({ + "benchmark": "headqa", + "domain": f"{item.get('category', '')}/{lang}", + "question": item["qtext"], + "choices": choices, + "answer_index": idx, + "answer_text": choices[idx], + }) + _save(rows, "headqa") + logger.info(f"Auto-cached {len(rows)} rows → {target}") + + elif task_name == "cybermetric": + rows = [] + for fname in [ + "CyberMetric-10000-v1.json", + "CyberMetric-2000-v1.json", + "CyberMetric-500-v1.json", + "CyberMetric-80-v1.json", + ]: + local = hf_hub_download("tihanyin/CyberMetric", fname, repo_type="dataset") + with open(local) as fh: + data = json.load(fh) + for q in data.get("questions", []): + answers = q.get("answers", {}) + labels = sorted(answers.keys()) + sol = q.get("solution") or q.get("correct_solution", "") + idx = labels.index(sol) if sol in labels else -1 + if idx < 0: + continue + rows.append({ + "benchmark": "cybermetric", + "domain": "cybersecurity", + "question": q["question"], + "choices": [answers[k] for k in labels], + "answer_index": idx, + "answer_text": answers[sol], + }) + if len(rows) >= 10000: + break + _save(rows, "cybermetric") + logger.info(f"Auto-cached {len(rows)} rows → {target}") + + except Exception: + logger.warning( + f"Auto-cache failed for '{task_name}'. " + "Run scripts/cache_broken_hf_to_titaneval.py manually.", + exc_info=True, + ) + + @dataclass class LightevalTaskConfig: """Configuration dataclass for a LightevalTask. 
@@ -764,6 +891,12 @@ def download_dataset_worker( dataset = dataset.filter(task.dataset_filter) return dataset # type: ignore + if task.dataset_path == "titaneval_local": + dataset = _load_titaneval_dataset(task.dataset_config_name) + if task.dataset_filter is not None: + dataset = dataset.filter(task.dataset_filter) + return dataset # type: ignore + try: dataset = load_dataset( path=task.dataset_path, @@ -804,6 +937,12 @@ def download_dataset_worker( dataset = dataset.filter(task.dataset_filter) return dataset # type: ignore + if task.dataset_path == "titaneval_local": + dataset = _load_titaneval_dataset(task.dataset_config_name) + if task.dataset_filter is not None: + dataset = dataset.filter(task.dataset_filter) + return dataset # type: ignore + if _is_script_err: dataset = _load_hub_raw_dataset_files( dataset_path=task.dataset_path, diff --git a/src/lighteval/tasks/multilingual/tasks/aecbench.py b/src/lighteval/tasks/multilingual/tasks/aecbench.py new file mode 100644 index 000000000..a4d7774ce --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/aecbench.py @@ -0,0 +1,66 @@ +""" +name: +AECBench Architectural Engineering QA + +dataset: +titaneval_local (local parquet — data/titaneval/aecbench.parquet) + +abstract: +Chinese architectural and civil engineering multiple-choice benchmark. 6,386 +questions from TitanEval-MCQ, 0-shot. 
+ +languages: +chinese + +tags: +engineering, multiple-choice, qa, chinese +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _adapter(line): + return { + "question": line["question"], + "choices": line["choices"], + "gold_idx": line["answer_index"], + } + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"aecbench:{suffix}", + prompt_function=get_mcq_prompt_function(Language.CHINESE, _adapter, formulation=formulation), + hf_repo="titaneval_local", + hf_subset="aecbench", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/agrieval.py b/src/lighteval/tasks/multilingual/tasks/agrieval.py new file mode 100644 index 000000000..a69ba77b4 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/agrieval.py @@ -0,0 +1,88 @@ +""" +name: +AgriEval + CROP + +datasets: +PaperHarvester/AgriEval +AI4Agr/CROP-benchmark + +abstract: +Chinese agricultural knowledge benchmarks. 
AgriEval covers plant science, +animal production, and related domains (variable 2–7 choices). CROP covers +crop science with 4-choice questions. Both are Chinese-language. + +Single-language Chinese tasks — no language suffix in task name. + +languages: +chinese + +tags: +agriculture, knowledge, multilingual, multiple-choice, qa + +paper: +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _agrieval_adapter(line): + opts = line["options"] + keys = sorted(opts.keys()) + choices = [opts[k] for k in keys] + gold = keys.index(line["answer"]) + return {"question": line["question"], "choices": choices, "gold_idx": gold} + + +def _crop_adapter(line): + choices = [line[f"Option {l}"] for l in "ABCD"] + gold = list("ABCD").index(line["Answer"]) + return {"question": line["Question"], "choices": choices, "gold_idx": gold} + + +def _configs(name, repo, subset, split, adapter, fs_select): + return [ + LightevalTaskConfig( + name=f"{name}:{suffix}", + prompt_function=get_mcq_prompt_function(Language.CHINESE, adapter, formulation=formulation), + hf_repo=repo, + hf_subset=subset, + hf_avail_splits=[split], + evaluation_splits=[split], + few_shots_split=split, + few_shots_select=fs_select, + # single-choice only; AgriEval has 单选/多选 + hf_filter=(lambda line: line.get("question_type") == "单选") if name == "agrieval" else None, + 
generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, formulation, metrics, gen in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] + ] + + +TASKS_TABLE = ( + _configs("agrieval", "PaperHarvester/AgriEval", "default", "train", _agrieval_adapter, "random_sampling") + + _configs("crop", "AI4Agr/CROP-benchmark", "default", "test", _crop_adapter, "random_sampling") +) diff --git a/src/lighteval/tasks/multilingual/tasks/arabicmmlu.py b/src/lighteval/tasks/multilingual/tasks/arabicmmlu.py new file mode 100644 index 000000000..7bcd4c718 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/arabicmmlu.py @@ -0,0 +1,66 @@ +""" +name: +ArabicMMLU Arabic MMLU + +dataset: +titaneval_local (local parquet — data/titaneval/arabicmmlu.parquet) + +abstract: +Arabic MMLU multiple-choice benchmark covering diverse academic subjects. +14,455 questions from TitanEval-MCQ, 0-shot. 
+ +languages: +arabic + +tags: +general, multiple-choice, qa, arabic +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _adapter(line): + return { + "question": line["question"], + "choices": line["choices"], + "gold_idx": line["answer_index"], + } + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"arabicmmlu:{suffix}", + prompt_function=get_mcq_prompt_function(Language.ARABIC, _adapter, formulation=formulation), + hf_repo="titaneval_local", + hf_subset="arabicmmlu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/arastem.py b/src/lighteval/tasks/multilingual/tasks/arastem.py new file mode 100644 index 000000000..4763293c3 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/arastem.py @@ -0,0 +1,66 @@ +""" +name: +AraSTeM Arabic STEM QA + +dataset: +titaneval_local (local parquet — data/titaneval/arastem.parquet) + +abstract: +Arabic STEM multiple-choice benchmark covering science, 
technology, engineering, +and math. 10,819 questions from TitanEval-MCQ, 0-shot. + +languages: +arabic + +tags: +stem, multiple-choice, qa, arabic +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _adapter(line): + return { + "question": line["question"], + "choices": line["choices"], + "gold_idx": line["answer_index"], + } + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"arastem:{suffix}", + prompt_function=get_mcq_prompt_function(Language.ARABIC, _adapter, formulation=formulation), + hf_repo="titaneval_local", + hf_subset="arastem", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/camb.py b/src/lighteval/tasks/multilingual/tasks/camb.py new file mode 100644 index 000000000..dfdfd3987 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/camb.py @@ -0,0 +1,66 @@ +""" +name: +CAMB Civil Aviation Maintenance QA + +dataset: +titaneval_local (local parquet — data/titaneval/camb.parquet) + 
+abstract: +Chinese civil aviation maintenance multiple-choice benchmark. 7,969 questions +from TitanEval-MCQ, 0-shot. + +languages: +chinese + +tags: +aviation, multiple-choice, qa, chinese +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _adapter(line): + return { + "question": line["question"], + "choices": line["choices"], + "gold_idx": line["answer_index"], + } + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"camb:{suffix}", + prompt_function=get_mcq_prompt_function(Language.CHINESE, _adapter, formulation=formulation), + hf_repo="titaneval_local", + hf_subset="camb", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/cmmlu.py b/src/lighteval/tasks/multilingual/tasks/cmmlu.py index 566fad0f2..0a62a04ad 100644 --- a/src/lighteval/tasks/multilingual/tasks/cmmlu.py +++ b/src/lighteval/tasks/multilingual/tasks/cmmlu.py @@ -6,7 +6,8 @@ haonan-li/cmmlu abstract: -Cmmlu multilingual benchmark. 
+CMMLU is a comprehensive Chinese evaluation benchmark covering 67 topics from +basic to advanced professional level. Named after the Chinese MMLU. languages: chinese @@ -15,125 +16,78 @@ knowledge, multilingual, multiple-choice paper: +https://arxiv.org/abs/2307.09009 """ from string import ascii_uppercase -from lighteval.metrics.dynamic_metrics import ( - LogLikelihoodAccMetric, -) -from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation from lighteval.tasks.templates.multichoice import get_mcq_prompt_function -from lighteval.tasks.templates.utils.formulation import ( - CFFormulation, - HybridFormulation, - MCFFormulation, -) +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation from lighteval.utils.language import Language +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] CMMLU_SUBSETS = [ - "agronomy", - "anatomy", - "ancient_chinese", - "arts", - "astronomy", - "business_ethics", - "chinese_civil_service_exam", - "chinese_driving_rule", - "chinese_food_culture", - "chinese_foreign_policy", - "chinese_history", - "chinese_literature", - "chinese_teacher_qualification", - "clinical_knowledge", - "college_actuarial_science", - "college_education", - "college_engineering_hydrology", - "college_law", - "college_mathematics", - "college_medical_statistics", - "college_medicine", - "computer_science", - "computer_security", - "conceptual_physics", - 
"construction_project_management", - "economics", - "education", - "electrical_engineering", - "elementary_chinese", - "elementary_commonsense", - "elementary_information_and_technology", - "elementary_mathematics", - "ethnology", - "food_science", - "genetics", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_geography", - "high_school_mathematics", - "high_school_physics", - "high_school_politics", - "human_sexuality", - "international_law", - "journalism", - "jurisprudence", - "legal_and_moral_basis", - "logical", - "machine_learning", - "management", - "marketing", - "marxist_theory", - "modern_chinese", - "nutrition", - "philosophy", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_study", - "sociology", - "sports_science", - "traditional_chinese_medicine", - "virology", - "world_history", - "world_religions", + "agronomy", "anatomy", "ancient_chinese", "arts", "astronomy", + "business_ethics", "chinese_civil_service_exam", "chinese_driving_rule", + "chinese_food_culture", "chinese_foreign_policy", "chinese_history", + "chinese_literature", "chinese_teacher_qualification", "clinical_knowledge", + "college_actuarial_science", "college_education", "college_engineering_hydrology", + "college_law", "college_mathematics", "college_medical_statistics", + "college_medicine", "computer_science", "computer_security", "conceptual_physics", + "construction_project_management", "economics", "education", + "electrical_engineering", "elementary_chinese", "elementary_commonsense", + "elementary_information_and_technology", "elementary_mathematics", "ethnology", + "food_science", "genetics", "global_facts", "high_school_biology", + "high_school_chemistry", "high_school_geography", "high_school_mathematics", + "high_school_physics", "high_school_politics", "human_sexuality", + "international_law", "journalism", "jurisprudence", 
"legal_and_moral_basis", + "logical", "machine_learning", "management", "marketing", "marxist_theory", + "modern_chinese", "nutrition", "philosophy", "professional_accounting", + "professional_law", "professional_medicine", "professional_psychology", + "public_relations", "security_study", "sociology", "sports_science", + "traditional_chinese_medicine", "virology", "world_history", "world_religions", ] +def _adapter(line): + return { + "question": line["Question"], + "choices": [line["A"], line["B"], line["C"], line["D"]], + "gold_idx": ascii_uppercase.index(line["Answer"]), + } + + TASKS_TABLE = [ LightevalTaskConfig( - name=f"cmmlu_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - Language.CHINESE, - lambda line: { - "question": line["Question"], - "choices": [line["A"], line["B"], line["C"], line["D"]], - "gold_idx": ascii_uppercase.index(line["Answer"]), - }, - formulation=formulation, - ), + name=f"cmmlu:{subset}:{suffix}", + prompt_function=get_mcq_prompt_function(Language.CHINESE, _adapter, formulation=formulation), hf_repo="haonan-li/cmmlu", hf_subset=subset, evaluation_splits=("test",), few_shots_split="dev", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=1, ) for subset in CMMLU_SUBSETS - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), ] ] diff --git a/src/lighteval/tasks/multilingual/tasks/exams.py b/src/lighteval/tasks/multilingual/tasks/exams.py index 28e40e989..8a12d6ada 100644 --- 
a/src/lighteval/tasks/multilingual/tasks/exams.py +++ b/src/lighteval/tasks/multilingual/tasks/exams.py @@ -6,7 +6,8 @@ mhardalov/exams abstract: -Exams multilingual benchmark. +EXAMS is a multilingual benchmark for school-level subject knowledge across +16 languages and multiple subjects per language. languages: albanian, arabic, bulgarian, croatian, french, german, hungarian, italian, @@ -17,177 +18,90 @@ knowledge, multilingual, multiple-choice paper: +https://arxiv.org/abs/2011.03080 """ -from functools import partial - from langcodes import Language as LangCodeLanguage from langcodes import standardize_tag -from lighteval.metrics.dynamic_metrics import ( - LogLikelihoodAccMetric, -) -from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset from lighteval.tasks.templates.multichoice import get_mcq_prompt_function -from lighteval.tasks.templates.utils.formulation import ( - CFFormulation, - HybridFormulation, - MCFFormulation, -) +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation from lighteval.utils.language import Language +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + +# Languages with EXAMS data; all subjects for each language are aggregated. 
+_LANGUAGES = [ + Language.ARABIC, + Language.BULGARIAN, + Language.CROATIAN, + Language.HUNGARIAN, + Language.ITALIAN, + Language.SERBIAN, + Language.FRENCH, + Language.GERMAN, + Language.SPANISH, + Language.LITHUANIAN, + Language.ALBANIAN, + Language.MACEDONIAN, + Language.TURKISH, + Language.POLISH, + Language.PORTUGUESE, + Language.VIETNAMESE, +] + + +def _lang_name(language: Language) -> str: + return LangCodeLanguage(standardize_tag(language.value)).language_name() + + +def _make_filter(language: Language): + lang_name = _lang_name(language) + return lambda line: ( + line["answerKey"] != "@" + and line["info"]["language"] == lang_name + ) + -exams_subjects_by_lang: dict[Language, set[str]] = { - Language.ARABIC: {"Biology", "Islamic Studies", "Physics", "Science", "Social"}, - Language.BULGARIAN: {"Biology", "Chemistry", "Geography", "History", "Philosophy", "Physics"}, - Language.CROATIAN: { - "Biology", - "Chemistry", - "Ethics", - "Fine Arts", - "Geography", - "Geology", - "History", - "Informatics", - "Philosophy", - "Physics", - "Politics", - "Psychology", - "Religion", - "Sociology", - }, - Language.HUNGARIAN: { - "Agriculture", - "Agriculture (Mechanical knowledge)", - "Biology", - "Chemistry", - "Economics", - "Economics & Marketing", - "Economics Basics (Business)", - "Economics Basics (Theoretical)", - "Forestry", - "Geography", - "Landscaping", - "Physics", - "Politics", - "Tourism", - }, - Language.ITALIAN: { - "Biology", - "Chemistry", - "Ethics", - "Geography", - "Geology", - "History", - "Informatics", - "Philosophy", - "Physics", - "Politics", - "Psychology", - "Sociology", - }, - Language.SERBIAN: { - "Biology", - "Chemistry", - "Ethics", - "Geography", - "Geology", - "History", - "Informatics", - "Philosophy", - "Physics", - "Politics", - "Psychology", - "Religion", - "Sociology", - }, - Language.FRENCH: {"Economics", "Economics & Marketing", "Economics Basics (Theoretical)", "Geography", "Physics"}, - Language.GERMAN: { - "Chemistry", - 
"Economics", - "Economics & Marketing", - "Economics Basics (Theoretical)", - "Geography", - "Physics", - "Tourism", - }, - Language.SPANISH: {"Geography", "Physics"}, - Language.LITHUANIAN: {"Geology", "History"}, - Language.ALBANIAN: { - "Biology", - "Business", - "Chemistry", - "Fine Arts", - "History", - "Philosophy", - "Physics", - "Sociology", - }, - Language.MACEDONIAN: { - "Biology", - "Business", - "Chemistry", - "Fine Arts", - "History", - "Philosophy", - "Physics", - "Sociology", - }, - Language.TURKISH: { - "Biology", - "Business", - "Chemistry", - "Geography", - "History", - "Philosophy", - "Physics", - "Sociology", - }, - Language.POLISH: {"Professional"}, - Language.PORTUGUESE: {"Biology", "Economics", "Geology", "Philosophy"}, - Language.VIETNAMESE: {"Biology", "Chemistry", "Citizenship", "Geography", "History", "Physics"}, -} +def _adapter(line): + return { + "question": line["question"]["stem"], + "choices": line["question"]["choices"]["text"], + "gold_idx": line["question"]["choices"]["label"].index(line["answerKey"]), + } TASKS_TABLE = [ LightevalTaskConfig( - name=f"exams_{language.value}_{formulation.name.lower()}:{normalize_subset(subject)}", - prompt_function=get_mcq_prompt_function( - language, - lambda line: { - "question": line["question"]["stem"], - "choices": line["question"]["choices"]["text"], - "gold_idx": line["question"]["choices"]["label"].index(line["answerKey"]), - }, - formulation=formulation, - ), + name=f"exams:{language.value}:{suffix}", + prompt_function=get_mcq_prompt_function(language, _adapter, formulation=formulation), hf_repo="mhardalov/exams", hf_subset="multilingual", - # Weird bug in dataset - hf_filter=partial( - lambda language, subject, line: line["answerKey"] != "@" - and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name() - and line["info"]["subject"] == subject, - language, - subject, - ), + hf_filter=_make_filter(language), evaluation_splits=("test",), 
few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=1, ) - for language in exams_subjects_by_lang.keys() - for subject in exams_subjects_by_lang[language] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), + for language in _LANGUAGES + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), ] ] diff --git a/src/lighteval/tasks/multilingual/tasks/fineval.py b/src/lighteval/tasks/multilingual/tasks/fineval.py new file mode 100644 index 000000000..afdf8acd8 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/fineval.py @@ -0,0 +1,69 @@ +""" +name: +FinEval + +dataset: +SUFE-AIFLM-Lab/FinEval + +abstract: +FinEval is a Chinese financial knowledge benchmark covering finance, economics, +accounting, and professional certificates, drawn from university-level exams. +Single-language Chinese task — no language suffix in task name. 
+ +languages: +chinese + +tags: +finance, knowledge, multilingual, multiple-choice, qa + +paper: +https://arxiv.org/abs/2308.09975 +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _fineval_adapter(line): + choices = [line["A"], line["B"], line["C"], line["D"]] + gold = list("ABCD").index(line["answer"]) + return {"question": line["question"], "choices": choices, "gold_idx": gold} + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"fineval:{suffix}", + prompt_function=get_mcq_prompt_function(Language.CHINESE, _fineval_adapter, formulation=formulation), + hf_repo="SUFE-AIFLM-Lab/FinEval", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="train", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, formulation, metrics, gen in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/frenchmedmcqa.py b/src/lighteval/tasks/multilingual/tasks/frenchmedmcqa.py new file mode 100644 index 000000000..393dd5ab2 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/frenchmedmcqa.py @@ -0,0 +1,66 @@ +""" +name: +FrenchMedMCQA French 
Medical Licensing QA + +dataset: +titaneval_local (local parquet — data/titaneval/frenchmedmcqa.parquet) + +abstract: +French medical licensing exam multiple-choice benchmark. 3,105 questions +from TitanEval-MCQ, 0-shot. + +languages: +french + +tags: +medical, multiple-choice, qa, french +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _adapter(line): + return { + "question": line["question"], + "choices": line["choices"], + "gold_idx": line["answer_index"], + } + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"frenchmedmcqa:{suffix}", + prompt_function=get_mcq_prompt_function(Language.FRENCH, _adapter, formulation=formulation), + hf_repo="titaneval_local", + hf_subset="frenchmedmcqa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/jecqa.py b/src/lighteval/tasks/multilingual/tasks/jecqa.py new file mode 100644 index 000000000..31102c69b --- /dev/null +++ 
b/src/lighteval/tasks/multilingual/tasks/jecqa.py @@ -0,0 +1,66 @@ +""" +name: +JEC-QA Chinese Law Exam + +dataset: +titaneval_local (local parquet — data/titaneval/jecqa.parquet) + +abstract: +Chinese judicial examination multiple-choice benchmark. 1,998 questions +from TitanEval-MCQ, 0-shot. + +languages: +chinese + +tags: +law, multiple-choice, qa, chinese +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _adapter(line): + return { + "question": line["question"], + "choices": line["choices"], + "gold_idx": line["answer_index"], + } + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"jecqa:{suffix}", + prompt_function=get_mcq_prompt_function(Language.CHINESE, _adapter, formulation=formulation), + hf_repo="titaneval_local", + hf_subset="jecqa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/lexeval.py b/src/lighteval/tasks/multilingual/tasks/lexeval.py new file mode 100644 index 
000000000..c3a4d1e85 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/lexeval.py @@ -0,0 +1,66 @@ +""" +name: +LexEval Legal Evaluation QA + +dataset: +titaneval_local (local parquet — data/titaneval/lexeval.parquet) + +abstract: +Legal evaluation multiple-choice benchmark (Chinese). 10,920 questions +from TitanEval-MCQ, 0-shot. + +languages: +chinese + +tags: +law, multiple-choice, qa, chinese +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _adapter(line): + return { + "question": line["question"], + "choices": line["choices"], + "gold_idx": line["answer_index"], + } + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"lexeval:{suffix}", + prompt_function=get_mcq_prompt_function(Language.CHINESE, _adapter, formulation=formulation), + hf_repo="titaneval_local", + hf_subset="lexeval", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/medexpqa.py 
b/src/lighteval/tasks/multilingual/tasks/medexpqa.py new file mode 100644 index 000000000..29de5b3b1 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/medexpqa.py @@ -0,0 +1,86 @@ +""" +name: +MedExpQA + +dataset: +HiTZ/MedExpQA + +abstract: +MedExpQA is a multilingual medical expert QA benchmark based on Spanish +board-exam style questions, with translations to French, Italian, and English. +Each question has 4-5 options with a single correct answer. + +languages: +english, french, italian, spanish + +tags: +medical, multilingual, multiple-choice, qa + +paper: +https://arxiv.org/abs/2307.00099 +""" + +import ast + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + +# (lighteval Language, HF config name) +_LANGUAGES = [ + (Language.SPANISH, "es"), + (Language.FRENCH, "fr"), + (Language.ITALIAN, "it"), + (Language.ENGLISH, "en"), +] + + +def _make_adapter(): + def adapter(line): + opts = ast.literal_eval(line["options"]) if isinstance(line["options"], str) else line["options"] + keys = sorted(opts.keys(), key=lambda x: int(x)) + choices = [opts[k] for k in keys] + gold = keys.index(str(int(line["correct_option"]))) + return {"question": line["full_question"], "choices": choices, "gold_idx": gold} + return adapter + + +_adapter = _make_adapter() + +TASKS_TABLE = [ + LightevalTaskConfig( + 
name=f"medexpqa:{language.value}:{suffix}", + prompt_function=get_mcq_prompt_function(language, _adapter, formulation=formulation), + hf_repo="HiTZ/MedExpQA", + hf_subset=hf_config, + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for language, hf_config in _LANGUAGES + for suffix, formulation, metrics, gen in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/mediqal.py b/src/lighteval/tasks/multilingual/tasks/mediqal.py new file mode 100644 index 000000000..47bab111f --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mediqal.py @@ -0,0 +1,66 @@ +""" +name: +MediQAL French Medical QA + +dataset: +titaneval_local (local parquet — data/titaneval/mediqal.parquet) + +abstract: +French medical QA multiple-choice benchmark. 27,634 questions +from TitanEval-MCQ, 0-shot. 
+ +languages: +french + +tags: +medical, multiple-choice, qa, french +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation +from lighteval.utils.language import Language + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _adapter(line): + return { + "question": line["question"], + "choices": line["choices"], + "gold_idx": line["answer_index"], + } + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mediqal:{suffix}", + prompt_function=get_mcq_prompt_function(Language.FRENCH, _adapter, formulation=formulation), + hf_repo="titaneval_local", + hf_subset="mediqal", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py b/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py index 410268f9e..b8ce1f301 100644 --- a/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py +++ b/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py @@ -6,7 +6,8 @@ AYueksel/TurkishMMLU abstract: -Turkish Mmlu multilingual benchmark. 
+TurkishMMLU is a Turkish-language multiple-choice benchmark modelled after +MMLU, covering 9 school subjects. languages: turkish @@ -19,22 +20,27 @@ from string import ascii_uppercase -from lighteval.metrics.dynamic_metrics import ( - LogLikelihoodAccMetric, -) -from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset from lighteval.tasks.templates.multichoice import get_mcq_prompt_function -from lighteval.tasks.templates.utils.formulation import ( - CFFormulation, - HybridFormulation, - MCFFormulation, -) +from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation from lighteval.utils.language import Language +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] -TURKISH_MMLU_SUBSET = [ +# HF subset names (capitalized); task names use lowercase. 
+TURKISH_MMLU_SUBSETS = [ "Biology", "Chemistry", "Geography", @@ -47,35 +53,31 @@ ] +def _adapter(line): + return { + "question": line["question"], + "choices": line["choices"], + "gold_idx": ascii_uppercase.index(line["answer"]), + } + + TASKS_TABLE = [ LightevalTaskConfig( - name=f"mmlu_{Language.TURKISH.value}_{formulation.name.lower()}:{normalize_subset(subset)}", - prompt_function=get_mcq_prompt_function( - Language.TURKISH, - lambda line: { - "question": line["question"], - "choices": line["choices"], - "gold_idx": ascii_uppercase.index(line["answer"]), - }, - formulation=formulation, - ), + name=f"turkishmmlu:{subset.lower()}:{suffix}", + prompt_function=get_mcq_prompt_function(Language.TURKISH, _adapter, formulation=formulation), hf_repo="AYueksel/TurkishMMLU", hf_subset=subset, evaluation_splits=("test",), few_shots_split="dev", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), + generation_size=gen_size, + metrics=metrics, + stop_sequence=["\n"], + version=1, ) - for subset in TURKISH_MMLU_SUBSET - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), + for subset in TURKISH_MMLU_SUBSETS + for suffix, formulation, metrics, gen_size in [ + ("cf", CFFormulation(), _CF_METRICS, -1), + ("mcf", MCFFormulation(), _MCF_METRICS, -1), + ("mcf_em", MCFFormulation(), [Metrics.exact_match], 1), ] ] diff --git a/src/lighteval/tasks/tasks/boolq.py b/src/lighteval/tasks/tasks/boolq.py index f927f9491..115915cd6 100644 --- a/src/lighteval/tasks/tasks/boolq.py +++ b/src/lighteval/tasks/tasks/boolq.py @@ -18,102 +18,94 @@ https://arxiv.org/abs/1905.11946 """ -from string import ascii_uppercase - -from inspect_ai.dataset import Sample -from inspect_ai.scorer import choice -from inspect_ai.solver import multiple_choice - +from 
lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _question(line) -> str: + q = line["question"] + return q[:-1] if q.endswith("??") else q -def boolq_prompt(line, task_name: str = None): - question = line["question"][:-1] if line["question"][-2:] == "??" else line["question"] + +def _gold(line) -> int: + return 0 if line["answer"] == "Yes" else 1 + + +def boolq_cf_prompt(line, task_name: str = None): + """CF variant: score full answer text via logprobs.""" return Doc( task_name=task_name, - query=f"Passage: {line['passage']}\nQuestion: {question}\nAnswer:", + query=f"Passage: {line['passage']}\nQuestion: {_question(line)}\nAnswer:", choices=[" Yes", " No"], - gold_index=["Yes", "No"].index(line["answer"]), + gold_index=_gold(line), ) -def boolq_contrastset_prompt(line, task_name: str = None): - if line["contrast_inputs"] in [None, ""]: - return boolq_prompt(line) - - return [ - Doc( - task_name=task_name, - query=f"{passage}\nQuestion: {question}\nAnswer:", - choices=["Yes", "No"], - gold_index=["No", "Yes"].index(line["answer"]), - ) - for passage, question in zip(line["contrast_inputs"]["passage"], line["contrast_inputs"]["question"]) - ][0] - - -def record_to_sample(record): - choices = ["Yes", "No"] - query = f"{record['passage']}\n{record['question']}" - target = ascii_uppercase[choices.index(record["answer"])] - return Sample(input=query, target=target, choices=choices) - - -def record_to_sample_contrastset(record): - if record["contrast_inputs"] in [None, ""]: - return 
record_to_sample(record) - - choices = ["Yes", "No"] - query = f"{record['contrast_inputs']['passage']}\n{record['contrast_inputs']['question']}" - target = ascii_uppercase[choices.index(record["answer"])] - - return Sample(input=query, target=target, choices=choices) - - -boolq = LightevalTaskConfig( - name="boolq", - prompt_function=boolq_prompt, - hf_repo="lighteval/boolq_helm", - hf_subset="default", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - ], - stop_sequence=["\n"], - version=0, - sample_fields=record_to_sample, - solver=[multiple_choice(cache=True)], - scorer=choice(), -) - - -boolq_contrastset = LightevalTaskConfig( - name="boolq:contrastset", - prompt_function=boolq_contrastset_prompt, - hf_repo="lighteval/boolq_helm", - hf_subset="default", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - ], - stop_sequence=["\n"], - version=0, - sample_fields=record_to_sample_contrastset, - solver=[multiple_choice(cache=True)], - scorer=choice(), -) +def boolq_mcf_prompt(line, task_name: str = None): + """MCF variant: labeled A/B options, score label tokens via logprobs.""" + return Doc( + task_name=task_name, + query=f"Passage: {line['passage']}\nQuestion: {_question(line)}\n A. Yes\n B. 
No\nAnswer:", + choices=[" A", " B"], + gold_index=_gold(line), + ) + TASKS_TABLE = [ - boolq, - boolq_contrastset, + LightevalTaskConfig( + name="boolq:cf", + prompt_function=boolq_cf_prompt, + hf_repo="lighteval/boolq_helm", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=-1, + metrics=_CF_METRICS, + stop_sequence=["\n"], + version=1, + ), + LightevalTaskConfig( + name="boolq:mcf", + prompt_function=boolq_mcf_prompt, + hf_repo="lighteval/boolq_helm", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=-1, + metrics=_MCF_METRICS, + stop_sequence=["\n"], + version=1, + ), + LightevalTaskConfig( + name="boolq:mcf_em", + prompt_function=boolq_mcf_prompt, + hf_repo="lighteval/boolq_helm", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=1, + ), ] diff --git a/src/lighteval/tasks/tasks/chembench.py b/src/lighteval/tasks/tasks/chembench.py new file mode 100644 index 000000000..340b70761 --- /dev/null +++ b/src/lighteval/tasks/tasks/chembench.py @@ -0,0 +1,78 @@ +""" +name: +ChemBench Chemistry QA + +dataset: +titaneval_local (local parquet — data/titaneval/chembench.parquet) + +abstract: +Chemistry multiple-choice benchmark covering analytical, organic, and physical +chemistry. 2,542 questions from TitanEval-MCQ, 0-shot. 
+ +languages: +english + +tags: +chemistry, multiple-choice, qa, science +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def chembench_cf_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in line["choices"]], + gold_index=line["answer_index"], + ) + + +def chembench_mcf_prompt(line, task_name: str = None): + choices = line["choices"] + labels = list("ABCDEFGHIJ"[: len(choices)]) + options = "\n".join(f"{l}. 
{c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=line["answer_index"], + ) + + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"chembench:{suffix}", + prompt_function=fn, + hf_repo="titaneval_local", + hf_subset="chembench", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", chembench_cf_prompt, _CF_METRICS, -1), + ("mcf", chembench_mcf_prompt, _MCF_METRICS, -1), + ("mcf_em", chembench_mcf_prompt, [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/tasks/coqa.py b/src/lighteval/tasks/tasks/coqa.py index f1bd3bad6..e1748ea73 100644 --- a/src/lighteval/tasks/tasks/coqa.py +++ b/src/lighteval/tasks/tasks/coqa.py @@ -122,7 +122,7 @@ def coqa_bpb_prompt(line, task_name: str = None): few_shots_select=None, generation_size=50, stop_sequence=["\n\n"], - metrics=[Metrics.f1_score, Metrics.exact_match], + metrics=[Metrics.qa_f1, Metrics.qa_em], version=1, ), ] diff --git a/src/lighteval/tasks/tasks/cybermetric.py b/src/lighteval/tasks/tasks/cybermetric.py new file mode 100644 index 000000000..5bec7d71c --- /dev/null +++ b/src/lighteval/tasks/tasks/cybermetric.py @@ -0,0 +1,204 @@ +""" +name: +CyberMetric + SecQA + +datasets: +tihanyin/CyberMetric +zefang-liu/secqa (secqa_v1, secqa_v2) + +abstract: +Cybersecurity multiple-choice benchmarks. CyberMetric covers cybersecurity +concepts. SecQA (Security QA) consists of expert-written 4-choice security +questions in two versions of increasing difficulty. 
+ +languages: +english + +tags: +cybersecurity, multiple-choice, qa + +paper: +https://arxiv.org/abs/2411.02228 (CyberMetric) +""" + +import ast + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +# ---- CyberMetric ---- + +def _cybermetric_parse(line): + """Parse nested 'questions' field into question/choices/gold.""" + d = line["questions"] + if isinstance(d, str): + d = ast.literal_eval(d) + keys = sorted(d["answers"].keys()) + choices = [d["answers"][k] for k in keys] + answer_key = d.get("correct_solution") or d.get("solution") + gold = keys.index(answer_key) + return d["question"], choices, gold + + +def cybermetric_cf_prompt(line, task_name: str = None): + question, choices, gold = _cybermetric_parse(line) + return Doc( + task_name=task_name, + query=f"Question: {question}\nAnswer:", + choices=[" " + c for c in choices], + gold_index=gold, + ) + + +def cybermetric_mcf_prompt(line, task_name: str = None): + question, choices, gold = _cybermetric_parse(line) + labels = list("ABCD")[: len(choices)] + options = "\n".join(f" {l}. 
{c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {question}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=gold, + ) + + +# ---- SecQA ---- + +def secqa_cf_prompt(line, task_name: str = None): + choices = [line["A"], line["B"], line["C"], line["D"]] + return Doc( + task_name=task_name, + query=f"Question: {line['Question']}\nAnswer:", + choices=[" " + c for c in choices], + gold_index=list("ABCD").index(line["Answer"]), + ) + + +def secqa_mcf_prompt(line, task_name: str = None): + choices = [line["A"], line["B"], line["C"], line["D"]] + options = "\n".join(f" {l}. {c}" for l, c in zip("ABCD", choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['Question']}\n{options}\nAnswer:", + choices=[" A", " B", " C", " D"], + gold_index=list("ABCD").index(line["Answer"]), + ) + + +def _cybermetric_configs(): + return [ + LightevalTaskConfig( + name=f"cybermetric:{suffix}", + prompt_function=fn, + hf_repo="tihanyin/CyberMetric", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="train", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", cybermetric_cf_prompt, _CF_METRICS, -1), + ("mcf", cybermetric_mcf_prompt, _MCF_METRICS, -1), + ("mcf_em", cybermetric_mcf_prompt, [Metrics.exact_match], 1), + ] + ] + + +def _secqa_configs(version: str): + return [ + LightevalTaskConfig( + name=f"secqa:{version}:{suffix}", + prompt_function=fn, + hf_repo="zefang-liu/secqa", + hf_subset=f"secqa_{version}", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", secqa_cf_prompt, _CF_METRICS, -1), + ("mcf", secqa_mcf_prompt, _MCF_METRICS, -1), + 
("mcf_em", secqa_mcf_prompt, [Metrics.exact_match], 1), + ] + ] + + +# ── cybermetric: titaneval_local (cached parquet, pre-flattened) ── + +def _cybermetric_titaneval_cf_prompt(line, task_name: str = None): + """CF variant: completion-style, logprob on full answer text.""" + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in line["choices"]], + gold_index=line["answer_index"], + ) + + +def _cybermetric_titaneval_mcf_prompt(line, task_name: str = None): + """MCF variant: labeled options, score label tokens via logprobs.""" + labels = list("ABCD")[: len(line["choices"])] + options = "\n".join(f" {l}. {c}" for l, c in zip(labels, line["choices"])) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=line["answer_index"], + ) + + +def _cybermetric_titaneval_configs(): + return [ + LightevalTaskConfig( + name=f"cybermetric:{suffix}", + prompt_function=fn, + hf_repo="titaneval_local", + hf_subset="cybermetric", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=1, + ) + for suffix, fn, metrics, gen in [ + ("cf", _cybermetric_titaneval_cf_prompt, _CF_METRICS, -1), + ("mcf", _cybermetric_titaneval_mcf_prompt, _MCF_METRICS, -1), + ("mcf_em", _cybermetric_titaneval_mcf_prompt, [Metrics.exact_match], 1), + ] + ] + + +TASKS_TABLE = ( + _cybermetric_configs() + + _secqa_configs("v1") + + _secqa_configs("v2") + + _cybermetric_titaneval_configs() +) diff --git a/src/lighteval/tasks/tasks/esgenius.py b/src/lighteval/tasks/tasks/esgenius.py new file mode 100644 index 000000000..8d4d5a0e0 --- /dev/null +++ b/src/lighteval/tasks/tasks/esgenius.py @@ -0,0 +1,78 @@ +""" +name: +ESGenius ESG/Sustainability QA + +dataset: +titaneval_local (local parquet — data/titaneval/esgenius.parquet) 
+ +abstract: +ESG (Environmental, Social, Governance) and sustainability multiple-choice +benchmark. 1,136 questions from TitanEval-MCQ, 0-shot. + +languages: +english + +tags: +esg, sustainability, multiple-choice, qa +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def esgenius_cf_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in line["choices"]], + gold_index=line["answer_index"], + ) + + +def esgenius_mcf_prompt(line, task_name: str = None): + choices = line["choices"] + labels = list("ABCDEFGHIJ"[: len(choices)]) + options = "\n".join(f"{l}. 
{c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=line["answer_index"], + ) + + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"esgenius:{suffix}", + prompt_function=fn, + hf_repo="titaneval_local", + hf_subset="esgenius", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", esgenius_cf_prompt, _CF_METRICS, -1), + ("mcf", esgenius_mcf_prompt, _MCF_METRICS, -1), + ("mcf_em", esgenius_mcf_prompt, [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/tasks/formationeval.py b/src/lighteval/tasks/tasks/formationeval.py new file mode 100644 index 000000000..a760c6c98 --- /dev/null +++ b/src/lighteval/tasks/tasks/formationeval.py @@ -0,0 +1,85 @@ +""" +name: +FormationEval + +dataset: +AlmazErmilov/FormationEval + +abstract: +FormationEval is a multiple-choice benchmark for formation evaluation and +petroleum engineering knowledge. 
+ +languages: +english + +tags: +multiple-choice, petroleum, qa + +paper: +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + +_LABELS = list("ABCDE") + + +def formationeval_cf_prompt(line, task_name: str = None): + import ast + choices = ast.literal_eval(line["choices"]) if isinstance(line["choices"], str) else line["choices"] + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in choices], + gold_index=int(line["answer_index"]), + ) + + +def formationeval_mcf_prompt(line, task_name: str = None): + import ast + choices = ast.literal_eval(line["choices"]) if isinstance(line["choices"], str) else line["choices"] + labels = _LABELS[: len(choices)] + options = "\n".join(f" {l}. 
{c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=int(line["answer_index"]), + ) + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"formationeval:{suffix}", + prompt_function=fn, + hf_repo="AlmazErmilov/FormationEval", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", formationeval_cf_prompt, _CF_METRICS, -1), + ("mcf", formationeval_mcf_prompt, _MCF_METRICS, -1), + ("mcf_em", formationeval_mcf_prompt, [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/tasks/geobench.py b/src/lighteval/tasks/tasks/geobench.py new file mode 100644 index 000000000..75d8358b3 --- /dev/null +++ b/src/lighteval/tasks/tasks/geobench.py @@ -0,0 +1,78 @@ +""" +name: +GeoBench Geoscience QA + +dataset: +titaneval_local (local parquet — data/titaneval/geobench.parquet) + +abstract: +Geoscience multiple-choice benchmark. 1,390 questions from TitanEval-MCQ +(originally GeoBench-VLM, text-only subset used here), 0-shot. 
+ +languages: +english + +tags: +geoscience, multiple-choice, qa, science +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def geobench_cf_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in line["choices"]], + gold_index=line["answer_index"], + ) + + +def geobench_mcf_prompt(line, task_name: str = None): + choices = line["choices"] + labels = list("ABCDEFGHIJ"[: len(choices)]) + options = "\n".join(f"{l}. 
{c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=line["answer_index"], + ) + + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"geobench:{suffix}", + prompt_function=fn, + hf_repo="titaneval_local", + hf_subset="geobench", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", geobench_cf_prompt, _CF_METRICS, -1), + ("mcf", geobench_mcf_prompt, _MCF_METRICS, -1), + ("mcf_em", geobench_mcf_prompt, [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/tasks/gpqa.py b/src/lighteval/tasks/tasks/gpqa.py index 88a31c97c..31b955c66 100644 --- a/src/lighteval/tasks/tasks/gpqa.py +++ b/src/lighteval/tasks/tasks/gpqa.py @@ -23,6 +23,7 @@ https://arxiv.org/abs/2311.12022 """ +import hashlib import random from string import ascii_uppercase @@ -30,10 +31,38 @@ from inspect_ai.scorer import choice from inspect_ai.solver import multiple_choice +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _stable_choices(line): + """Deterministic shuffle of [Correct, Incorrect1, Incorrect2, Incorrect3] by question hash.""" + items = [ + line["Correct Answer"], + line["Incorrect Answer 1"], + line["Incorrect Answer 2"], + line["Incorrect Answer 3"], + ] + seed = 
int(hashlib.md5(line["Question"].encode()).hexdigest(), 16) % (2**32) + rng = random.Random(seed) + rng.shuffle(items) + gold = items.index(line["Correct Answer"]) + return items, gold + def record_to_sample(record): gold_index = random.randint(0, 3) @@ -178,9 +207,74 @@ def gpqa_instruct_prompt(line, task_name: str = None): version=0, ) +def gpqa_cf_prompt(line, task_name: str = None): + """CF variant: score full answer texts via logprobs (deterministic shuffle).""" + choices, gold = _stable_choices(line) + return Doc( + task_name=task_name, + query=f"Question: {line['Question'].strip()}\nAnswer:", + choices=[" " + c for c in choices], + gold_index=gold, + ) + + +def gpqa_mcf_prompt(line, task_name: str = None): + """MCF variant: labeled A/B/C/D options, score label tokens via logprobs.""" + choices, gold = _stable_choices(line) + options = "\n".join(f" {l}. {c}" for l, c in zip(ascii_uppercase, choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['Question'].strip()}\n{options}\nAnswer:", + choices=[f" {l}" for l in ascii_uppercase[:4]], + gold_index=gold, + ) + + +_GPQA_COMMON = dict( + hf_repo="Idavidrein/gpqa", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="train", + few_shots_select="random_sampling", + stop_sequence=["\n"], +) + +gpqa_diamond_cf = LightevalTaskConfig( + name="gpqa:diamond:cf", + prompt_function=gpqa_cf_prompt, + hf_subset="gpqa_diamond", + generation_size=-1, + metrics=_CF_METRICS, + version=0, + **_GPQA_COMMON, +) + +gpqa_diamond_mcf = LightevalTaskConfig( + name="gpqa:diamond:mcf", + prompt_function=gpqa_mcf_prompt, + hf_subset="gpqa_diamond", + generation_size=-1, + metrics=_MCF_METRICS, + version=0, + **_GPQA_COMMON, +) + +gpqa_diamond_mcf_em = LightevalTaskConfig( + name="gpqa:diamond:mcf_em", + prompt_function=gpqa_mcf_prompt, + hf_subset="gpqa_diamond", + generation_size=1, + metrics=[Metrics.exact_match], + version=0, + **_GPQA_COMMON, +) + TASKS_TABLE = [ gpqa, 
gpqa_diamond_instruct, gpqa_extended_instruct, gpqa_main_instruct, + gpqa_diamond_cf, + gpqa_diamond_mcf, + gpqa_diamond_mcf_em, ] diff --git a/src/lighteval/tasks/tasks/gsm_plus.py b/src/lighteval/tasks/tasks/gsm_plus.py index 27a1dca27..ddb958e23 100644 --- a/src/lighteval/tasks/tasks/gsm_plus.py +++ b/src/lighteval/tasks/tasks/gsm_plus.py @@ -76,8 +76,8 @@ def gsm_plus_prompt(line, task_name: str = None): hf_subset="default", hf_avail_splits=["test", "testmini"], evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, + few_shots_split="testmini", + few_shots_select="random_sampling", generation_size=512, metrics=[Metrics.expr_gold_metric], stop_sequence=["Question:", "\n####", "####"], diff --git a/src/lighteval/tasks/tasks/gsm_symbolic.py b/src/lighteval/tasks/tasks/gsm_symbolic.py index 464f51e11..e3c3595d8 100644 --- a/src/lighteval/tasks/tasks/gsm_symbolic.py +++ b/src/lighteval/tasks/tasks/gsm_symbolic.py @@ -66,7 +66,7 @@ def gsm_symbolic_prompt(line, task_name: str = None): hf_avail_splits=["test"], evaluation_splits=["test"], few_shots_split="test", - few_shots_select="random_sampling_from_train", + few_shots_select="random_sampling", generation_size=512, metrics=[Metrics.expr_gold_metric], stop_sequence=["Problem:", "\n####", "####"], diff --git a/src/lighteval/tasks/tasks/headqa.py b/src/lighteval/tasks/tasks/headqa.py index 85c025c5f..83f99bd8a 100644 --- a/src/lighteval/tasks/tasks/headqa.py +++ b/src/lighteval/tasks/tasks/headqa.py @@ -22,12 +22,26 @@ https://arxiv.org/abs/1906.04701 """ +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + 
LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + -def headqa_prompt(line, task_name: str = None): +def headqa_cf_prompt(line, task_name: str = None): + """CF variant: score full answer texts via logprobs.""" return Doc( task_name=task_name, query=f"Question: {line['qtext']}\nAnswer:", @@ -36,42 +50,146 @@ def headqa_prompt(line, task_name: str = None): ) -headqa_en = LightevalTaskConfig( - name="headqa:en", - prompt_function=headqa_prompt, - hf_repo="lighteval/headqa_harness", - hf_subset="en", - hf_avail_splits=["train", "test", "validation"], +def headqa_mcf_prompt(line, task_name: str = None): + """MCF variant: labeled options, score label tokens via logprobs.""" + labels = list("ABCDE")[: len(line["answers"])] + options = "\n".join(f" {l}. {a['atext']}" for l, a in zip(labels, line["answers"])) + return Doc( + task_name=task_name, + query=f"Question: {line['qtext']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=int(line["ra"]) - 1, + ) + + +def _configs(lang: str): + return [ + LightevalTaskConfig( + name=f"headqa:{lang}:cf", + prompt_function=headqa_cf_prompt, + hf_repo="lighteval/headqa_harness", + hf_subset=lang, + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=-1, + metrics=_CF_METRICS, + stop_sequence=["\n"], + version=1, + ), + LightevalTaskConfig( + name=f"headqa:{lang}:mcf", + prompt_function=headqa_mcf_prompt, + hf_repo="lighteval/headqa_harness", + hf_subset=lang, + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=-1, + metrics=_MCF_METRICS, + stop_sequence=["\n"], + version=1, + ), + LightevalTaskConfig( + name=f"headqa:{lang}:mcf_em", + prompt_function=headqa_mcf_prompt, + hf_repo="lighteval/headqa_harness", + hf_subset=lang, + 
hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=1, + ), + ] + + +# ── headqa: titaneval_local (cached parquet, en+es combined) ── + +def _headqa_en_filter(row) -> bool: + """Select English-only rows (domain has format 'category/en').""" + domain = row.get("domain") or "" + return domain.endswith("/en") + + +def _headqa_titaneval_cf_prompt(line, task_name: str = None): + """CF variant: completion-style, logprob on full answer text.""" + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in line["choices"]], + gold_index=line["answer_index"], + ) + + +def _headqa_titaneval_mcf_prompt(line, task_name: str = None): + """MCF variant: labeled options, score label tokens via logprobs.""" + labels = list("ABCDE")[: len(line["choices"])] + options = "\n".join(f" {l}. 
{c}" for l, c in zip(labels, line["choices"])) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=line["answer_index"], + ) + + +headqa_titaneval_cf = LightevalTaskConfig( + name="headqa:cf", + prompt_function=_headqa_titaneval_cf_prompt, + hf_repo="titaneval_local", + hf_subset="headqa", + hf_avail_splits=["test"], evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, + few_shots_split="test", + few_shots_select="random_sampling", + hf_filter=_headqa_en_filter, generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - ], + metrics=_CF_METRICS, stop_sequence=["\n"], - version=0, + version=1, ) - -headqa_es = LightevalTaskConfig( - name="headqa:es", - prompt_function=headqa_prompt, - hf_repo="lighteval/headqa_harness", - hf_subset="es", - hf_avail_splits=["train", "test", "validation"], +headqa_titaneval_mcf = LightevalTaskConfig( + name="headqa:mcf", + prompt_function=_headqa_titaneval_mcf_prompt, + hf_repo="titaneval_local", + hf_subset="headqa", + hf_avail_splits=["test"], evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, + few_shots_split="test", + few_shots_select="random_sampling", + hf_filter=_headqa_en_filter, generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - ], + metrics=_MCF_METRICS, stop_sequence=["\n"], - version=0, + version=1, ) +headqa_titaneval_mcf_em = LightevalTaskConfig( + name="headqa:mcf_em", + prompt_function=_headqa_titaneval_mcf_prompt, + hf_repo="titaneval_local", + hf_subset="headqa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + hf_filter=_headqa_en_filter, + generation_size=1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=1, +) + + TASKS_TABLE = [ - headqa_en, - headqa_es, + headqa_titaneval_cf, + headqa_titaneval_mcf, + headqa_titaneval_mcf_em, ] diff --git 
a/src/lighteval/tasks/tasks/jeopardy.py b/src/lighteval/tasks/tasks/jeopardy.py index 67cf40360..21d7ba996 100644 --- a/src/lighteval/tasks/tasks/jeopardy.py +++ b/src/lighteval/tasks/tasks/jeopardy.py @@ -125,7 +125,7 @@ def jeopardy_mc_mcf_prompt(line, task_name: str = None): few_shots_select="random_sampling_from_train", generation_size=50, stop_sequence=["\n\n", "Question:", "Category:"], - metrics=[Metrics.f1_score, Metrics.exact_match], + metrics=[Metrics.qa_f1, Metrics.qa_em], version=0, ), LightevalTaskConfig( diff --git a/src/lighteval/tasks/tasks/labbench.py b/src/lighteval/tasks/tasks/labbench.py new file mode 100644 index 000000000..f37be5480 --- /dev/null +++ b/src/lighteval/tasks/tasks/labbench.py @@ -0,0 +1,99 @@ +""" +name: +LAB-Bench (TableQA) + +dataset: +futurehouse/LAB-Bench (TableQA config) + +abstract: +LAB-Bench is a biology laboratory capabilities benchmark. The TableQA subset +tests reading comprehension of scientific tables with 4-choice MCQ. Note: the +original tables are provided as images; this text-only variant evaluates +without visual context. 
+ +languages: +english + +tags: +biology, multiple-choice, qa, science + +paper: +https://arxiv.org/abs/2407.10362 +""" + +import ast +import hashlib +import random + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _stable_choices(line): + """Deterministic shuffle of [ideal] + distractors to avoid position bias.""" + ideal = line["ideal"] + distractors = ast.literal_eval(line["distractors"]) if isinstance(line["distractors"], str) else line["distractors"] + items = [ideal] + list(distractors) + seed = int(hashlib.md5(line["question"].encode()).hexdigest(), 16) % (2**32) + rng = random.Random(seed) + rng.shuffle(items) + return items, items.index(ideal) + + +def labbench_cf_prompt(line, task_name: str = None): + choices, gold = _stable_choices(line) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in choices], + gold_index=gold, + ) + + +def labbench_mcf_prompt(line, task_name: str = None): + choices, gold = _stable_choices(line) + labels = list("ABCD")[: len(choices)] + options = "\n".join(f" {l}. 
{c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=gold, + ) + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"labbench:{suffix}", + prompt_function=fn, + hf_repo="futurehouse/LAB-Bench", + hf_subset="TableQA", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="train", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", labbench_cf_prompt, _CF_METRICS, -1), + ("mcf", labbench_mcf_prompt, _MCF_METRICS, -1), + ("mcf_em", labbench_mcf_prompt, [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/tasks/mascqa.py b/src/lighteval/tasks/tasks/mascqa.py new file mode 100644 index 000000000..c122c05d5 --- /dev/null +++ b/src/lighteval/tasks/tasks/mascqa.py @@ -0,0 +1,98 @@ +""" +name: +MaScQA + +dataset: +heegyu/mascqa + +abstract: +MaScQA (Materials Science Question Answering) is a multiple-choice benchmark +for materials science knowledge, sourced from standardized exams. Answer choices +are embedded in the question text in (A) / (B) / (C) / (D) format. 
+ +languages: +english + +tags: +materials-science, multiple-choice, qa, science + +paper: +https://arxiv.org/abs/2209.09088 +""" + +import re + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + +_LABELS = list("ABCDE") + + +def _parse(line): + """Extract question and choices from embedded format '... (A) c1 (B) c2 ...'.""" + text = line["questions"].strip() + m = re.search(r'\s*\(A\)', text) + question = text[: m.start()].strip() if m else text + raw = re.findall(r'\(([A-E])\)\s*(.*?)(?=\s*\([A-E]\)|$)', text) + choices = [v.strip() for _, v in raw] + labels = [k for k, _ in raw] + gold = labels.index(line["label"]) if line["label"] in labels else 0 + return question, choices, labels, gold + + +def mascqa_cf_prompt(line, task_name: str = None): + question, choices, _, gold = _parse(line) + return Doc( + task_name=task_name, + query=f"Question: {question}\nAnswer:", + choices=[" " + c for c in choices], + gold_index=gold, + ) + + +def mascqa_mcf_prompt(line, task_name: str = None): + question, choices, labels, gold = _parse(line) + options = "\n".join(f" {l}. 
{c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {question}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=gold, + ) + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mascqa:{suffix}", + prompt_function=fn, + hf_repo="heegyu/mascqa", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", mascqa_cf_prompt, _CF_METRICS, -1), + ("mcf", mascqa_mcf_prompt, _MCF_METRICS, -1), + ("mcf_em", mascqa_mcf_prompt, [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/tasks/math_500.py b/src/lighteval/tasks/tasks/math_500.py index 1ca2e2aa6..5299c8817 100644 --- a/src/lighteval/tasks/tasks/math_500.py +++ b/src/lighteval/tasks/tasks/math_500.py @@ -64,8 +64,8 @@ def record_to_sample(record): hf_subset="default", hf_avail_splits=["test"], evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, + few_shots_split="test", + few_shots_select="random_sampling", generation_size=1024, metrics=[ # Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}), diff --git a/src/lighteval/tasks/tasks/med.py b/src/lighteval/tasks/tasks/med.py index fa4f2f19f..b94568948 100644 --- a/src/lighteval/tasks/tasks/med.py +++ b/src/lighteval/tasks/tasks/med.py @@ -82,6 +82,16 @@ def med_qa_prompt(line, task_name: str = None): ) +def med_qa_cf_prompt(line, task_name: str = None): + """CF variant: completion-style prompt with full answer texts as choices.""" + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + opt["value"] for opt in line["options"]], + gold_index=list(ascii_uppercase).index(line["answer_idx"]), + ) + + def med_qa_mcf_em_prompt(line, task_name: str = None): """Greedy variant: generate 1 token, compare with gold 
letter.""" query = f"Give a letter answer among A, B, C or D.\nQuestion: {line['question']}\n" @@ -187,10 +197,89 @@ def med_mcqa_cf_prompt(line, task_name: str = None): version=0, ) +# ── med_qa: titaneval_local (cached parquet) ── + +def _med_qa_titaneval_cf_prompt(line, task_name: str = None): + """CF variant: completion-style, logprob on full answer text.""" + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in line["choices"]], + gold_index=line["answer_index"], + ) + + +def _med_qa_titaneval_mcf_prompt(line, task_name: str = None): + """MCF variant: labeled A/B/C/D options, score label tokens via logprobs.""" + labels = list(ascii_uppercase)[: len(line["choices"])] + options = "\n".join(f" {l}. {c}" for l, c in zip(labels, line["choices"])) + return Doc( + task_name=task_name, + query=f"Give a letter answer among A, B, C or D.\nQuestion: {line['question']}\n{options}\nAnswer:", + choices=[" " + l for l in labels], + gold_index=line["answer_index"], + instruction="Give a letter answer among A, B, C or D.\n", + ) + + +med_qa_titaneval_cf = LightevalTaskConfig( + name="med_qa:cf", + prompt_function=_med_qa_titaneval_cf_prompt, + hf_repo="titaneval_local", + hf_subset="med_qa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=-1, + metrics=_CF_METRICS, + stop_sequence=["\n"], + version=1, +) + +med_qa_titaneval_mcf = LightevalTaskConfig( + name="med_qa:mcf", + prompt_function=_med_qa_titaneval_mcf_prompt, + hf_repo="titaneval_local", + hf_subset="med_qa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=-1, + metrics=_MCF_METRICS, + stop_sequence=["\n"], + version=1, +) + +med_qa_titaneval_mcf_em = LightevalTaskConfig( + name="med_qa:mcf_em", + prompt_function=_med_qa_titaneval_mcf_prompt, + hf_repo="titaneval_local", + 
hf_subset="med_qa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=1, +) + +# ── original bigbio/med_qa configs (broken: script-based dataset) ── +# med_qa_cf = LightevalTaskConfig( +# name="med_qa:cf", +# prompt_function=med_qa_cf_prompt, +# hf_repo="bigbio/med_qa", +# ... +# ) + TASKS_TABLE = [ med_mcqa_mcf_em, med_mcqa_mcf, med_mcqa_cf, - med_qa_mcf, - med_qa_mcf_em, + med_qa_titaneval_cf, + med_qa_titaneval_mcf, + med_qa_titaneval_mcf_em, ] diff --git a/src/lighteval/tasks/tasks/mmlu_pro.py b/src/lighteval/tasks/tasks/mmlu_pro.py index 2ede01f50..66378cbfb 100644 --- a/src/lighteval/tasks/tasks/mmlu_pro.py +++ b/src/lighteval/tasks/tasks/mmlu_pro.py @@ -88,8 +88,19 @@ def record_to_sample(record): return Sample(input=record["question"], target=record["answer"], choices=record["options"]) -mmlu_pro = LightevalTaskConfig( - name="mmlu_pro", +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + +mmlu_pro_cot = LightevalTaskConfig( + name="mmlu_pro:cot", prompt_function=mmlu_pro_prompt_function, sample_fields=record_to_sample, solver=[multiple_choice(cache=True)], @@ -102,8 +113,8 @@ def record_to_sample(record): metrics=[Metrics.gpqa_instruct_metric], ) -mmlu_pro_mcf = LightevalTaskConfig( - name="mmlu_pro_mcf", +mmlu_pro_mcf_em = LightevalTaskConfig( + name="mmlu_pro:mcf_em", prompt_function=mmlu_pro_mcf_prompt_function, sample_fields=record_to_sample, solver=[multiple_choice(cache=True)], @@ -113,16 +124,28 @@ def record_to_sample(record): hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea", evaluation_splits=("test",), few_shots_split="validation", - metrics=[ - Metrics.exact_match, - # 
Metrics.gpqa_instruct_metric - ], generation_size=1, - stop_sequence=["\n", "\n\n"], + stop_sequence=["\n"], + metrics=[Metrics.exact_match], +) + +mmlu_pro_mcf = LightevalTaskConfig( + name="mmlu_pro:mcf", + prompt_function=mmlu_pro_mcf_prompt_function, + sample_fields=record_to_sample, + solver=[multiple_choice(cache=True)], + scorer=choice(), + hf_repo="TIGER-Lab/MMLU-Pro", + hf_subset="default", + hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea", + evaluation_splits=("test",), + few_shots_split="validation", + generation_size=-1, + metrics=_MCF_METRICS, ) mmlu_pro_cf = LightevalTaskConfig( - name="mmlu_pro_cf", + name="mmlu_pro:cf", prompt_function=mmlu_pro_cf_prompt_function, sample_fields=record_to_sample, solver=[multiple_choice(cache=True)], @@ -132,11 +155,8 @@ def record_to_sample(record): hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea", evaluation_splits=("test",), few_shots_split="validation", - metrics=[ - LogLikelihoodAccMetric(), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - Metrics.target_bits_per_byte, - ], + generation_size=-1, + metrics=_CF_METRICS, ) -TASKS_TABLE = [mmlu_pro, mmlu_pro_mcf, mmlu_pro_cf] +TASKS_TABLE = [mmlu_pro_cot, mmlu_pro_cf, mmlu_pro_mcf, mmlu_pro_mcf_em] diff --git a/src/lighteval/tasks/tasks/natural_questions.py b/src/lighteval/tasks/tasks/natural_questions.py index 5a39fa8ff..b532c51c8 100644 --- a/src/lighteval/tasks/tasks/natural_questions.py +++ b/src/lighteval/tasks/tasks/natural_questions.py @@ -81,7 +81,7 @@ def nq_bpb_prompt(line, task_name: str = None): few_shots_select="random_sampling_from_train", generation_size=50, stop_sequence=["Question:", "Q:", "\n\n"], - metrics=[Metrics.f1_score, Metrics.exact_match], + metrics=[Metrics.qa_f1, Metrics.qa_em], version=1, ), ] diff --git a/src/lighteval/tasks/tasks/popqa.py b/src/lighteval/tasks/tasks/popqa.py index ce44c8de3..11f74f582 100644 --- a/src/lighteval/tasks/tasks/popqa.py +++ b/src/lighteval/tasks/tasks/popqa.py @@ -67,7 +67,7 @@ def 
popqa_bpb_prompt(line, task_name: str = None): few_shots_split="test", few_shots_select="random_sampling", generation_size=8, - metrics=[Metrics.f1_score, Metrics.exact_match], + metrics=[Metrics.qa_f1, Metrics.qa_em], stop_sequence=["\n"], version=1, ), diff --git a/src/lighteval/tasks/tasks/preflight.py b/src/lighteval/tasks/tasks/preflight.py new file mode 100644 index 000000000..b4ea34fba --- /dev/null +++ b/src/lighteval/tasks/tasks/preflight.py @@ -0,0 +1,78 @@ +""" +name: +Preflight Aviation Safety QA + +dataset: +titaneval_local (local parquet — data/titaneval/preflight.parquet) + +abstract: +Aviation safety multiple-choice questions derived from international airport ground +operations manuals and FAA/ICAO regulations. 300 questions, 0-shot. + +languages: +english + +tags: +aviation, multiple-choice, qa, safety +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def preflight_cf_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in line["choices"]], + gold_index=line["answer_index"], + ) + + +def preflight_mcf_prompt(line, task_name: str = None): + choices = line["choices"] + labels = list("ABCDEFGHIJ"[: len(choices)]) + options = "\n".join(f"{l}. 
{c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=line["answer_index"], + ) + + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"preflight:{suffix}", + prompt_function=fn, + hf_repo="titaneval_local", + hf_subset="preflight", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", preflight_cf_prompt, _CF_METRICS, -1), + ("mcf", preflight_mcf_prompt, _MCF_METRICS, -1), + ("mcf_em", preflight_mcf_prompt, [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/tasks/pubmedqa.py b/src/lighteval/tasks/tasks/pubmedqa.py index 326789bb6..17654760a 100644 --- a/src/lighteval/tasks/tasks/pubmedqa.py +++ b/src/lighteval/tasks/tasks/pubmedqa.py @@ -3,10 +3,11 @@ Pubmedqa dataset: -pubmed_qa +qiaojin/PubMedQA abstract: PubMedQA is a dataset for biomedical research question answering. +Each question is answerable with yes, no, or maybe based on a PubMed abstract. 
languages: english @@ -18,37 +19,91 @@ https://pubmedqa.github.io/ """ +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc +_CHOICES = ["yes", "no", "maybe"] +_GOLD = {c: i for i, c in enumerate(_CHOICES)} -def pubmed_qa_prompt(line, task_name: str = None): +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def pubmedqa_cf_prompt(line, task_name: str = None): + """CF variant: score full answer text (yes/no/maybe) via logprobs.""" + ctx = " ".join(line["CONTEXTS"]) if isinstance(line["CONTEXTS"], list) else line["CONTEXTS"] return Doc( task_name=task_name, - query=f"{line['QUESTION']}\n{line['CONTEXTS']}\nAnswer: ", - choices=[line["final_decision"]], - gold_index=0, + query=f"Abstract: {ctx}\nQuestion: {line['QUESTION']}\nAnswer:", + choices=[f" {c}" for c in _CHOICES], + gold_index=_GOLD[line["final_decision"].lower()], ) -pubmedqa = LightevalTaskConfig( - name="pubmedqa", - prompt_function=pubmed_qa_prompt, - hf_repo="pubmed_qa", - hf_subset="pqa_labeled", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.exact_match, - ], - stop_sequence=["\n"], - version=0, -) +def pubmedqa_mcf_prompt(line, task_name: str = None): + """MCF variant: labeled A/B/C options, score label tokens via logprobs.""" + ctx = " ".join(line["CONTEXTS"]) if isinstance(line["CONTEXTS"], list) else line["CONTEXTS"] + options = "\n".join(f" {l}. 
{c}" for l, c in zip("ABC", _CHOICES)) + return Doc( + task_name=task_name, + query=f"Abstract: {ctx}\nQuestion: {line['QUESTION']}\n{options}\nAnswer:", + choices=[" A", " B", " C"], + gold_index=_GOLD[line["final_decision"].lower()], + ) + TASKS_TABLE = [ - pubmedqa, + LightevalTaskConfig( + name="pubmedqa:cf", + prompt_function=pubmedqa_cf_prompt, + hf_repo="qiaojin/PubMedQA", + hf_subset="pqa_labeled", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="train", + few_shots_select="random_sampling", + generation_size=-1, + metrics=_CF_METRICS, + stop_sequence=["\n"], + version=1, + ), + LightevalTaskConfig( + name="pubmedqa:mcf", + prompt_function=pubmedqa_mcf_prompt, + hf_repo="qiaojin/PubMedQA", + hf_subset="pqa_labeled", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="train", + few_shots_select="random_sampling", + generation_size=-1, + metrics=_MCF_METRICS, + stop_sequence=["\n"], + version=1, + ), + LightevalTaskConfig( + name="pubmedqa:mcf_em", + prompt_function=pubmedqa_mcf_prompt, + hf_repo="qiaojin/PubMedQA", + hf_subset="pqa_labeled", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="train", + few_shots_select="random_sampling", + generation_size=1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=1, + ), ] diff --git a/src/lighteval/tasks/tasks/qasc.py b/src/lighteval/tasks/tasks/qasc.py new file mode 100644 index 000000000..bb229f9a0 --- /dev/null +++ b/src/lighteval/tasks/tasks/qasc.py @@ -0,0 +1,110 @@ +""" +name: +Qasc + +dataset: +allenai/qasc + +abstract: +QASC is a question-and-answer dataset that focuses on sentence composition. +It consists of 8-way multiple choice questions requiring combining two facts +from a large corpus to derive an answer. 
+ +languages: +english + +tags: +multiple-choice, qa, reasoning, science + +paper: +https://arxiv.org/abs/1910.11473 +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + +_LABELS = list("ABCDEFGH") + + +def qasc_cf_prompt(line, task_name: str = None): + """CF variant: score full answer texts via logprobs.""" + choices = line["choices"]["text"] + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in choices], + gold_index=_LABELS.index(line["answerKey"]), + ) + + +def qasc_mcf_prompt(line, task_name: str = None): + """MCF variant: labeled A-H options, score label tokens via logprobs.""" + choices = line["choices"]["text"] + labels = _LABELS[: len(choices)] + options = "\n".join(f" {l}. 
{c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=_LABELS.index(line["answerKey"]), + ) + + +TASKS_TABLE = [ + LightevalTaskConfig( + name="qasc:cf", + prompt_function=qasc_cf_prompt, + hf_repo="allenai/qasc", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=-1, + metrics=_CF_METRICS, + stop_sequence=["\n"], + version=0, + ), + LightevalTaskConfig( + name="qasc:mcf", + prompt_function=qasc_mcf_prompt, + hf_repo="allenai/qasc", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=-1, + metrics=_MCF_METRICS, + stop_sequence=["\n"], + version=0, + ), + LightevalTaskConfig( + name="qasc:mcf_em", + prompt_function=qasc_mcf_prompt, + hf_repo="allenai/qasc", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, + ), +] diff --git a/src/lighteval/tasks/tasks/simpleqa.py b/src/lighteval/tasks/tasks/simpleqa.py index 4493fc322..6e7c1628f 100644 --- a/src/lighteval/tasks/tasks/simpleqa.py +++ b/src/lighteval/tasks/tasks/simpleqa.py @@ -31,6 +31,14 @@ from lighteval.tasks.requests import Doc +def record_to_sample(record): + query = record["problem"] + target = record["answer"] + return Sample(input=query, target=target) + + +# ---- Original graded variant (kept for compatibility) ---- + def simpleqa_prompt(line, task_name: str = None): query = f"Question: {line['question']}\n" query += "".join( @@ -45,12 +53,6 @@ def simpleqa_prompt(line, 
task_name: str = None): ) -def record_to_sample(record): - query = record["problem"] - target = record["answer"] - return Sample(input=query, target=target) - - simpleqa = LightevalTaskConfig( name="simpleqa", prompt_function=simpleqa_prompt, @@ -69,6 +71,67 @@ def record_to_sample(record): scorer=model_graded_fact(), ) +# ---- GenQA variants (our convention: gen{em,f1} + decoupled bpb) ---- + +def simpleqa_gen_prompt(line, task_name: str = None): + """GenQA variant: generate short answer, score with F1/EM.""" + answer = line["answer"] + prefix = " " if line.get("__few_shots", False) else "" + return Doc( + task_name=task_name, + query=f"Question: {line['problem']}\nAnswer:", + choices=[f"{prefix}{answer}"], + gold_index=0, + ) + + +def simpleqa_bpb_prompt(line, task_name: str = None): + """BPB variant: score the gold answer continuation.""" + answer = line["answer"] + if not answer: + return None + if not answer[0].isspace(): + answer = " " + answer + return Doc( + task_name=task_name, + query=f"Question: {line['problem']}\nAnswer:", + choices=[answer], + gold_index=0, + ) + + +simpleqa_gen = LightevalTaskConfig( + name="simpleqa:gen", + prompt_function=simpleqa_gen_prompt, + hf_repo="lighteval/SimpleQA", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="few_shot", + few_shots_select="random_sampling", + generation_size=50, + metrics=[Metrics.qa_f1, Metrics.qa_em], + stop_sequence=["\n"], + version=1, +) + +simpleqa_bpb = LightevalTaskConfig( + name="simpleqa:bpb", + prompt_function=simpleqa_bpb_prompt, + hf_repo="lighteval/SimpleQA", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="few_shot", + few_shots_select="random_sampling", + generation_size=-1, + metrics=[Metrics.target_bits_per_byte], + stop_sequence=["\n"], + version=1, +) + TASKS_TABLE = [ simpleqa, + simpleqa_gen, + simpleqa_bpb, ] diff --git a/src/lighteval/tasks/tasks/squad.py 
b/src/lighteval/tasks/tasks/squad.py index 6baf16d6b..ff1ba8acf 100644 --- a/src/lighteval/tasks/tasks/squad.py +++ b/src/lighteval/tasks/tasks/squad.py @@ -82,7 +82,7 @@ def squad_bpb_prompt(line, task_name: str = None): few_shots_select="random_sampling_from_train", generation_size=50, stop_sequence=["Title:", "\n\n"], - metrics=[Metrics.f1_score, Metrics.exact_match], + metrics=[Metrics.qa_f1, Metrics.qa_em], version=1, ), ] diff --git a/src/lighteval/tasks/tasks/squad_v2.py b/src/lighteval/tasks/tasks/squad_v2.py index 6891fb709..2c912843c 100644 --- a/src/lighteval/tasks/tasks/squad_v2.py +++ b/src/lighteval/tasks/tasks/squad_v2.py @@ -79,7 +79,7 @@ def squad_v2_bpb_prompt(line, task_name: str = None): few_shots_split="train", few_shots_select="random_sampling_from_train", generation_size=200, - metrics=[Metrics.f1_score, Metrics.exact_match], + metrics=[Metrics.qa_f1, Metrics.qa_em], stop_sequence=["\n", "Question:", "question:"], version=1, ) diff --git a/src/lighteval/tasks/tasks/swag.py b/src/lighteval/tasks/tasks/swag.py index 67febe8ec..d0f7a4399 100644 --- a/src/lighteval/tasks/tasks/swag.py +++ b/src/lighteval/tasks/tasks/swag.py @@ -25,36 +25,91 @@ https://arxiv.org/abs/1808.05326 """ +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + +_ENDINGS = ["ending0", "ending1", "ending2", "ending3"] + -def swag_prompt(line, task_name: str = None): - choices = [line["ending0"], line["ending1"], line["ending2"], line["ending3"]] +def swag_cf_prompt(line, task_name: str = None): + """CF 
variant: score full completion texts via logprobs.""" + choices = [line[e] for e in _ENDINGS] return Doc( task_name=task_name, query=line["startphrase"], - choices=choices, + choices=[" " + c for c in choices], gold_index=int(line["label"]), ) -swag = LightevalTaskConfig( - name="swag", - prompt_function=swag_prompt, - hf_repo="allenai/swag", - hf_subset="regular", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) +def swag_mcf_prompt(line, task_name: str = None): + """MCF variant: labeled A/B/C/D options, score label tokens via logprobs.""" + choices = [line[e] for e in _ENDINGS] + options = "\n".join(f" {l}. {c}" for l, c in zip("ABCD", choices)) + query = f"{line['startphrase']}\n{options}\nAnswer:" + return Doc( + task_name=task_name, + query=query, + choices=[" A", " B", " C", " D"], + gold_index=int(line["label"]), + ) + TASKS_TABLE = [ - swag, + LightevalTaskConfig( + name="swag:cf", + prompt_function=swag_cf_prompt, + hf_repo="allenai/swag", + hf_subset="regular", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=-1, + metrics=_CF_METRICS, + stop_sequence=["\n"], + version=1, + ), + LightevalTaskConfig( + name="swag:mcf", + prompt_function=swag_mcf_prompt, + hf_repo="allenai/swag", + hf_subset="regular", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=-1, + metrics=_MCF_METRICS, + stop_sequence=["\n"], + version=1, + ), + LightevalTaskConfig( + name="swag:mcf_em", + prompt_function=swag_mcf_prompt, + hf_repo="allenai/swag", + hf_subset="regular", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + 
few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=1, + ), ] diff --git a/src/lighteval/tasks/tasks/teleqna.py b/src/lighteval/tasks/tasks/teleqna.py new file mode 100644 index 000000000..4cf686943 --- /dev/null +++ b/src/lighteval/tasks/tasks/teleqna.py @@ -0,0 +1,90 @@ +""" +name: +TeleQnA + +dataset: +netop/TeleQnA (gated — must be pre-cached) + +abstract: +TeleQnA is a multiple-choice benchmark covering telecommunications standards +from 3GPP, IEEE, and other telecom bodies. + +languages: +english + +tags: +multiple-choice, qa, telecom + +paper: +https://arxiv.org/abs/2310.15051 +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] + +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def _choices(line): + choices = line["choices"] + if isinstance(choices, str): + import ast + choices = ast.literal_eval(choices) + return choices + + +def teleqna_cf_prompt(line, task_name: str = None): + choices = _choices(line) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in choices], + gold_index=int(line["answer"]), + ) + + +def teleqna_mcf_prompt(line, task_name: str = None): + choices = _choices(line) + labels = list("ABCDE")[: len(choices)] + options = "\n".join(f" {l}. 
{c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=int(line["answer"]), + ) + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"teleqna:{suffix}", + prompt_function=fn, + hf_repo="netop/TeleQnA", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", teleqna_cf_prompt, _CF_METRICS, -1), + ("mcf", teleqna_mcf_prompt, _MCF_METRICS, -1), + ("mcf_em", teleqna_mcf_prompt, [Metrics.exact_match], 1), + ] +] diff --git a/src/lighteval/tasks/tasks/triviaqa.py b/src/lighteval/tasks/tasks/triviaqa.py index 94891a09c..4979a5890 100644 --- a/src/lighteval/tasks/tasks/triviaqa.py +++ b/src/lighteval/tasks/tasks/triviaqa.py @@ -76,7 +76,7 @@ def triviaqa_bpb_prompt(line, task_name: str = None): few_shots_split="train", few_shots_select="random_sampling_from_train", generation_size=20, - metrics=[Metrics.f1_score, Metrics.exact_match], + metrics=[Metrics.qa_f1, Metrics.qa_em], stop_sequence=["\n", ".", ","], version=1, ), diff --git a/src/lighteval/tasks/tasks/wikifact.py b/src/lighteval/tasks/tasks/wikifact.py index c22d16852..b5328509d 100644 --- a/src/lighteval/tasks/tasks/wikifact.py +++ b/src/lighteval/tasks/tasks/wikifact.py @@ -146,7 +146,7 @@ def _gen_config(subset: str) -> LightevalTaskConfig: # num_fewshot (not num_fewshot+1), so 5-shot doesn't overflow those tiny pools. 
few_shots_select="random_sampling_from_train", generation_size=8, - metrics=[Metrics.f1_score, Metrics.exact_match], + metrics=[Metrics.qa_f1, Metrics.qa_em], stop_sequence=["\n"], version=1, ) diff --git a/src/lighteval/tasks/tasks/winogrande.py b/src/lighteval/tasks/tasks/winogrande.py index 72c2099c2..9373ad1ca 100644 --- a/src/lighteval/tasks/tasks/winogrande.py +++ b/src/lighteval/tasks/tasks/winogrande.py @@ -164,6 +164,8 @@ def winogrande_rc_prompt(line, task_name: str = None): generation_size=-1, metrics=[ LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, ], stop_sequence=["\n"], version=0, diff --git a/src/lighteval/tasks/tasks/xfinbench.py b/src/lighteval/tasks/tasks/xfinbench.py new file mode 100644 index 000000000..0ccaa5e5b --- /dev/null +++ b/src/lighteval/tasks/tasks/xfinbench.py @@ -0,0 +1,86 @@ +""" +name: +XFinBench Cross-lingual Finance QA + +dataset: +titaneval_local (local parquet — data/titaneval/xfinbench.parquet) + +abstract: +Cross-lingual finance multiple-choice benchmark (English subset). 588 valid +questions from TitanEval-MCQ, 0-shot. 
+ +languages: +english + +tags: +finance, multiple-choice, qa +""" + +from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import LogProbCharNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + +_CF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + Metrics.target_bits_per_byte, +] +_MCF_METRICS = [ + LogLikelihoodAccMetric(), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), +] + + +def xfinbench_cf_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\nAnswer:", + choices=[" " + c for c in line["choices"]], + gold_index=line["answer_index"], + ) + + +def xfinbench_mcf_prompt(line, task_name: str = None): + choices = line["choices"] + labels = list("ABCDEFGHIJ"[: len(choices)]) + options = "\n".join(f"{l}. {c}" for l, c in zip(labels, choices)) + return Doc( + task_name=task_name, + query=f"Question: {line['question']}\n{options}\nAnswer:", + choices=[f" {l}" for l in labels], + gold_index=line["answer_index"], + ) + + +def _valid_row(row) -> bool: + """Skip rows with empty choices (xfinbench has a few invalid rows).""" + choices = row.get("choices") or [] + ans = row.get("answer_index") + return bool(choices) and any(c.strip() for c in choices) and ans is not None and 0 <= ans < len(choices) + + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"xfinbench:{suffix}", + prompt_function=fn, + hf_repo="titaneval_local", + hf_subset="xfinbench", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="random_sampling", + generation_size=gen, + metrics=metrics, + hf_filter=_valid_row, + stop_sequence=["\n"], + version=0, + ) + for suffix, fn, metrics, gen in [ + ("cf", xfinbench_cf_prompt, _CF_METRICS, -1), + ("mcf", xfinbench_mcf_prompt, 
_MCF_METRICS, -1), + ("mcf_em", xfinbench_mcf_prompt, [Metrics.exact_match], 1), + ] +]