diff --git a/.gitignore b/.gitignore
index 77e347cf5..536877328 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ bert.pt.json
 work
 runs
 *.parquet
+!data/titaneval/*.parquet
 *.json
 cache/
 fastspeech_output
diff --git a/TASK_NAMING.md b/TASK_NAMING.md
index 1c20f44a1..1915e6bf6 100644
--- a/TASK_NAMING.md
+++ b/TASK_NAMING.md
@@ -1,292 +1,651 @@
 # Task Naming Conventions
 
-Every task name ends with an explicit metric-type suffix. There are no bare/default names.
+Every task name ends with an explicit metric-type suffix and a shot count: `{task}:{suffix}|{n_shot}`.
 
-## Evaluation Suites
+## Task Overview
 
-### Table 1 — BPB Prepare Suite
+**53 unique English tasks** (`:cf`/`:mcf`/`:gen` variants of the same task count as one):
 
-Tasks used to measure BPB during pre-training. All tasks run in logprob mode (`generation_size=-1`). CF tasks also produce `acc` and `acc_norm` in the same pass.
+| Category | # | Task names |
+|---|---|---|
+| Code BPB (§3) | 3 | `humaneval`, `mbpp`, `mt_mbpp` |
+| Math (§3+§4) | 2 | `math` (BPB+gen), `math_500` |
+| CoT Reasoning (§4) | 5 | `gsm8k`, `gsm_plus`, `gsm_symbolic`, `bigbench_hard`, `agieval_eng_em` |
+| English MC QA (§5) | 31 | `mmlu`, `mmlu_pro`, `arc`, `commonsenseqa`, `siqa`, `piqa`, `sciq`, `hellaswag`, `winogrande`, `swag`, `openbookqa`, `qasc`, `boolq`, `med_mcqa`, `med_qa`, `pubmedqa`, `headqa`, `gpqa`, `jeopardy_mc`, `truthfulqa`, `cybermetric`, `secqa`, `mascqa`, `formationeval`, `teleqna`, `labbench`, `preflight`, `chembench`, `esgenius`, `xfinbench`, `geobench` |
+| English GenQA (§6) | 10 | `coqa`, `drop`, `jeopardy`, `natural_questions`, `squad`, `squad_v2`, `triviaqa`, `popqa`, `wikifact`, `simpleqa` |
+| Lambada & Basic Skills (§7) | 2 | `lambada`, `basic_skills` |
+| **Total** | **53** | |
 
+---
+
+---
+
+## 1. Suffix Reference
+
+| Suffix | `generation_size` | Metrics reported | Description |
+|--------|-------------------|-----------------|-------------|
+| `:cf` | `-1` | `acc`, `acc_norm`, `target_bpb`* | Logprob on full answer texts; BPB merged in |
+| `:mcf` | `-1` | `acc`, `acc_norm` | Logprob on label tokens only (`A`, `B`, …) |
+| `:mcf_em` | `1` | `em` | Greedy-decode the label token, exact match |
+| `:gen` | task-specific | `f1`, `em` (normalized) | Generate free text; scored with normalized F1 + EM |
+| `:bpb` | `-1` | `target_bpb` | Standalone BPB; used for code, math, or decoupled from `:gen` / `:cf` |
+
+\*BPB merged into `:cf` applies to **English MC QA tasks (§5) only**. For lambada and basic_skills (§7), BPB uses a different prompt and is a separate `:bpb` config — `:cf` for those tasks does **not** include BPB.
+
+**Gen tasks**: `:gen` and `:bpb` are separate configs (different prompts). EM and F1 for `:gen` use
+`harness_triviaqa_normalizer` (lowercase + remove punctuation) on both gold and prediction.
+Exception: `drop:gen` uses `Metrics.drop` (span/number/date-aware normalization).
+
+---
+
+## 2. How to Run (CLI)
+
+```bash
+# Single task
+lighteval litellm config.yaml "arc:challenge:cf|5"
+
+# All metric variants of one subset (:cf, :mcf, :mcf_em)
+lighteval litellm config.yaml "arc:challenge|5"
+
+# All subsets of one task + one metric
+lighteval litellm config.yaml "mmlu:cf|5"            # 57 subsets
+lighteval litellm config.yaml "wikifact:gen|5"       # 81 relation subsets
+
+# All subsets × all metrics
+lighteval litellm config.yaml "mmlu|5"               # 57 × 3 = 171 tasks
+lighteval litellm config.yaml "arc|5"                # 2 × 3 = 6 tasks
+
+# Multilingual (requires --load-multilingual flag in runner)
+lighteval litellm config.yaml "global_mmlu:cf|5"
+lighteval litellm config.yaml "mlmm_arc:deu:mcf|5"
 ```
-# Code
+
+---
+
+## 3. Code & Math BPB Tasks
+
+BPB over the gold continuation only (`generation_size=-1`). No accuracy metric.
+
+| Task | Dataset | Eval | FS | ICL |
+|---|---|---|---|---|
+| `humaneval:{lang}:bpb` | `openai/openai_humaneval` | test | — | 3 |
+| `mbpp:bpb` | `google-research-datasets/mbpp` (sanitized) | test | — | 3 |
+| `mt_mbpp:{lang}:bpb` (17) | `allenai/multilingual_mbpp` | test | — | 3 |
+| `math:{subset}:bpb` (7) | `EleutherAI/hendrycks_math` | test | — | 4 |
+
+**MT-MBPP languages (17):** `bash`, `c`, `cpp`, `csharp`, `go`, `haskell`, `java`, `javascript`,
+`matlab`, `php`, `python`, `r`, `ruby`, `rust`, `scala`, `swift`, `typescript`.
+
+**Math subsets (7):** `algebra`, `counting_and_probability`, `geometry`, `intermediate_algebra`,
+`number_theory`, `prealgebra`, `precalculus`.
+
+```bash
 humaneval:bpb|3
 mbpp:bpb|3
-mt_mbpp:bpb|3          # expands to all 17 language subtasks
-
-# Math
-math:algebra:bpb|4
-math:counting_and_probability:bpb|4
-math:geometry:bpb|4
-math:intermediate_algebra:bpb|4
-math:number_theory:bpb|4
-math:prealgebra:bpb|4
-math:precalculus:bpb|4
-
-# QA — CF (BPB merged in)
-arc:easy:cf|5
-arc:challenge:cf|5
-mmlu:cf|5              # expands to all 57 subsets
-commonsenseqa:cf|5
-hellaswag:cf|5
-winogrande:cf|5
-siqa:cf|5
-piqa:cf|5
-sciq:cf|5
-basic_skills:cf|5      # expands to all 6 subsets
-lambada:cf             # 0-shot
-med_mcqa:cf|5
-
-# QA — standalone BPB (no fixed answer choices)
-coqa:bpb               # 0-shot
-drop:bpb|5
-jeopardy:bpb|5
-natural_questions:bpb|5
-squad:bpb|5
+mt_mbpp:bpb|3              # all 17 languages
+mt_mbpp:python:bpb|3       # single language
+math:bpb|4                 # all 7 subsets
+math:algebra:bpb|4         # single subset
 ```
 
 ---
 
-### Table 2 — Full Evaluation Suite
+## 4. Math & CoT Reasoning Tasks
 
-#### Math (CoT generation)
+All tasks in this section generate free text and score with extractive match metrics.
 
-```
-gsm8k|8
+| Task | Dataset | Few-shot split | Rec. ICL | gen_size | Metric |
+|---|---|---|---|---|---|
+| `math:{subset}:gen` (7) | `EleutherAI/hendrycks_math` | `train` | 4 | 1024 | `expr_gold_metric` |
+| `math_500` | `HuggingFaceH4/MATH-500` | `test`¹ | 4 | 1024 | `expr_gold_metric` |
+| `gsm8k` | `openai/gsm8k` | `train` | 8 | 512 | `expr_gold_metric` |
+| `gsm_plus` | `qintongli/GSM-Plus` | `testmini` | 8 | 512 | `expr_gold_metric` |
+| `gsm_symbolic:{main,p1,p2}` | `apple/GSM-Symbolic` | `test`¹ | 8 | 512 | `expr_gold_metric` |
+| `bigbench_hard:{subset}` (27) | `lukaemon/bbh` | `train` | 3 | 1024 | `bbh_cot_exact_match` |
+| `agieval_eng_em:{subset}` (7) | `lighteval/agi_eval_en` | `dev` | 0 | 512 | `gpqa_instruct_metric` |
+
+¹ Test-only datasets (no train split): `math_500` (`HuggingFaceH4/MATH-500`), `gsm_symbolic` (`apple/GSM-Symbolic`). Few-shot examples are drawn from the test pool via random sampling (potential leakage). For leakage-free math few-shot, prefer `math:gen|4` (draws from `hendrycks_math` train) and `gsm8k|8` (draws from GSM8K train).
+
+> **4k context note:** `gsm8k|8` ≈ 3–4k tokens of context (borderline); `math:gen|4` ≈ 4k+ (too long). Use `gsm8k|4` and `math:gen|1` for 4k-ctx models.
+
+**`expr_gold_metric`** — extracts mathematical expressions / LaTeX (including `\boxed{}`) from model output; scores with symbolic equivalence.
+
+**`bbh_cot_exact_match`** — extracts text after "the answer is" from CoT output; exact match.
+
+**`gpqa_instruct_metric`** (AGIEval) — extracts letter choice (A–E) from CoT output.
+
+```bash
+math:gen|4                 # all 7 subsets, 4-shot from train
+math:algebra:gen|4         # single subset
+math_500|4                 # 4-shot drawn from test pool (see ¹)
+gsm8k|8                    # 8-shot from train (standard)
+gsm_plus|8                 # 8-shot from testmini
 gsm_symbolic:main|8
 gsm_symbolic:p1|8
 gsm_symbolic:p2|8
-math:algebra:gen|4
-math:counting_and_probability:gen|4
-math:geometry:gen|4
-math:intermediate_algebra:gen|4
-math:number_theory:gen|4
-math:prealgebra:gen|4
-math:precalculus:gen|4
-math_500
+bigbench_hard|3            # all 27 subsets
+bigbench_hard:boolean_expressions|3
+agieval_eng_em|0           # all 7 subsets
+agieval_eng_em:aqua_rat|0  # single subset
 ```
 
-#### STEM QA + Non-STEM QA (MC)
+**Math subsets (7):** `algebra`, `counting_and_probability`, `geometry`, `intermediate_algebra`,
+`number_theory`, `prealgebra`, `precalculus`.
 
+**AGIEval (English) subsets (7):** `aqua_rat`, `logiqa-en`, `lsat-ar`, `lsat-lr`, `lsat-rc`, `sat-en`, `sat-math`.
+
+---
+
+## 5. English MC QA Tasks
+
+All tasks in this section expose three variants:
+- `:cf|N` → `acc`, `acc_norm`, `target_bpb`
+- `:mcf|N` → `acc`, `acc_norm`
+- `:mcf_em|N` → `em`
+
+Exceptions are noted per task.
+
+### MMLU
+
+| Dataset | Eval | FS | ICL |
+|---|---|---|---|
+| `lighteval/mmlu` | test | dev | 5 |
+
+57 subjects across STEM, Humanities, Social Sciences, Other.
+
+```bash
+mmlu:cf|5                         # all 57 subsets
+mmlu:mcf|5
+mmlu:mcf_em|5
+mmlu:abstract_algebra:cf|5        # single subject
+mmlu|5                            # all 57 × 3 variants
 ```
-arc:easy:mcf|5
-arc:challenge:mcf|5
-mmlu:mcf|5             # expands to all 57 subsets (STEM + Humanities + Social Sci + Other)
-med_mcqa:mcf|5
-med_qa:mcf|5
-sciq:mcf|5
+
+### MMLU-Pro
+
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `TIGER-Lab/MMLU-Pro` | test | validation | 5 | up to 10 options |
+
+```bash
+mmlu_pro:cf|5
+mmlu_pro:mcf|5
+mmlu_pro:mcf_em|5
+mmlu_pro:cot|5                    # chain-of-thought + extractive match (separate config)
+```
+
+### ARC
+
+| Dataset | Eval | FS | ICL |
+|---|---|---|---|
+| `allenai/ai2_arc` | test | train | 5 |
+
+```bash
+arc:cf|5                          # both easy + challenge
+arc:mcf|5
+arc:mcf_em|5
+arc:easy:cf|5                     # single subset
+arc:challenge:mcf_em|5
+```
+
+### CommonsenseQA
+
+| Dataset | Eval | FS | ICL |
+|---|---|---|---|
+| `tau/commonsense_qa` | validation | train | 5 |
+
+```bash
+commonsenseqa:cf|5
 commonsenseqa:mcf|5
-piqa:mcf|5
+commonsenseqa:mcf_em|5
+```
+
+### SIQA
+
+| Dataset | Eval | FS | ICL |
+|---|---|---|---|
+| `lighteval/siqa` | validation | train | 5 |
+
+```bash
+siqa:cf|5
 siqa:mcf|5
-jeopardy_mc:mcf|5
+siqa:mcf_em|5
 ```
 
-#### GenQA / RC (completion/generation)
+### PIQA
+
+| Dataset | Eval | FS | ICL |
+|---|---|---|---|
+| `lighteval/piqa` | validation | train | 5 |
 
+```bash
+piqa:cf|5
+piqa:mcf|5
+piqa:mcf_em|5
 ```
-hellaswag:cf|5          # RC per-char norm
-winogrande:cf|5         # RC unnormalized
-lambada:cf              # RC per-char norm, 0-shot
-basic_skills:cf|5       # RC per-token norm, expands to all 6 subsets
-drop:gen|5
-jeopardy:gen|5
-natural_questions:gen|5
-squad:gen|5
-coqa:gen                # 0-shot
+
+### SciQ
+
+| Dataset | Eval | FS | ICL |
+|---|---|---|---|
+| `allenai/sciq` | test | train | 5 |
+
+```bash
+sciq:cf|5
+sciq:mcf|5
+sciq:mcf_em|5
 ```
 
-#### Held-out Suite
+### HellaSwag
+
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `Rowan/hellaswag` | validation | train | 5 | sentence completion |
 
+```bash
+hellaswag:cf|5
+hellaswag:mcf|5
+hellaswag:mcf_em|5
 ```
-mmlu_pro:mcf|5
-bigbench_hard|3         # expands to all 27 BBH subsets
+
+### WinoGrande
+
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `allenai/winogrande` (xl) | validation | train | 5 | cloze / pronoun resolution |
+
+```bash
+winogrande:cf|5
+winogrande:mcf|5
+winogrande:mcf_em|5
+winogrande:bpb|5             # OLMo-style partial evaluation BPB (separate config)
 ```
 
----
+### SWAG
+
+| Dataset | Eval | FS | ICL |
+|---|---|---|---|
+| `allenai/swag` (regular) | validation | train | 5 |
 
-## Suffix Reference
+```bash
+swag:cf|5
+swag:mcf|5
+swag:mcf_em|5
+```
 
-| Suffix    | Metric type                 | `generation_size`   | Metrics reported                | Description                                                                  |
-| --------- | --------------------------- | ------------------- | ------------------------------- | ---------------------------------------------------------------------------- |
-| `:mcf_em` | Greedy generation (MC only) | `1`–`5`             | `exact_match`                   | MC tasks only: generate label token, compare with EM                         |
-| `:cf`     | Completion formulation      | `-1` (logprob only) | `acc`, `acc_norm`, `target_bpb` | Score full answer text via log p(choice\|context); BPB merged in for MC tasks |
-| `:mcf`    | Multiple-choice formulation | `-1` (logprob only) | `acc`, `acc_norm`               | Score label tokens only (`A`, `B`, …)                                        |
-| `:bpb`    | Bits-per-byte (standalone)  | `-1` (logprob only) | `target_bpb`                    | Used only where CF is not applicable (MATH, free-form GenQA)                 |
-| `:gen`    | Greedy generation + F1/EM   | `50`–`1024`         | `f1`, `em` (task-specific)      | Actual text generation; answer scored with F1 or extractive match            |
+### OpenBookQA
 
-**Note on BPB for multiple-choice tasks**: BPB is **merged into `:cf`** for all MC tasks — running `:cf` produces `{acc, acc_norm, target_bpb}` in one pass. There are no standalone `:bpb` tasks for ARC, MMLU, HellaSwag, etc.
+| Dataset | Eval | FS | ICL |
+|---|---|---|---|
+| `allenai/openbookqa` (main) | test | train | 5 |
 
-## How to reference tasks in CLI
+```bash
+openbookqa:cf|5
+openbookqa:mcf|5
+openbookqa:mcf_em|5
+```
 
-Task names follow the pattern `<task_base>:<subset>:<suffix>` (or `<task_base>:<suffix>` for single-subset tasks).
+### QASC
 
-Use the prefix before the first `:` as a superset to run all subsets at once:
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `allenai/qasc` | validation | train | 5 | 8-choice, requires two facts |
 
+```bash
+qasc:cf|5
+qasc:mcf|5
+qasc:mcf_em|5
 ```
-# Single task
-lighteval litellm config.yaml "arc:challenge:cf|5"
 
-# All ARC variants with CF metric (2 subsets × 1 metric)
-lighteval litellm config.yaml "arc:cf|5"
+### BoolQ
 
-# All metrics for one ARC subset
-lighteval litellm config.yaml "arc:challenge|5"
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `lighteval/boolq_helm` | validation | train | 5 | binary yes/no |
 
-# All MMLU subsets for a given metric (57 subsets)
-lighteval litellm config.yaml "mmlu:cf|5"
-lighteval litellm config.yaml "mmlu:mcf_em|5"
-lighteval litellm config.yaml "mmlu:mcf|5"
+```bash
+boolq:cf|5
+boolq:mcf|5
+boolq:mcf_em|5
+```
 
-# All metrics for one MMLU subset
-lighteval litellm config.yaml "mmlu:abstract_algebra|5"
+### MedMCQA
 
-# All MMLU tasks (all subsets × all metrics)
-lighteval litellm config.yaml "mmlu|5"
+| Dataset | Eval | FS | ICL |
+|---|---|---|---|
+| `lighteval/med_mcqa` | validation | train | 5 |
 
-# All MT-MBPP language subtasks
-lighteval litellm config.yaml "mt_mbpp:bpb|3"
+```bash
+med_mcqa:cf|5
+med_mcqa:mcf|5
+med_mcqa:mcf_em|5
 ```
 
-## Task inventory (CLAUDE.md tasks)
+### MedQA (USMLE)
 
-### Code BPB tasks
+| Dataset | Eval | FS | ICL |
+|---|---|---|---|
+| `bigbio/med_qa` (med_qa_en_source) | test | train | 5 |
 
-| Task | Dataset | Subset | Eval split | ICL | Metric |
-|------|---------|--------|------------|-----|--------|
-| `humaneval:bpb` | `openai/openai_humaneval` | default | test | 3 | `target_bpb` |
-| `mbpp:bpb` | `google-research-datasets/mbpp` | sanitized | test | 3 | `target_bpb` |
-| `mt_mbpp:{lang}:bpb` (17) | `allenai/multilingual_mbpp` | `{lang}` | test | 3 | `target_bpb` |
+```bash
+med_qa:cf|5
+med_qa:mcf|5
+med_qa:mcf_em|5
+```
 
-**MT-MBPP superset**: `mt_mbpp:bpb|3` expands to all 17 language subtasks.
+### PubMedQA
 
-**17 languages**: `bash`, `c`, `cpp`, `csharp`, `go`, `haskell`, `java`, `javascript`, `matlab`, `php`, `python`, `r`, `ruby`, `rust`, `scala`, `swift`, `typescript`.
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `qiaojin/PubMedQA` (pqa_labeled) | train | train | 5 | 3-choice: yes/no/maybe |
 
-### Math
+```bash
+pubmedqa:cf|5
+pubmedqa:mcf|5
+pubmedqa:mcf_em|5
+```
 
-| Task | Dataset | Eval split | ICL | Metric |
-|------|---------|------------|-----|--------|
-| `math:{subset}:bpb` (7) | `EleutherAI/hendrycks_math` | test | 4 | `target_bpb` |
-| `math:{subset}:gen` (7) | `EleutherAI/hendrycks_math` | test | 4 | `expr_gold_metric` |
+### HeadQA
 
-Subsets: `algebra`, `counting_and_probability`, `geometry`, `intermediate_algebra`, `number_theory`, `prealgebra`, `precalculus`.
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `lighteval/headqa_harness` | test | train | 5 | en + es subsets |
 
-### MMLU
+```bash
+headqa:en:cf|5
+headqa:es:cf|5
+headqa:cf|5                  # both subsets
+headqa:mcf|5
+headqa:mcf_em|5
+```
+
+### GPQA (Diamond)
+
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `Idavidrein/gpqa` (gpqa_diamond) | train | train | 0 | gated; choices shuffled by question hash |
+
+```bash
+gpqa:diamond:cf|0
+gpqa:diamond:mcf|0
+gpqa:diamond:mcf_em|0
+gpqa:diamond|0               # all 3 variants
+```
+
+Note: `gpqa:diamond` (instruct CoT, `gpqa_instruct_pass_at_k`) and `gpqa:main` / `gpqa:extended`
+(instruct reasoning) are separate configs for instruction-tuned evaluation.
 
-| Superset | Expands to | # tasks |
-|----------|------------|---------|
-| `mmlu` | all subsets × all metrics | 171 (57×3) |
-| `mmlu:cf` | `mmlu:{subset}:cf` for all 57 subsets | 57 |
-| `mmlu:mcf_em` | `mmlu:{subset}:mcf_em` for all 57 subsets | 57 |
-| `mmlu:mcf` | `mmlu:{subset}:mcf` for all 57 subsets | 57 |
-| `mmlu_redux:cf` | all redux subsets with CF | 57 |
-
-Dataset: `lighteval/mmlu`. Each `:cf` task reports `{acc, acc_norm, target_bpb}`.
-
-### Multiple-choice QA tasks
-
-| Task | Dataset | Eval split | ICL | `:cf` metrics | `:mcf` metrics |
-|------|---------|------------|-----|---------------|----------------|
-| `arc:challenge` / `arc:easy` | `allenai/ai2_arc` | test | 5 | acc, acc_norm, bpb | acc, acc_norm |
-| `commonsenseqa` | `tau/commonsense_qa` | validation | 5 | acc, acc_norm, bpb | acc, acc_norm |
-| `hellaswag` | `Rowan/hellaswag` | validation | 5 | acc, acc_norm, bpb | acc, acc_norm |
-| `winogrande` | `allenai/winogrande` (xl) | validation | 5 | acc, acc_norm, bpb | acc, acc_norm |
-| `siqa` | `lighteval/siqa` | validation | 5 | acc, acc_norm, bpb | acc, acc_norm |
-| `piqa` | `lighteval/piqa` | validation | 5 | acc, acc_norm, bpb | acc, acc_norm |
-| `sciq` | `allenai/sciq` | test | 5 | acc, acc_norm, bpb | acc, acc_norm |
-| `med_mcqa` | `lighteval/med_mcqa` | validation | 5 | acc, acc_norm, bpb | acc, acc_norm |
-| `openbookqa` | `allenai/openbookqa` (main) | test | 5 | acc, acc_norm, bpb | acc, acc_norm |
-| `jeopardy_mc:cf` | `allenai/jeopardy_mc` | test | 0 | acc, acc_norm, bpb | — |
-| `jeopardy_mc:mcf` | `allenai/jeopardy_mc` | test | 0 | — | acc, acc_norm |
-
-Note: `siqa` and `piqa` use `lighteval/*` wrapper repos (same data as `allenai/social_i_qa` / `ybisk/piqa`); both sources require the script fallback in `download_dataset_worker`.
+### Jeopardy MC
+
+OLMo Gen2MC — dedicated MC dataset derived from Jeopardy. For the generative form see `jeopardy` in §6.
+
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `allenai/jeopardy_mc` | test | — | 0 | CF + MCF only (no mcf_em) |
+
+```bash
+jeopardy_mc:cf|0
+jeopardy_mc:mcf|0
+```
 
 ### TruthfulQA (MC2)
 
-| Task | Dataset | Eval split | ICL | Metric |
-|------|---------|------------|-----|--------|
-| `truthfulqa:mc2:cf` | `truthfulqa/truthful_qa` (multiple_choice) | validation | 0 (built-in primer) | `truthfulqa_mc2` (single score, logprob) |
-
-MC2 = normalized probability mass on the set of true answers. Reuses the exact `truthfulqa:mc` prompt + computation but emits only mc2; the two-key `truthfulqa:mc` (`truthfulqa_mc1` + `truthfulqa_mc2`) is also available.
-
-### Single-answer completion tasks (CF only)
-
-| Task | Dataset | Eval split | ICL | Metrics |
-|------|---------|------------|-----|---------|
-| `lambada:cf` | `EleutherAI/lambada_openai` | test | 0 | acc_norm (char), bpb |
-| `basic_skills:{subset}:cf` (6) | `allenai/basic-skills` | validation | 5 | acc_norm (token), bpb |
-
-**Basic Skills subsets**: `arithmetic`, `string_operations`, `coding`, `logical_reasoning`, `common_knowledge`, `pattern`.
-
-**Lambada cloze variants** (perplexity only):
-
-| Task | Dataset | Prompt |
-|------|---------|--------|
-| `lambada:standard_cloze` | `cimec/lambada` | `{context} ____.  ->` |
-| `lambada:openai_cloze` | `EleutherAI/lambada_openai` | `{context} ____.  ->` |
-
-### Free-form GenQA tasks
-
-| Task | Dataset | Eval split | ICL | `:bpb` | `:gen` gen_size | `:gen` metrics |
-|------|---------|------------|-----|--------|-----------------|----------------|
-| `coqa` | `EleutherAI/coqa` (parquet) | validation | 0 | `target_bpb` | 50 | f1, em |
-| `drop` | `lighteval/drop_harness` | validation | 5 | `target_bpb` | 100 | em, f1 (DROP) |
-| `jeopardy` | `soldni/jeopardy` (mosaicml_gauntlet, 2117) | train | 5 | `target_bpb` | 50 | f1, em |
-| `natural_questions` | `google-research-datasets/nq_open` | validation | 5 | `target_bpb` | 50 | f1, em |
-| `squad` | `allenai/squad` (v1.1) | validation | 5 | `target_bpb` | 50 | f1, em |
-| `squad_v2` | `rajpurkar/squad_v2` (answerable-only) | validation | 5 | `target_bpb` | 200 | f1, em |
-| `triviaqa` | `mandarjoshi/trivia_qa` (rc.nocontext) | validation | 5 | `target_bpb` | 20 | f1, em |
-| `popqa` | `akariasai/PopQA` | test | 5 | `target_bpb` | 8 | f1, em |
-| `wikifact:{subset}` (81) | `lighteval/wikifact` | test | 5 | `target_bpb` | 8 | f1, em |
-
-Prompt formats:
-- **CoQA**: `Passage: {story}\n\nFinal question:\n\nQuestion: {q}\nAnswer:` — stop `["\n\n"]`
-- **DROP**: `Passage: {passage}\nQuestion: {question}\nAnswer:` — stop `["\n"]`
-- **Jeopardy**: `Category: {cat}\nQuestion: {q}\nAnswer:` — stop `["\n\n", "Question:", "Category:"]`
-- **NaturalQs**: `Question: {question}\nAnswer:` — stop `["Question:", "Q:", "\n\n"]`
-- **SQuAD**: `Title: {title}\n\nBackground: {context}\n\nQuestion: {question}\n\nAnswer:` — stop `["Title:", "\n\n"]`
-- **SQuAD v2**: QA template (answerable-only via `hf_filter`) — stop `["\n", "Question:", "question:"]`
-- **TriviaQA**: `Question: {question}\nAnswer:` — stop `["\n", ".", ","]`; gold = canonical value + aliases
-- **PopQA**: `{question} ` — stop `["\n"]`; gold = `possible_answers` aliases
-- **WikiFact**: `{question} ` — stop `["\n"]`; 81 relation subsets — superset `wikifact:gen` / `wikifact:bpb`
-
-### CoT generation tasks
-
-| Task | Dataset | ICL | gen_size | Metric | Subsets |
-|------|---------|-----|----------|--------|---------|
-| `gsm8k` | `openai/gsm8k` | 8 | 512 | `expr_gold_metric` | — |
-| `gsm_symbolic:{main,p1,p2}` | `apple/GSM-Symbolic` | 8 | 512 | `expr_gold_metric` | 3 |
-| `math_500` | `HuggingFaceH4/MATH-500` | 0 | 1024 | `expr_gold_metric` | — |
-| `bigbench_hard:{subset}` (27) | `lukaemon/bbh` | 3 | 1024 | em (after extraction) | 27 |
-| `agieval_eng_em:{subset}` (7) | `lighteval/agi_eval_en` | 0 | 512 | `gpqa_instruct` (CoT, extractive) | 7 |
-
-**AGIEval (English) subsets**: `aqua_rat`, `logiqa-en`, `lsat-ar`, `lsat-lr`, `lsat-rc`, `sat-en`, `sat-math`.
+| Dataset | Eval | FS | ICL | Metric | Notes |
+|---|---|---|---|---|---|
+| `truthfulqa/truthful_qa` (multiple_choice) | validation | — | 0 | `truthfulqa_mc2` | single score; built-in 5-QA primer |
 
----
+MC2 = normalized probability mass on the set of true answers. Higher is better.
+The two-key variant `truthfulqa:mc` (reports both `truthfulqa_mc1` + `truthfulqa_mc2`) is also available.
+
+```bash
+truthfulqa:mc2:cf|0
+```
 
-## Metric definitions
+### CyberMetric + SecQA
 
-**`:mcf_em`** — greedy decode (temperature=0), compare output to gold with exact_match.
+| Task | Dataset | Eval | FS | ICL |
+|---|---|---|---|---|
+| `cybermetric` | `tihanyin/CyberMetric` | train | train | 0 |
+| `secqa:v1` / `secqa:v2` | `zefang-liu/secqa` | test | test | 0 |
 
-**`:cf`** (completion formulation) — score full candidate answer text:
+```bash
+cybermetric:cf|0
+cybermetric:mcf|0
+cybermetric:mcf_em|0
+secqa:v1:cf|0
+secqa:v2:cf|0
+secqa:cf|0                   # both versions
 ```
-score_i = log p(answer_i | prompt)
+
+### MaScQA
+
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `heegyu/mascqa` | test | test | 0 | choices embedded in question text |
+
+```bash
+mascqa:cf|0
+mascqa:mcf|0
+mascqa:mcf_em|0
 ```
-Prediction = argmax. Normalizations: per-char (`LogProbCharNorm`) → `acc_norm`. BPB also computed from gold choice logprob in the same pass.
 
-**`:mcf`** (multiple-choice formulation) — prompt shows labeled options, score only the label token:
+### FormationEval
+
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `AlmazErmilov/FormationEval` | test | test | 0 | petroleum engineering |
+
+```bash
+formationeval:cf|0
+formationeval:mcf|0
+formationeval:mcf_em|0
 ```
-score_i = log p(" A" | prompt)   # or " B", " C", " D"
+
+### TeleQnA
+
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `netop/TeleQnA` | test | test | 0 | **gated** — must be pre-cached |
+
+```bash
+teleqna:cf|0
+teleqna:mcf|0
+teleqna:mcf_em|0
 ```
-Prediction = argmax label score. Reports `acc` and `acc_norm`.
 
-**`:bpb`** (bits-per-byte, standalone) — no choice ranking, gold continuation only:
+### LAB-Bench (TableQA)
+
+| Dataset | Eval | FS | ICL | Notes |
+|---|---|---|---|---|
+| `futurehouse/LAB-Bench` (TableQA) | train | train | 0 | biology; tables provided as images (text-only variant) |
+
+```bash
+labbench:cf|0
+labbench:mcf|0
+labbench:mcf_em|0
 ```
-BPB = -log2 p(gold | prompt) / bytes_utf8(gold)
+
+### TitanEval English Domain Tasks
+
+Loaded from local parquet (`data/titaneval/`). Source: TitanEval-MCQ benchmark suite.
+Each task has its own file: `tasks/{task}.py` (e.g., `tasks/preflight.py`).
+All expose `:cf` (acc + acc_norm_char + **BPB merged**), `:mcf` (acc + acc_norm_char), `:mcf_em` (exact match, greedy decode).
+
+> **Few-shot note:** These tasks have **test split only** — no dedicated few-shot split exists.
+> The config sets `few_shots_split="test"`, `few_shots_select="random_sampling"` to allow
+> CLI-level overrides, but running with `|N` (N > 0) draws examples from the test set itself
+> (leakage risk). **Recommended: use `|0` (0-shot) for all TitanEval tasks.**
+
+| Task | Domain | Rows | ICL | Notes |
+|---|---|---|---|---|
+| `preflight` | Aviation safety | 300 | 0 | |
+| `chembench` | Chemistry (analytical/organic/physical) | 2,542 | 0 | |
+| `esgenius` | ESG / sustainability | 1,136 | 0 | |
+| `xfinbench` | Finance (cross-lingual, EN subset) | 588 | 0 | 4 rows filtered (missing choices) |
+| `geobench` | Geoscience | 1,390 | 0 | |
+
+```bash
+preflight:cf|0
+preflight:mcf|0
+preflight:mcf_em|0
+chembench:cf|0
+chembench:mcf|0
+esgenius:cf|0
+xfinbench:cf|0
+geobench:cf|0
+```
+
+---
+
+## 6. English GenQA Tasks
+
+All tasks in this section expose two variants:
+- `:gen|N` → `f1`, `em` (normalized: lowercase + remove punctuation on both gold and prediction)
+- `:bpb|N` → `target_bpb` (decoupled: same query, scores only the first gold continuation)
+
+Exception: `drop:gen` uses `Metrics.drop` (span/number/date-aware normalization), not the standard normalized EM/F1.
+
+**OLMo Gen2MC note:** OLMo's Base Main Suite reformulates DROP, CoQA, SQuAD, NaturalQs, and Jeopardy as MC (Gen2MC, §A.4.2). Our codebase implements only the generative (`:gen` / `:bpb`) form of these tasks; the one exception is Jeopardy, which additionally has a dedicated MC dataset exposed as `jeopardy_mc` (§5).
+
+| Task | Dataset | Eval | FS | ICL | gen_size | Notes |
+|---|---|---|---|---|---|---|
+| `coqa` | `EleutherAI/coqa` | validation | eval | 0 | 50 | OLMo Gen2MC |
+| `drop` | `lighteval/drop_harness` | validation | train | 5 | 100 | OLMo Gen2MC; uses `Metrics.drop` |
+| `jeopardy` | `soldni/jeopardy` | train | train | 5 | 50 | OLMo Gen2MC; MC form → `jeopardy_mc` (§5) |
+| `natural_questions` | `google-research-datasets/nq_open` | validation | train | 5 | 50 | OLMo Gen2MC |
+| `squad` | `allenai/squad` (v1.1) | validation | train | 5 | 50 | OLMo Gen2MC |
+| `squad_v2` | `rajpurkar/squad_v2` (answerable-only) | validation | train | 5 | 200 | OLMo Gen2MC |
+| `triviaqa` | `mandarjoshi/trivia_qa` (rc.nocontext) | validation | train | 5 | 20 | |
+| `popqa` | `akariasai/PopQA` | test | test | 5 | 8 | |
+| `wikifact:{subset}` (81) | `lighteval/wikifact` | test | test | 5 | 8 | |
+| `simpleqa` | `lighteval/SimpleQA` | test | few_shot | 0 | 50 | |
+
+```bash
+# CoQA (0-shot conversation QA)
+coqa:gen
+coqa:bpb
+
+# DROP (discrete reasoning)
+drop:gen|5
+drop:bpb|5
+
+# Jeopardy (gen form; MC form → jeopardy_mc in §5)
+jeopardy:gen|5
+jeopardy:bpb|5
+
+# NaturalQuestions
+natural_questions:gen|5
+natural_questions:bpb|5
+
+# SQuAD v1.1
+squad:gen|5
+squad:bpb|5
+
+# SQuAD v2 (unanswerable questions excluded via hf_filter)
+squad_v2:gen|5
+squad_v2:bpb|5
+
+# TriviaQA
+triviaqa:gen|5
+triviaqa:bpb|5
+
+# PopQA
+popqa:gen|5
+popqa:bpb|5
+
+# WikiFact (81 relation subsets)
+wikifact:gen|5             # all 81 subsets
+wikifact:bpb|5             # all 81 subsets
+wikifact:author:gen|5      # single subset
+
+# SimpleQA
+simpleqa:gen|0
+simpleqa:bpb|0
 ```
-Lower is better. Used for MATH and free-form GenQA. For MC tasks, BPB is reported inside `:cf`.
 
-**`:gen`** (greedy generation) — autoregressively decode up to `generation_size` tokens (temperature=0):
+**Prompt formats:**
+- CoQA: `Passage: {story}\n\nFinal question:\n\nQuestion: {q}\nAnswer:` — stop `["\n\n"]`
+- DROP: `Passage: {passage}\nQuestion: {question}\nAnswer:` — stop `["\n\n", "Passage:", "Question:"]`
+- Jeopardy: `Category: {cat}\nQuestion: {q}\nAnswer:` — stop `["\n\n", "Question:", "Category:"]`
+- NaturalQs: `Question: {question}\nAnswer:` — stop `["Question:", "Q:", "\n\n"]`
+- SQuAD: `Title: {title}\n\nBackground: {context}\n\nQuestion: {question}\n\nAnswer:` — stop `["Title:", "\n\n"]`
+- SQuAD v2: QA template (same prompt as SQuAD) — stop `["\n", "Question:", "question:"]`
+- TriviaQA: `Question: {question}\nAnswer:` — stop `["\n", ".", ","]`; all aliases as gold
+- PopQA: `{question} ` — stop `["\n"]`; `possible_answers` list as gold
+- WikiFact: `{question} ` — stop `["\n"]`; `references` list as gold
+- SimpleQA: `Question: {question}\nAnswer:` — stop `["\n"]`
+
+---
+
+## 7. Lambada & Basic Skills
+
+These tasks use rank-choice or cloze formulations. BPB is **decoupled** (separate `:bpb` config with a different prompt) — it is **not** merged into `:cf`.
+
+### Lambada
+
+| Dataset | Eval | ICL | Config | Metrics |
+|---|---|---|---|---|
+| `cimec/lambada` | test | 0 | `lambada:cf` | `acc_norm` (char-norm) |
+| | | | `lambada:bpb` | `target_bpb` (decoupled) |
+| | | | `lambada:standard_cloze` | `target_perplexity` |
+| `EleutherAI/lambada_openai` | test | 0 | `lambada:openai_cloze` | `target_perplexity` |
+
+`lambada:cf` uses a distractor format (gold last word vs 3 sampled distractors, scored by char-norm logprob).
+`lambada:bpb` scores the full passage continuation directly.
+
+```bash
+lambada:cf                    # rank-choice, acc_norm
+lambada:bpb                   # BPB decoupled
 ```
-y = argmax_v p(v | prompt, y_<t)
+
+### Basic Skills
+
+| Dataset | Eval | ICL | Config | Metrics |
+|---|---|---|---|---|
+| `allenai/basic-skills` | validation | 5 | `basic_skills:{subset}:cf` | `acc` (unnorm), `acc_norm` (token-norm) |
+| | | | `basic_skills:{subset}:mcf` | `acc` (unnorm), `acc_norm` (char-norm) |
+| | | | `basic_skills:{subset}:bpb` | `target_bpb` (decoupled) |
+
+`:cf` and `:mcf` use multi-choice formats with distractors.
+`:bpb` uses a single-choice (gold-only) prompt.
+
+```bash
+basic_skills:cf|5             # all 6 subsets, acc + acc_norm (token)
+basic_skills:mcf|5            # all 6 subsets, acc + acc_norm (char)
+basic_skills:bpb|5            # all 6 subsets, BPB decoupled
+basic_skills:arithmetic:cf|5  # single subset
+basic_skills:arithmetic:bpb|5
 ```
-Scored with F1 + EM (GenQA) or `expr_gold_metric` (math/CoT).
+
+**Subsets (6):** `arithmetic`, `string_operations`, `coding`, `logical_reasoning`,
+`common_knowledge`, `pattern`.
+
+---
+
+## 8. Skipped / Unavailable Tasks
+
+**Skipped (data issues):**
+- `nuclearqa` — all choices empty in titaneval parquets; source dataset not found
+- `ctibench` — 0 valid rows after filtering (all choices are empty strings in titaneval parquet)
+
+---
+
+## 9. Metric Definitions
+
+**`:cf`** — log p(answer_i | prompt) for each candidate; argmax. Reports:
+- `acc` — argmax correct
+- `acc_norm` — argmax correct with length normalization (per-char by default; per-token for `basic_skills:cf`, see §7)
+- `target_bpb` — `-log₂ p(gold) / bytes_utf8(gold)`, lower is better
+  (merged into `:cf` for English MC QA tasks §5 only; decoupled for lambada, basic_skills)
+
+**`:mcf`** — log p(" A" | prompt), log p(" B" | prompt), … argmax. Reports `acc`, `acc_norm`.
+
+**`:mcf_em`** — greedy decode 1 token, exact-match against gold label string. Reports `em`.
+
+**`:bpb`** (standalone) — single gold continuation, no ranking. Reports `target_bpb`.
+Used for code, math, and as decoupled companion to `:gen` or `:cf`.
+
+**`:gen`** — greedy decode up to `generation_size` tokens. Scored with:
+- `qa_em` — exact match after `harness_triviaqa_normalizer` (lowercase + remove punctuation) on both gold and prediction; aggregates `max` over all gold aliases
+- `qa_f1` — bag-of-words F1 after same normalization; aggregates `max` over all gold aliases
+- Exception: `drop:gen` uses `Metrics.drop` (handles number spans, dates, multi-span answers)
diff --git a/TASK_NAMING_Multilingual.md b/TASK_NAMING_Multilingual.md
index e7ad15a50..64423ea16 100644
--- a/TASK_NAMING_Multilingual.md
+++ b/TASK_NAMING_Multilingual.md
@@ -30,7 +30,7 @@ corresponding English task file instead (e.g., `arc.py`, `mmlu.py`, `hellaswag.p
 | `mlmm_arc:cf\|5` | all 26 languages, CF variant |
 | `mmlu_prox:cf\|5` | all 28 multilingual languages, CF variant |
 | `mlmm_hellaswag:cf\|5` | all 32 languages |
-| `mgsm:gen\|8` | all 10 languages |
+| `mgsm:gen\|8` | all 40 languages |
 | `wmt24pp\|0` | all ~24 English-centric language slices (both directions each) |
 | `wmt24pp:de_DE\|0` | German slice only: en→de_DE + de_DE→en |
 | `flores200\|0` | all English-centric FLORES language slices |
@@ -306,23 +306,32 @@ mlmm_hellaswag:zho:mcf_em|5
 
 | Task pattern | Dataset | Eval | FS split | ICL | Metrics |
 |---|---|---|---|---|---|
-| `mgsm:{lang}:gen\|8` | `juletxara/mgsm` | test | train | 8 | expr_gold_metric, multilingual_quasi_em |
+| `mgsm:{lang}:gen\|8` | `CohereLabs/global-mgsm` | test | train | 8 | expr_gold_metric, multilingual_quasi_em |
 
 Both `expr_gold_metric` (math expression parser, for Arabic-numeral answers) and
 `MultilingualQuasiExactMatchMetric` (language-aware fuzzy match, for non-ASCII digit systems)
-are reported in the same pass.
-
-**10 languages** (English excluded): bengali, french, german, japanese, russian, spanish,
-swahili, telugu, thai, chinese.
-
-**Language codes**:
-`ben`, `fra`, `deu`, `jpn`, `rus`, `spa`, `swa`, `tel`, `tha`, `zho`.
+are reported in the same pass. English is excluded — use `gsm8k` instead.
+
+**40 languages** (English excluded):
+
+| Code | Language | Code | Language | Code | Language | Code | Language |
+|---|---|---|---|---|---|---|---|
+| `amh` | Amharic | `ara` | Arabic | `ben` | Bengali | `cat` | Catalan |
+| `ces` | Czech | `cym` | Welsh | `deu` | German | `ell` | Greek |
+| `eus` | Basque | `fra` | French | `glg` | Galician | `guj` | Gujarati |
+| `hau` | Hausa | `hun` | Hungarian | `jpn` | Japanese | `kan` | Kannada |
+| `khm` | Khmer | `kir` | Kyrgyz | `kor` | Korean | `lug` | Ganda |
+| `mya` | Burmese | `nep` | Nepali | `rus` | Russian | `sin` | Sinhala |
+| `sna` | Shona | `sot` | Southern Sotho | `spa` | Spanish | `srp` | Serbian |
+| `swa` | Swahili | `tam` | Tamil | `tel` | Telugu | `tha` | Thai |
+| `urd` | Urdu | `uzb` | Uzbek | `vie` | Vietnamese | `wol` | Wolof |
+| `xho` | Xhosa | `yor` | Yoruba | `zho` | Chinese | `zul` | Zulu |
 
 File: `multilingual/tasks/mgsm.py`
 
 **Copy-pasteable examples:**
 ```
-# All 10 languages
+# All 40 languages
 mgsm:gen|8
 
 # Single language
@@ -449,6 +458,108 @@ included) + Chinese `zho_cn`/`zho_tw` + Portuguese `por_pt`/`por_br`.
 
 ---
 
+### CMMLU — `cmmlu` (Chinese, no lang suffix)
+
+| Task pattern | Dataset | Eval | FS split | ICL | Metrics |
+|---|---|---|---|---|---|
+| `cmmlu:{subset}:cf\|5` | `haonan-li/cmmlu` | test | dev | 5 | acc, acc_norm, bpb |
+| `cmmlu:{subset}:mcf\|5` | same | test | dev | 5 | acc, acc_norm |
+| `cmmlu:{subset}:mcf_em\|5` | same | test | dev | 5 | em |
+
+**67 subjects** (Chinese-language MMLU). Use `cmmlu:cf|5` to run all subjects as a single superset task.
+File: `multilingual/tasks/cmmlu.py`
+
+### TurkishMMLU — `turkishmmlu` (Turkish, no lang suffix)
+
+| Task pattern | Dataset | Eval | FS split | ICL | Metrics |
+|---|---|---|---|---|---|
+| `turkishmmlu:{subset}:cf\|5` | `AYueksel/TurkishMMLU` | test | dev | 5 | acc, acc_norm, bpb |
+| `turkishmmlu:{subset}:mcf\|5` | same | test | dev | 5 | acc, acc_norm |
+| `turkishmmlu:{subset}:mcf_em\|5` | same | test | dev | 5 | em |
+
+**9 subjects**: biology, chemistry, geography, history, mathematics, philosophy, physics, religion_and_ethics, turkish_language_and_literature.
+File: `multilingual/tasks/turkish_mmlu.py`
+
+### EXAMS — `exams` (multilingual, per-language aggregate)
+
+| Task pattern | Dataset | Eval | FS split | ICL | Metrics |
+|---|---|---|---|---|---|
+| `exams:{lang}:cf\|0` | `mhardalov/exams` | test | train | 0 | acc, acc_norm, bpb |
+| `exams:{lang}:mcf\|0` | same | test | train | 0 | acc, acc_norm |
+| `exams:{lang}:mcf_em\|0` | same | test | train | 0 | em |
+
+**16 languages**: albanian, arabic, bulgarian, croatian, french, german, hungarian, italian, lithuanian, macedonian, polish, portuguese, serbian, spanish, turkish, vietnamese. Task names take the language code in the `{lang}` slot (e.g., `exams:fra:cf|0` for French). Each language aggregates all subjects. File: `multilingual/tasks/exams.py`
+
+### MedExpQA — `medexpqa` (multilingual medical)
+
+| Task pattern | Dataset | Eval | FS split | ICL | Metrics |
+|---|---|---|---|---|---|
+| `medexpqa:{lang}:cf\|0` | `HiTZ/MedExpQA` | test | test | 0 | acc, acc_norm, bpb |
+| `medexpqa:{lang}:mcf\|0` | same | test | test | 0 | acc, acc_norm |
+| `medexpqa:{lang}:mcf_em\|0` | same | test | test | 0 | em |
+
+**4 languages**: `spa`, `fra`, `ita`, `eng`. File: `multilingual/tasks/medexpqa.py`
+
+### Chinese domain tasks (no lang suffix — single-language Chinese)
+
+| Task | Dataset | Eval | ICL | Notes |
+|---|---|---|---|---|
+| `agrieval:cf\|5` | `PaperHarvester/AgriEval` | train | 5 | Agriculture; single-choice only (`hf_filter`) |
+| `crop:cf\|5` | `AI4Agr/CROP-benchmark` | test | 5 | Crop science |
+| `fineval:cf\|5` | `SUFE-AIFLM-Lab/FinEval` | train | 5 | Finance; Chinese MMLU-style |
+
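+All three get cf/mcf/mcf_em. `agrieval` and `crop` set `few_shots_split` to their evaluation split, so `|5` draws demonstrations from the evaluated data itself (leakage risk). Files: `multilingual/tasks/agrieval.py` (defines both `agrieval` and `crop`), `multilingual/tasks/fineval.py`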
+
+### TitanEval Multilingual Domain Tasks
+
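+Loaded from local parquet files in `data/titaneval/`. There is no public HF repo; the data is bundled in this repo and tracked with Git LFS, so run `git lfs pull` if the parquet files are still pointer stubs.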
+Each task has its own file: `multilingual/tasks/{task}.py` (e.g., `multilingual/tasks/camb.py`).
+All expose `:cf` (acc + acc_norm_char + **BPB merged**), `:mcf` (acc + acc_norm_char), `:mcf_em` (exact match, greedy decode).
+
+> **Few-shot note:** These tasks have **test split only** — no dedicated few-shot split exists.
+> The config sets `few_shots_split="test"`, `few_shots_select="random_sampling"` to allow
+> CLI-level overrides, but running with `|N` (N > 0) draws examples from the test set itself
+> (leakage risk). **Recommended: use `|0` (0-shot) for all TitanEval tasks.**
+
+**Chinese (zh) — uses `Language.CHINESE` prompt template:**
+
+| Task | Domain | Rows | ICL |
+|---|---|---|---|
+| `camb` | Civil aviation maintenance | 7,969 | 0 |
+| `jecqa` | Chinese law exam | 1,998 | 0 |
+| `lexeval` | Legal evaluation | 10,920 | 0 |
+| `aecbench` | Architectural / civil engineering | 6,386 | 0 |
+
+**French (fr) — uses `Language.FRENCH` prompt template:**
+
+| Task | Domain | Rows | ICL |
+|---|---|---|---|
+| `frenchmedmcqa` | French medical licensing exam | 3,105 | 0 |
+| `mediqal` | French medical QA | 27,634 | 0 |
+
+**Arabic (ar) — uses `Language.ARABIC` prompt template:**
+
+| Task | Domain | Rows | ICL |
+|---|---|---|---|
+| `arabicmmlu` | Arabic MMLU | 14,455 | 0 |
+| `arastem` | Arabic STEM | 10,819 | 0 |
+
+```bash
+# Requires --load-multilingual; 0-shot recommended (test-only split)
+camb:cf|0
+camb:mcf|0
+camb:mcf_em|0
+jecqa:cf|0
+lexeval:cf|0
+aecbench:cf|0
+frenchmedmcqa:cf|0
+mediqal:cf|0
+arabicmmlu:cf|0
+arastem:cf|0
+```
+
+---
+
 ## Two-level averaging
 
 ### Level 1 — per-language average (automatic)
@@ -540,6 +651,37 @@ xquad:bpb|5                     # XQuAD, all 12 langs, BPB
 multi_wiki_qa:gen|0             # MultiWikiQA, all 54 slices (incl. eng)
 multi_wiki_qa:bpb|0             # MultiWikiQA, all 54 slices, BPB
 
+# --- Multilingual / single-language MMLU-style ---
+cmmlu:cf|5                      # Chinese MMLU, all 67 subjects, CF (acc+norm+bpb)
+cmmlu:mcf|5                     # Chinese MMLU, all 67 subjects, MCF
+cmmlu:mcf_em|5                  # Chinese MMLU, all 67 subjects, greedy
+cmmlu:agronomy:cf|5             # single subject
+
+turkishmmlu:cf|5                # Turkish MMLU, all 9 subjects, CF
+turkishmmlu:mcf|5
+turkishmmlu:mcf_em|5
+turkishmmlu:biology:cf|5        # single subject
+
+exams:cf|0                      # EXAMS multilingual, all 16 langs, CF
+exams:mcf|0
+exams:mcf_em|0
+exams:fra:cf|0                  # single language
+
+medexpqa:cf|0                   # MedExpQA, all 4 langs (spa/fra/ita/eng), CF
+medexpqa:mcf|0
+medexpqa:mcf_em|0
+medexpqa:spa:cf|0               # single language
+
+agrieval:cf|5                   # AgriEval Chinese agriculture, CF
+agrieval:mcf|5
+agrieval:mcf_em|5
+crop:cf|5                       # CROP Chinese crop science, CF
+crop:mcf|5
+crop:mcf_em|5
+fineval:cf|5                    # FinEval Chinese finance, CF
+fineval:mcf|5
+fineval:mcf_em|5
+
 # --- Translation ---
 wmt24pp|0                       # WMT24++, all English-centric language slices
 wmt24pp:de_DE|0                 # WMT24++, German slice (both directions)
diff --git a/data/titaneval/aecbench.parquet b/data/titaneval/aecbench.parquet
new file mode 100644
index 000000000..34fa55c56
--- /dev/null
+++ b/data/titaneval/aecbench.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f957409cf2d768a39490558386d9057b011d05dc9c1a4d25eee379e6d96a2570
+size 626336
diff --git a/data/titaneval/arabicmmlu.parquet b/data/titaneval/arabicmmlu.parquet
new file mode 100644
index 000000000..b03ccb8b4
--- /dev/null
+++ b/data/titaneval/arabicmmlu.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4031ff341db38d4d0418b9fe86f4297cbb9b2ed69b7ce029fed4cf5222eaa98
+size 1682400
diff --git a/data/titaneval/arastem.parquet b/data/titaneval/arastem.parquet
new file mode 100644
index 000000000..041d7922e
--- /dev/null
+++ b/data/titaneval/arastem.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25b950241f8d13a63af185242104e19d5ceefe0a5a1c08016c271bb9df45526b
+size 1108408
diff --git a/data/titaneval/camb.parquet b/data/titaneval/camb.parquet
new file mode 100644
index 000000000..d612c4469
--- /dev/null
+++ b/data/titaneval/camb.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a97e5dbfa8d1f4ded38acc60503b478fc2317d934a5447eebe078438500919c7
+size 779540
diff --git a/data/titaneval/chembench.parquet b/data/titaneval/chembench.parquet
new file mode 100644
index 000000000..cb171ad10
--- /dev/null
+++ b/data/titaneval/chembench.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc79af658739be361525b5a2aaecef42a077a6380f79872f30ba88b5b5ed4eab
+size 344067
diff --git a/data/titaneval/cybermetric.parquet b/data/titaneval/cybermetric.parquet
new file mode 100644
index 000000000..e5d3300a6
--- /dev/null
+++ b/data/titaneval/cybermetric.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44c6ccd81a038464b03649993e9813296b343e709f69cad4721a4e75b587b4f8
+size 1547418
diff --git a/data/titaneval/esgenius.parquet b/data/titaneval/esgenius.parquet
new file mode 100644
index 000000000..8bcfc5b0a
--- /dev/null
+++ b/data/titaneval/esgenius.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:048be850ec42630d9c1ac98597c0d1222a2def9e12b1ab3828726e5ed93da465
+size 2156197
diff --git a/data/titaneval/frenchmedmcqa.parquet b/data/titaneval/frenchmedmcqa.parquet
new file mode 100644
index 000000000..c9efcd7b2
--- /dev/null
+++ b/data/titaneval/frenchmedmcqa.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8347b970c914118f10293c6255d09ff10f3c5f6983f6b7af2aec4651f9ef0f8e
+size 692547
diff --git a/data/titaneval/geobench.parquet b/data/titaneval/geobench.parquet
new file mode 100644
index 000000000..9c8c97e37
--- /dev/null
+++ b/data/titaneval/geobench.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0ff5c41b5f1947413a632e11ea060f6a8dcf77fb37ba3fb12aefd177b32b114
+size 240640
diff --git a/data/titaneval/headqa.parquet b/data/titaneval/headqa.parquet
new file mode 100644
index 000000000..d9fb4202e
--- /dev/null
+++ b/data/titaneval/headqa.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4809ec8ed0fa1b17f36542486d8593af6ee85d92212be40218c98653ee7a4687
+size 3129133
diff --git a/data/titaneval/jecqa.parquet b/data/titaneval/jecqa.parquet
new file mode 100644
index 000000000..009bd4f5a
--- /dev/null
+++ b/data/titaneval/jecqa.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b158d449d187dcf58a3da32e1918f438cc77c32f9fbccad482d0c2c69bc48ae
+size 1126051
diff --git a/data/titaneval/lexeval.parquet b/data/titaneval/lexeval.parquet
new file mode 100644
index 000000000..3adf48b2e
--- /dev/null
+++ b/data/titaneval/lexeval.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35a3243bc682ccc3d7a01f6a5c3625e1ed08dc82223f518fca3bd6b6f8c43aec
+size 8383762
diff --git a/data/titaneval/med_qa.bak b/data/titaneval/med_qa.bak
new file mode 100644
index 000000000..f55bf4313
Binary files /dev/null and b/data/titaneval/med_qa.bak differ
diff --git a/data/titaneval/med_qa.parquet b/data/titaneval/med_qa.parquet
new file mode 100644
index 000000000..625931986
--- /dev/null
+++ b/data/titaneval/med_qa.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b57906cb75851cec4346d1264778c8cc0607a5bebafb3c4997caf1646d9520c9
+size 6094281
diff --git a/data/titaneval/mediqal.parquet b/data/titaneval/mediqal.parquet
new file mode 100644
index 000000000..7cea2c620
--- /dev/null
+++ b/data/titaneval/mediqal.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:332eb15d841ad60540f0738a6a1136d9133d552f9b7dd5c8e95fa369d837eab7
+size 9999318
diff --git a/data/titaneval/preflight.parquet b/data/titaneval/preflight.parquet
new file mode 100644
index 000000000..48243a469
--- /dev/null
+++ b/data/titaneval/preflight.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3e45d42693af51110fe40a4401e2b0325b95127050810c9bac1c56bc866ea25
+size 61282
diff --git a/data/titaneval/xfinbench.parquet b/data/titaneval/xfinbench.parquet
new file mode 100644
index 000000000..b4bcae7d5
--- /dev/null
+++ b/data/titaneval/xfinbench.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0a3a93b9dfa3c5f1b0f248ecb11107fb51fd985447250e360961ea1ab704995
+size 131528
diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index b707b4fdd..dc7dc9e80 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -63,6 +63,7 @@
 )
 from lighteval.metrics.normalizations import (
     bigbench_normalizer,
+    harness_triviaqa_normalizer,
     remove_braces,
     remove_braces_and_strip,
 )
@@ -339,6 +340,20 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+    # Normalized EM for open-domain gen QA (lowercase + remove punctuation on both gold and pred).
+    # Use for free-text fact-retrieval tasks (TriviaQA, WikiFact, PopQA, SQuAD, etc.)
+    # instead of exact_match, which is case-sensitive and punctuation-sensitive.
+    qa_em = SampleLevelMetric(
+        metric_name="em",
+        sample_level_fn=ExactMatches(
+            normalize_gold=harness_triviaqa_normalizer,
+            normalize_pred=harness_triviaqa_normalizer,
+            strip_strings=True,
+        ),
+        category=SamplingMethod.GENERATIVE,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
     expr_gold_metric = SampleLevelMetric(
         metric_name="extractive_match",
         sample_level_fn=MultilingualExtractiveMatchMetric(
@@ -462,6 +477,18 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+    # Normalized F1 for open-domain gen QA (lowercase + remove punctuation on both gold and pred).
+    qa_f1 = SampleLevelMetric(
+        metric_name="f1",
+        sample_level_fn=F1_score(
+            normalize_gold=harness_triviaqa_normalizer,
+            normalize_pred=harness_triviaqa_normalizer,
+            strip_strings=True,
+        ),
+        category=SamplingMethod.GENERATIVE,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
     f1_score_macro = CorpusLevelMetric(
         metric_name="f1",
         sample_level_fn=GenerativePreparator(),
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 370e3a249..8764c41f3 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -295,6 +295,133 @@ def _build_split(split: str) -> Dataset:
     )
 
 
+def _load_titaneval_dataset(task_name: str) -> DatasetDict:
+    """Load a titaneval task from the local parquet copy in data/titaneval/.
+
+    If the parquet is missing for known HF-derived benchmarks (med_qa, headqa,
+    cybermetric), attempts to auto-download and cache before raising.
+    """
+    data_dir = Path(__file__).parent.parent.parent.parent / "data" / "titaneval"
+    parquet_path = data_dir / f"{task_name}.parquet"
+
+    if not parquet_path.exists():
+        _try_cache_titaneval(task_name, parquet_path)
+
+    if not parquet_path.exists():
+        raise FileNotFoundError(
+            f"titaneval_local: parquet not found at {parquet_path}. "
+            "Copy it from titaneval-mcq/benchmarks/ into data/titaneval/ "
+            "or run scripts/cache_broken_hf_to_titaneval.py."
+        )
+    return load_dataset("parquet", data_files={"test": str(parquet_path)})
+
+
+def _try_cache_titaneval(task_name: str, target: Path) -> None:
+    """Auto-download known HF datasets if the titaneval parquet doesn't exist."""
+    import json
+    import zipfile
+
+    import pandas as pd
+    from huggingface_hub import hf_hub_download
+
+    COLUMNS = ["benchmark", "domain", "question", "choices", "answer_index", "answer_text"]
+
+    def _save(rows, name):
+        target.parent.mkdir(parents=True, exist_ok=True)
+        pd.DataFrame(rows, columns=COLUMNS).to_parquet(target, index=False)
+
+    try:
+        if task_name == "med_qa":
+            zip_path = hf_hub_download("bigbio/med_qa", "data_clean.zip", repo_type="dataset")
+            rows = []
+            with zipfile.ZipFile(zip_path, "r") as zf:
+                for arc in [
+                    "data_clean/questions/US/4_options/phrases_no_exclude_train.jsonl",
+                    "data_clean/questions/US/4_options/phrases_no_exclude_dev.jsonl",
+                    "data_clean/questions/US/4_options/phrases_no_exclude_test.jsonl",
+                ]:
+                    with zf.open(arc) as fh:
+                        for line in fh.read().decode("utf-8").strip().split("\n"):
+                            obj = json.loads(line)
+                            opts = obj["options"]
+                            labels = sorted(opts.keys())
+                            rows.append({
+                                "benchmark": "med_qa",
+                                "domain": obj.get("meta_info", ""),
+                                "question": obj["question"],
+                                "choices": [opts[k] for k in labels],
+                                "answer_index": labels.index(obj["answer_idx"]),
+                                "answer_text": opts[obj["answer_idx"]],
+                            })
+            _save(rows, "med_qa")
+            logger.info(f"Auto-cached {len(rows)} rows → {target}")
+
+        elif task_name == "headqa":
+            rows = []
+            for lang in ("en", "es"):
+                data_files = {
+                    s: f"https://huggingface.co/datasets/EleutherAI/headqa/resolve/main/{lang}/{s}.parquet"
+                    for s in ("train", "validation", "test")
+                }
+                ds = load_dataset("parquet", data_files=data_files)
+                for split_ds in ds.values():
+                    for item in split_ds:
+                        answers = item["answers"]
+                        choices = [a["atext"] for a in answers]
+                        aid = item["ra"]
+                        idx = next((i for i, a in enumerate(answers) if a["aid"] == aid), -1)
+                        if idx < 0:
+                            continue
+                        rows.append({
+                            "benchmark": "headqa",
+                            "domain": f"{item.get('category', '')}/{lang}",
+                            "question": item["qtext"],
+                            "choices": choices,
+                            "answer_index": idx,
+                            "answer_text": choices[idx],
+                        })
+            _save(rows, "headqa")
+            logger.info(f"Auto-cached {len(rows)} rows → {target}")
+
+        elif task_name == "cybermetric":
+            rows = []
+            for fname in [
+                "CyberMetric-10000-v1.json",
+                "CyberMetric-2000-v1.json",
+                "CyberMetric-500-v1.json",
+                "CyberMetric-80-v1.json",
+            ]:
+                local = hf_hub_download("tihanyin/CyberMetric", fname, repo_type="dataset")
+                with open(local) as fh:
+                    data = json.load(fh)
+                for q in data.get("questions", []):
+                    answers = q.get("answers", {})
+                    labels = sorted(answers.keys())
+                    sol = q.get("solution") or q.get("correct_solution", "")
+                    idx = labels.index(sol) if sol in labels else -1
+                    if idx < 0:
+                        continue
+                    rows.append({
+                        "benchmark": "cybermetric",
+                        "domain": "cybersecurity",
+                        "question": q["question"],
+                        "choices": [answers[k] for k in labels],
+                        "answer_index": idx,
+                        "answer_text": answers[sol],
+                    })
+                if len(rows) >= 10000:
+                    break
+            _save(rows, "cybermetric")
+            logger.info(f"Auto-cached {len(rows)} rows → {target}")
+
+    except Exception:
+        logger.warning(
+            f"Auto-cache failed for '{task_name}'. "
+            "Run scripts/cache_broken_hf_to_titaneval.py manually.",
+            exc_info=True,
+        )
+
+
 @dataclass
 class LightevalTaskConfig:
     """Configuration dataclass for a LightevalTask.
@@ -764,6 +891,12 @@ def download_dataset_worker(
                 dataset = dataset.filter(task.dataset_filter)
             return dataset  # type: ignore
 
+        if task.dataset_path == "titaneval_local":
+            dataset = _load_titaneval_dataset(task.dataset_config_name)
+            if task.dataset_filter is not None:
+                dataset = dataset.filter(task.dataset_filter)
+            return dataset  # type: ignore
+
         try:
             dataset = load_dataset(
                 path=task.dataset_path,
@@ -804,6 +937,12 @@ def download_dataset_worker(
                     dataset = dataset.filter(task.dataset_filter)
                 return dataset  # type: ignore
 
+            if task.dataset_path == "titaneval_local":
+                dataset = _load_titaneval_dataset(task.dataset_config_name)
+                if task.dataset_filter is not None:
+                    dataset = dataset.filter(task.dataset_filter)
+                return dataset  # type: ignore
+
             if _is_script_err:
                 dataset = _load_hub_raw_dataset_files(
                     dataset_path=task.dataset_path,
diff --git a/src/lighteval/tasks/multilingual/tasks/aecbench.py b/src/lighteval/tasks/multilingual/tasks/aecbench.py
new file mode 100644
index 000000000..a4d7774ce
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/aecbench.py
@@ -0,0 +1,66 @@
+"""
+name:
+AECBench Architectural Engineering QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/aecbench.parquet)
+
+abstract:
+Chinese architectural and civil engineering multiple-choice benchmark. 6,386
+questions from TitanEval-MCQ, 0-shot.
+
+languages:
+chinese
+
+tags:
+engineering, multiple-choice, qa, chinese
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _adapter(line):
+    return {
+        "question": line["question"],
+        "choices": line["choices"],
+        "gold_idx": line["answer_index"],
+    }
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"aecbench:{suffix}",
+        prompt_function=get_mcq_prompt_function(Language.CHINESE, _adapter, formulation=formulation),
+        hf_repo="titaneval_local",
+        hf_subset="aecbench",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",
+        few_shots_select="random_sampling",
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, formulation, metrics, gen_size in [
+        ("cf",     CFFormulation(),  _CF_METRICS,              -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,             -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],     1),
+    ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/agrieval.py b/src/lighteval/tasks/multilingual/tasks/agrieval.py
new file mode 100644
index 000000000..a69ba77b4
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/agrieval.py
@@ -0,0 +1,88 @@
+"""
+name:
+AgriEval + CROP
+
+datasets:
+PaperHarvester/AgriEval
+AI4Agr/CROP-benchmark
+
+abstract:
+Chinese agricultural knowledge benchmarks. AgriEval covers plant science,
+animal production, and related domains (variable 2–7 choices). CROP covers
+crop science with 4-choice questions. Both are Chinese-language.
+
+Single-language Chinese tasks — no language suffix in task name.
+
+languages:
+chinese
+
+tags:
+agriculture, knowledge, multilingual, multiple-choice, qa
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _agrieval_adapter(line):
+    opts = line["options"]
+    keys = sorted(opts.keys())
+    choices = [opts[k] for k in keys]
+    gold = keys.index(line["answer"])
+    return {"question": line["question"], "choices": choices, "gold_idx": gold}
+
+
+def _crop_adapter(line):
+    choices = [line[f"Option {l}"] for l in "ABCD"]
+    gold = list("ABCD").index(line["Answer"])
+    return {"question": line["Question"], "choices": choices, "gold_idx": gold}
+
+
+def _configs(name, repo, subset, split, adapter, fs_select):
+    return [
+        LightevalTaskConfig(
+            name=f"{name}:{suffix}",
+            prompt_function=get_mcq_prompt_function(Language.CHINESE, adapter, formulation=formulation),
+            hf_repo=repo,
+            hf_subset=subset,
+            hf_avail_splits=[split],
+            evaluation_splits=[split],
+            few_shots_split=split,
+            few_shots_select=fs_select,
+            # single-choice only; AgriEval has 单选/多选
+            hf_filter=(lambda line: line.get("question_type") == "单选") if name == "agrieval" else None,
+            generation_size=gen,
+            metrics=metrics,
+            stop_sequence=["\n"],
+            version=0,
+        )
+        for suffix, formulation, metrics, gen in [
+            ("cf",     CFFormulation(),  _CF_METRICS,             -1),
+            ("mcf",    MCFFormulation(), _MCF_METRICS,            -1),
+            ("mcf_em", MCFFormulation(), [Metrics.exact_match],    1),
+        ]
+    ]
+
+
+TASKS_TABLE = (
+    _configs("agrieval", "PaperHarvester/AgriEval", "default", "train", _agrieval_adapter, "random_sampling")
+    + _configs("crop",     "AI4Agr/CROP-benchmark",  "default", "test",  _crop_adapter,    "random_sampling")
+)
diff --git a/src/lighteval/tasks/multilingual/tasks/arabicmmlu.py b/src/lighteval/tasks/multilingual/tasks/arabicmmlu.py
new file mode 100644
index 000000000..7bcd4c718
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/arabicmmlu.py
@@ -0,0 +1,66 @@
+"""
+name:
+ArabicMMLU Arabic MMLU
+
+dataset:
+titaneval_local (local parquet — data/titaneval/arabicmmlu.parquet)
+
+abstract:
+Arabic MMLU multiple-choice benchmark covering diverse academic subjects.
+14,455 questions from TitanEval-MCQ, 0-shot.
+
+languages:
+arabic
+
+tags:
+general, multiple-choice, qa, arabic
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _adapter(line):
+    return {
+        "question": line["question"],
+        "choices": line["choices"],
+        "gold_idx": line["answer_index"],
+    }
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"arabicmmlu:{suffix}",
+        prompt_function=get_mcq_prompt_function(Language.ARABIC, _adapter, formulation=formulation),
+        hf_repo="titaneval_local",
+        hf_subset="arabicmmlu",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",
+        few_shots_select="random_sampling",
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, formulation, metrics, gen_size in [
+        ("cf",     CFFormulation(),  _CF_METRICS,              -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,             -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],     1),
+    ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/arastem.py b/src/lighteval/tasks/multilingual/tasks/arastem.py
new file mode 100644
index 000000000..4763293c3
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/arastem.py
@@ -0,0 +1,66 @@
+"""
+name:
+AraSTeM Arabic STEM QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/arastem.parquet)
+
+abstract:
+Arabic STEM multiple-choice benchmark covering science, technology, engineering,
+and math. 10,819 questions from TitanEval-MCQ, 0-shot.
+
+languages:
+arabic
+
+tags:
+stem, multiple-choice, qa, arabic
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [  # metric list paired with the "cf" row of TASKS_TABLE
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm normalization
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [  # metric list paired with the "mcf" row; no bits-per-byte entry
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _adapter(line):
+    """Build the question/choices/gold_idx dict from one dataset row."""
+    question = line["question"]
+    options = line["choices"]
+    answer_pos = line["answer_index"]
+    return {"question": question, "choices": options, "gold_idx": answer_pos}
+
+
+TASKS_TABLE = [  # one LightevalTaskConfig per (suffix, formulation, metrics, gen_size) row below
+    LightevalTaskConfig(
+        name=f"arastem:{suffix}",
+        prompt_function=get_mcq_prompt_function(Language.ARABIC, _adapter, formulation=formulation),
+        hf_repo="titaneval_local",
+        hf_subset="arastem",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # NOTE(review): same split as evaluation_splits; confirm few-shot sampling from eval data is intended
+        few_shots_select="random_sampling",
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, formulation, metrics, gen_size in [
+        ("cf",     CFFormulation(),  _CF_METRICS,              -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,             -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],     1),  # only row with generation_size=1 and exact_match
+    ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/camb.py b/src/lighteval/tasks/multilingual/tasks/camb.py
new file mode 100644
index 000000000..dfdfd3987
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/camb.py
@@ -0,0 +1,66 @@
+"""
+name:
+CAMB Civil Aviation Maintenance QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/camb.parquet)
+
+abstract:
+Chinese civil aviation maintenance multiple-choice benchmark. 7,969 questions
+from TitanEval-MCQ, 0-shot.
+
+languages:
+chinese
+
+tags:
+aviation, multiple-choice, qa, chinese
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [  # metric list paired with the "cf" row of TASKS_TABLE
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm normalization
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [  # metric list paired with the "mcf" row; no bits-per-byte entry
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _adapter(line):
+    """Build the question/choices/gold_idx dict from one dataset row."""
+    question = line["question"]
+    options = line["choices"]
+    answer_pos = line["answer_index"]
+    return {"question": question, "choices": options, "gold_idx": answer_pos}
+
+
+TASKS_TABLE = [  # one LightevalTaskConfig per (suffix, formulation, metrics, gen_size) row below
+    LightevalTaskConfig(
+        name=f"camb:{suffix}",
+        prompt_function=get_mcq_prompt_function(Language.CHINESE, _adapter, formulation=formulation),
+        hf_repo="titaneval_local",
+        hf_subset="camb",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # NOTE(review): same split as evaluation_splits; confirm few-shot sampling from eval data is intended
+        few_shots_select="random_sampling",
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, formulation, metrics, gen_size in [
+        ("cf",     CFFormulation(),  _CF_METRICS,              -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,             -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],     1),  # only row with generation_size=1 and exact_match
+    ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/cmmlu.py b/src/lighteval/tasks/multilingual/tasks/cmmlu.py
index 566fad0f2..0a62a04ad 100644
--- a/src/lighteval/tasks/multilingual/tasks/cmmlu.py
+++ b/src/lighteval/tasks/multilingual/tasks/cmmlu.py
@@ -6,7 +6,8 @@
 haonan-li/cmmlu
 
 abstract:
-Cmmlu multilingual benchmark.
+CMMLU (Chinese Massive Multitask Language Understanding) is a Chinese evaluation
+benchmark covering 67 topics from basic to advanced professional level.
 
 languages:
 chinese
@@ -15,125 +16,78 @@
 knowledge, multilingual, multiple-choice
 
 paper:
+https://arxiv.org/abs/2306.09212
 """
 
 from string import ascii_uppercase
 
-from lighteval.metrics.dynamic_metrics import (
-    LogLikelihoodAccMetric,
-)
-from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
 from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
-from lighteval.tasks.templates.utils.formulation import (
-    CFFormulation,
-    HybridFormulation,
-    MCFFormulation,
-)
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
 from lighteval.utils.language import Language
 
+_CF_METRICS = [  # metric list paired with the "cf" row of TASKS_TABLE
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm normalization
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [  # metric list paired with the "mcf" row; no bits-per-byte entry
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
 
 CMMLU_SUBSETS = [
-    "agronomy",
-    "anatomy",
-    "ancient_chinese",
-    "arts",
-    "astronomy",
-    "business_ethics",
-    "chinese_civil_service_exam",
-    "chinese_driving_rule",
-    "chinese_food_culture",
-    "chinese_foreign_policy",
-    "chinese_history",
-    "chinese_literature",
-    "chinese_teacher_qualification",
-    "clinical_knowledge",
-    "college_actuarial_science",
-    "college_education",
-    "college_engineering_hydrology",
-    "college_law",
-    "college_mathematics",
-    "college_medical_statistics",
-    "college_medicine",
-    "computer_science",
-    "computer_security",
-    "conceptual_physics",
-    "construction_project_management",
-    "economics",
-    "education",
-    "electrical_engineering",
-    "elementary_chinese",
-    "elementary_commonsense",
-    "elementary_information_and_technology",
-    "elementary_mathematics",
-    "ethnology",
-    "food_science",
-    "genetics",
-    "global_facts",
-    "high_school_biology",
-    "high_school_chemistry",
-    "high_school_geography",
-    "high_school_mathematics",
-    "high_school_physics",
-    "high_school_politics",
-    "human_sexuality",
-    "international_law",
-    "journalism",
-    "jurisprudence",
-    "legal_and_moral_basis",
-    "logical",
-    "machine_learning",
-    "management",
-    "marketing",
-    "marxist_theory",
-    "modern_chinese",
-    "nutrition",
-    "philosophy",
-    "professional_accounting",
-    "professional_law",
-    "professional_medicine",
-    "professional_psychology",
-    "public_relations",
-    "security_study",
-    "sociology",
-    "sports_science",
-    "traditional_chinese_medicine",
-    "virology",
-    "world_history",
-    "world_religions",
+    "agronomy", "anatomy", "ancient_chinese", "arts", "astronomy",  # 67 names in total; each is passed as hf_subset in TASKS_TABLE
+    "business_ethics", "chinese_civil_service_exam", "chinese_driving_rule",
+    "chinese_food_culture", "chinese_foreign_policy", "chinese_history",
+    "chinese_literature", "chinese_teacher_qualification", "clinical_knowledge",
+    "college_actuarial_science", "college_education", "college_engineering_hydrology",
+    "college_law", "college_mathematics", "college_medical_statistics",
+    "college_medicine", "computer_science", "computer_security", "conceptual_physics",
+    "construction_project_management", "economics", "education",
+    "electrical_engineering", "elementary_chinese", "elementary_commonsense",
+    "elementary_information_and_technology", "elementary_mathematics", "ethnology",
+    "food_science", "genetics", "global_facts", "high_school_biology",
+    "high_school_chemistry", "high_school_geography", "high_school_mathematics",
+    "high_school_physics", "high_school_politics", "human_sexuality",
+    "international_law", "journalism", "jurisprudence", "legal_and_moral_basis",
+    "logical", "machine_learning", "management", "marketing", "marxist_theory",
+    "modern_chinese", "nutrition", "philosophy", "professional_accounting",
+    "professional_law", "professional_medicine", "professional_psychology",
+    "public_relations", "security_study", "sociology", "sports_science",
+    "traditional_chinese_medicine", "virology", "world_history", "world_religions",
 ]
 
 
+def _adapter(line):
+    """Build the question/choices/gold_idx dict from a row with columns A-D and a letter Answer."""
+    question = line["Question"]
+    options = [line[letter] for letter in ("A", "B", "C", "D")]
+    answer_pos = ascii_uppercase.index(line["Answer"])
+    return {"question": question, "choices": options, "gold_idx": answer_pos}
+
+
 TASKS_TABLE = [
     LightevalTaskConfig(
-        name=f"cmmlu_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}",
-        prompt_function=get_mcq_prompt_function(
-            Language.CHINESE,
-            lambda line: {
-                "question": line["Question"],
-                "choices": [line["A"], line["B"], line["C"], line["D"]],
-                "gold_idx": ascii_uppercase.index(line["Answer"]),
-            },
-            formulation=formulation,
-        ),
+        name=f"cmmlu:{subset}:{suffix}",  # one task per (subset, variant) pair
+        prompt_function=get_mcq_prompt_function(Language.CHINESE, _adapter, formulation=formulation),
         hf_repo="haonan-li/cmmlu",
         hf_subset=subset,
         evaluation_splits=("test",),
         few_shots_split="dev",
-        metrics=get_metrics_for_formulation(
-            formulation,
-            [
-                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
-                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
-                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
-            ],
-        ),
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=1,
     )
     for subset in CMMLU_SUBSETS
-    for formulation in [
-        MCFFormulation(),
-        CFFormulation(),
-        HybridFormulation(),
+    for suffix, formulation, metrics, gen_size in [  # variant rows: (name suffix, formulation, metric list, generation_size)
+        ("cf",     CFFormulation(),  _CF_METRICS,              -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,             -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],     1),  # only row with generation_size=1 and exact_match
     ]
 ]
diff --git a/src/lighteval/tasks/multilingual/tasks/exams.py b/src/lighteval/tasks/multilingual/tasks/exams.py
index 28e40e989..8a12d6ada 100644
--- a/src/lighteval/tasks/multilingual/tasks/exams.py
+++ b/src/lighteval/tasks/multilingual/tasks/exams.py
@@ -6,7 +6,8 @@
 mhardalov/exams
 
 abstract:
-Exams multilingual benchmark.
+EXAMS is a multilingual benchmark for school-level subject knowledge across
+16 languages and multiple subjects per language.
 
 languages:
 albanian, arabic, bulgarian, croatian, french, german, hungarian, italian,
@@ -17,177 +18,90 @@
 knowledge, multilingual, multiple-choice
 
 paper:
+https://arxiv.org/abs/2011.03080
 """
 
-from functools import partial
-
 from langcodes import Language as LangCodeLanguage
 from langcodes import standardize_tag
 
-from lighteval.metrics.dynamic_metrics import (
-    LogLikelihoodAccMetric,
-)
-from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset
 from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
-from lighteval.tasks.templates.utils.formulation import (
-    CFFormulation,
-    HybridFormulation,
-    MCFFormulation,
-)
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
 from lighteval.utils.language import Language
 
+_CF_METRICS = [  # metric list paired with the "cf" row of TASKS_TABLE
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm normalization
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [  # metric list paired with the "mcf" row; no bits-per-byte entry
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+# Languages with EXAMS data; subjects are aggregated per language (the row filter matches on language only).
+_LANGUAGES = [
+    Language.ARABIC,
+    Language.BULGARIAN,
+    Language.CROATIAN,
+    Language.HUNGARIAN,
+    Language.ITALIAN,
+    Language.SERBIAN,
+    Language.FRENCH,
+    Language.GERMAN,
+    Language.SPANISH,
+    Language.LITHUANIAN,
+    Language.ALBANIAN,
+    Language.MACEDONIAN,
+    Language.TURKISH,
+    Language.POLISH,
+    Language.PORTUGUESE,
+    Language.VIETNAMESE,
+]
+
+
+def _lang_name(language: Language) -> str:  # langcodes' language_name() for the standardized tag of language.value
+    return LangCodeLanguage(standardize_tag(language.value)).language_name()
+
+
+def _make_filter(language: Language):
+    """Return a row predicate: answerKey is not "@" and the row's language matches."""
+    expected = _lang_name(language)
+    def _keep(line):
+        return line["answerKey"] != "@" and line["info"]["language"] == expected
+    return _keep
+
 
-exams_subjects_by_lang: dict[Language, set[str]] = {
-    Language.ARABIC: {"Biology", "Islamic Studies", "Physics", "Science", "Social"},
-    Language.BULGARIAN: {"Biology", "Chemistry", "Geography", "History", "Philosophy", "Physics"},
-    Language.CROATIAN: {
-        "Biology",
-        "Chemistry",
-        "Ethics",
-        "Fine Arts",
-        "Geography",
-        "Geology",
-        "History",
-        "Informatics",
-        "Philosophy",
-        "Physics",
-        "Politics",
-        "Psychology",
-        "Religion",
-        "Sociology",
-    },
-    Language.HUNGARIAN: {
-        "Agriculture",
-        "Agriculture (Mechanical knowledge)",
-        "Biology",
-        "Chemistry",
-        "Economics",
-        "Economics & Marketing",
-        "Economics Basics (Business)",
-        "Economics Basics (Theoretical)",
-        "Forestry",
-        "Geography",
-        "Landscaping",
-        "Physics",
-        "Politics",
-        "Tourism",
-    },
-    Language.ITALIAN: {
-        "Biology",
-        "Chemistry",
-        "Ethics",
-        "Geography",
-        "Geology",
-        "History",
-        "Informatics",
-        "Philosophy",
-        "Physics",
-        "Politics",
-        "Psychology",
-        "Sociology",
-    },
-    Language.SERBIAN: {
-        "Biology",
-        "Chemistry",
-        "Ethics",
-        "Geography",
-        "Geology",
-        "History",
-        "Informatics",
-        "Philosophy",
-        "Physics",
-        "Politics",
-        "Psychology",
-        "Religion",
-        "Sociology",
-    },
-    Language.FRENCH: {"Economics", "Economics & Marketing", "Economics Basics (Theoretical)", "Geography", "Physics"},
-    Language.GERMAN: {
-        "Chemistry",
-        "Economics",
-        "Economics & Marketing",
-        "Economics Basics (Theoretical)",
-        "Geography",
-        "Physics",
-        "Tourism",
-    },
-    Language.SPANISH: {"Geography", "Physics"},
-    Language.LITHUANIAN: {"Geology", "History"},
-    Language.ALBANIAN: {
-        "Biology",
-        "Business",
-        "Chemistry",
-        "Fine Arts",
-        "History",
-        "Philosophy",
-        "Physics",
-        "Sociology",
-    },
-    Language.MACEDONIAN: {
-        "Biology",
-        "Business",
-        "Chemistry",
-        "Fine Arts",
-        "History",
-        "Philosophy",
-        "Physics",
-        "Sociology",
-    },
-    Language.TURKISH: {
-        "Biology",
-        "Business",
-        "Chemistry",
-        "Geography",
-        "History",
-        "Philosophy",
-        "Physics",
-        "Sociology",
-    },
-    Language.POLISH: {"Professional"},
-    Language.PORTUGUESE: {"Biology", "Economics", "Geology", "Philosophy"},
-    Language.VIETNAMESE: {"Biology", "Chemistry", "Citizenship", "Geography", "History", "Physics"},
-}
+def _adapter(line):
+    """Build the question/choices/gold_idx dict from a row with a nested question struct."""
+    question = line["question"]
+    options = question["choices"]
+    answer_pos = options["label"].index(line["answerKey"])
+    return {"question": question["stem"], "choices": options["text"], "gold_idx": answer_pos}
 
 
 TASKS_TABLE = [
     LightevalTaskConfig(
-        name=f"exams_{language.value}_{formulation.name.lower()}:{normalize_subset(subject)}",
-        prompt_function=get_mcq_prompt_function(
-            language,
-            lambda line: {
-                "question": line["question"]["stem"],
-                "choices": line["question"]["choices"]["text"],
-                "gold_idx": line["question"]["choices"]["label"].index(line["answerKey"]),
-            },
-            formulation=formulation,
-        ),
+        name=f"exams:{language.value}:{suffix}",  # one task per (language, variant) pair
+        prompt_function=get_mcq_prompt_function(language, _adapter, formulation=formulation),
         hf_repo="mhardalov/exams",
         hf_subset="multilingual",
-        # Weird bug in dataset
-        hf_filter=partial(
-            lambda language, subject, line: line["answerKey"] != "@"
-            and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name()
-            and line["info"]["subject"] == subject,
-            language,
-            subject,
-        ),
+        hf_filter=_make_filter(language),  # keeps rows of this language whose answerKey is not "@"
         evaluation_splits=("test",),
         few_shots_split="train",
-        metrics=get_metrics_for_formulation(
-            formulation,
-            [
-                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
-                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
-            ],
-        ),
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=1,
     )
-    for language in exams_subjects_by_lang.keys()
-    for subject in exams_subjects_by_lang[language]
-    for formulation in [
-        MCFFormulation(),
-        CFFormulation(),
-        HybridFormulation(),
+    for language in _LANGUAGES
+    for suffix, formulation, metrics, gen_size in [  # variant rows: (name suffix, formulation, metric list, generation_size)
+        ("cf",     CFFormulation(),  _CF_METRICS,             -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,            -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],    1),  # only row with generation_size=1 and exact_match
     ]
 ]
diff --git a/src/lighteval/tasks/multilingual/tasks/fineval.py b/src/lighteval/tasks/multilingual/tasks/fineval.py
new file mode 100644
index 000000000..afdf8acd8
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/fineval.py
@@ -0,0 +1,69 @@
+"""
+name:
+FinEval
+
+dataset:
+SUFE-AIFLM-Lab/FinEval
+
+abstract:
+FinEval is a Chinese financial knowledge benchmark covering finance, economics,
+accounting, and professional certificates, drawn from university-level exams.
+Single-language Chinese task — no language suffix in task name.
+
+languages:
+chinese
+
+tags:
+finance, knowledge, multilingual, multiple-choice, qa
+
+paper:
+https://arxiv.org/abs/2308.09975
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [  # metric list paired with the "cf" row of TASKS_TABLE
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm normalization
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [  # metric list paired with the "mcf" row; no bits-per-byte entry
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _fineval_adapter(line):
+    letters = ["A", "B", "C", "D"]  # option columns, in answer-letter order
+    options, answer_pos = [line[c] for c in letters], letters.index(line["answer"])
+    return {"question": line["question"], "choices": options, "gold_idx": answer_pos}
+
+
+TASKS_TABLE = [  # one LightevalTaskConfig per (suffix, formulation, metrics, gen) row below
+    LightevalTaskConfig(
+        name=f"fineval:{suffix}",
+        prompt_function=get_mcq_prompt_function(Language.CHINESE, _fineval_adapter, formulation=formulation),
+        hf_repo="SUFE-AIFLM-Lab/FinEval",
+        hf_subset="default",
+        hf_avail_splits=["train"],
+        evaluation_splits=["train"],
+        few_shots_split="train",  # NOTE(review): same split as evaluation_splits; confirm few-shot sampling from eval data is intended
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, formulation, metrics, gen in [
+        ("cf",     CFFormulation(),  _CF_METRICS,             -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,            -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],    1),  # only row with generation_size=1 and exact_match
+    ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/frenchmedmcqa.py b/src/lighteval/tasks/multilingual/tasks/frenchmedmcqa.py
new file mode 100644
index 000000000..393dd5ab2
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/frenchmedmcqa.py
@@ -0,0 +1,66 @@
+"""
+name:
+FrenchMedMCQA French Medical Licensing QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/frenchmedmcqa.parquet)
+
+abstract:
+French medical licensing exam multiple-choice benchmark. 3,105 questions
+from TitanEval-MCQ, 0-shot.
+
+languages:
+french
+
+tags:
+medical, multiple-choice, qa, french
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [  # metric list paired with the "cf" row of TASKS_TABLE
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm normalization
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [  # metric list paired with the "mcf" row; no bits-per-byte entry
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _adapter(line):
+    """Build the question/choices/gold_idx dict from one dataset row."""
+    question = line["question"]
+    options = line["choices"]
+    answer_pos = line["answer_index"]
+    return {"question": question, "choices": options, "gold_idx": answer_pos}
+
+
+TASKS_TABLE = [  # one LightevalTaskConfig per (suffix, formulation, metrics, gen_size) row below
+    LightevalTaskConfig(
+        name=f"frenchmedmcqa:{suffix}",
+        prompt_function=get_mcq_prompt_function(Language.FRENCH, _adapter, formulation=formulation),
+        hf_repo="titaneval_local",
+        hf_subset="frenchmedmcqa",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # NOTE(review): same split as evaluation_splits; confirm few-shot sampling from eval data is intended
+        few_shots_select="random_sampling",
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, formulation, metrics, gen_size in [
+        ("cf",     CFFormulation(),  _CF_METRICS,              -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,             -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],     1),  # only row with generation_size=1 and exact_match
+    ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/jecqa.py b/src/lighteval/tasks/multilingual/tasks/jecqa.py
new file mode 100644
index 000000000..31102c69b
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/jecqa.py
@@ -0,0 +1,66 @@
+"""
+name:
+JEC-QA Chinese Law Exam
+
+dataset:
+titaneval_local (local parquet — data/titaneval/jecqa.parquet)
+
+abstract:
+Chinese judicial examination multiple-choice benchmark. 1,998 questions
+from TitanEval-MCQ, 0-shot.
+
+languages:
+chinese
+
+tags:
+law, multiple-choice, qa, chinese
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [  # metric list paired with the "cf" row of TASKS_TABLE
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm normalization
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [  # metric list paired with the "mcf" row; no bits-per-byte entry
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _adapter(line):
+    """Build the question/choices/gold_idx dict from one dataset row."""
+    question = line["question"]
+    options = line["choices"]
+    answer_pos = line["answer_index"]
+    return {"question": question, "choices": options, "gold_idx": answer_pos}
+
+
+TASKS_TABLE = [  # one LightevalTaskConfig per (suffix, formulation, metrics, gen_size) row below
+    LightevalTaskConfig(
+        name=f"jecqa:{suffix}",
+        prompt_function=get_mcq_prompt_function(Language.CHINESE, _adapter, formulation=formulation),
+        hf_repo="titaneval_local",
+        hf_subset="jecqa",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # NOTE(review): same split as evaluation_splits; confirm few-shot sampling from eval data is intended
+        few_shots_select="random_sampling",
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, formulation, metrics, gen_size in [
+        ("cf",     CFFormulation(),  _CF_METRICS,              -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,             -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],     1),  # only row with generation_size=1 and exact_match
+    ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/lexeval.py b/src/lighteval/tasks/multilingual/tasks/lexeval.py
new file mode 100644
index 000000000..c3a4d1e85
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/lexeval.py
@@ -0,0 +1,66 @@
+"""
+name:
+LexEval Legal Evaluation QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/lexeval.parquet)
+
+abstract:
+Legal evaluation multiple-choice benchmark (Chinese). 10,920 questions
+from TitanEval-MCQ, 0-shot.
+
+languages:
+chinese
+
+tags:
+law, multiple-choice, qa, chinese
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [  # metric list paired with the "cf" row of TASKS_TABLE
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm normalization
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [  # metric list paired with the "mcf" row; no bits-per-byte entry
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _adapter(line):
+    """Build the question/choices/gold_idx dict from one dataset row."""
+    question = line["question"]
+    options = line["choices"]
+    answer_pos = line["answer_index"]
+    return {"question": question, "choices": options, "gold_idx": answer_pos}
+
+
+TASKS_TABLE = [  # one LightevalTaskConfig per (suffix, formulation, metrics, gen_size) row below
+    LightevalTaskConfig(
+        name=f"lexeval:{suffix}",
+        prompt_function=get_mcq_prompt_function(Language.CHINESE, _adapter, formulation=formulation),
+        hf_repo="titaneval_local",
+        hf_subset="lexeval",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # NOTE(review): same split as evaluation_splits; confirm few-shot sampling from eval data is intended
+        few_shots_select="random_sampling",
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, formulation, metrics, gen_size in [
+        ("cf",     CFFormulation(),  _CF_METRICS,              -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,             -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],     1),  # only row with generation_size=1 and exact_match
+    ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/medexpqa.py b/src/lighteval/tasks/multilingual/tasks/medexpqa.py
new file mode 100644
index 000000000..29de5b3b1
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/medexpqa.py
@@ -0,0 +1,86 @@
+"""
+name:
+MedExpQA
+
+dataset:
+HiTZ/MedExpQA
+
+abstract:
+MedExpQA is a multilingual medical expert QA benchmark based on Spanish
+board-exam style questions, with translations to French, Italian, and English.
+Each question has 4-5 options with a single correct answer.
+
+languages:
+english, french, italian, spanish
+
+tags:
+medical, multilingual, multiple-choice, qa
+
+paper:
+https://arxiv.org/abs/2404.05590
+"""
+
+import ast
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [  # metric list paired with the "cf" row of TASKS_TABLE
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm normalization
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [  # metric list paired with the "mcf" row; no bits-per-byte entry
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+# (lighteval Language, HF config name)
+_LANGUAGES = [  # the second item of each pair is passed as hf_subset in TASKS_TABLE
+    (Language.SPANISH, "es"),
+    (Language.FRENCH,  "fr"),
+    (Language.ITALIAN, "it"),
+    (Language.ENGLISH, "en"),
+]
+
+
+def _make_adapter():
+    def adapter(line):
+        """Map a MedExpQA row to question/choices/gold_idx; choices are ordered by numeric option key."""
+        opts = ast.literal_eval(line["options"]) if isinstance(line["options"], str) else line["options"]
+        keys = sorted((k for k, v in opts.items() if v is not None), key=int)  # skip options whose value is None
+        gold = [int(k) for k in keys].index(int(line["correct_option"]))  # numeric match works for "1" and 1 keys
+        return {"question": line["full_question"], "choices": [opts[k] for k in keys], "gold_idx": gold}
+    return adapter
+
+
+_adapter = _make_adapter()  # single adapter instance shared by every task config below
+
+TASKS_TABLE = [  # one LightevalTaskConfig per (language, variant) pair
+    LightevalTaskConfig(
+        name=f"medexpqa:{language.value}:{suffix}",
+        prompt_function=get_mcq_prompt_function(language, _adapter, formulation=formulation),
+        hf_repo="HiTZ/MedExpQA",
+        hf_subset=hf_config,
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # NOTE(review): same split as evaluation_splits; confirm few-shot sampling from eval data is intended
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for language, hf_config in _LANGUAGES
+    for suffix, formulation, metrics, gen in [
+        ("cf",     CFFormulation(),  _CF_METRICS,             -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,            -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],    1),  # only row with generation_size=1 and exact_match
+    ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/mediqal.py b/src/lighteval/tasks/multilingual/tasks/mediqal.py
new file mode 100644
index 000000000..47bab111f
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/mediqal.py
@@ -0,0 +1,66 @@
+"""
+name:
+MediQAL French Medical QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/mediqal.parquet)
+
+abstract:
+French medical QA multiple-choice benchmark. 27,634 questions
+from TitanEval-MCQ, 0-shot.
+
+languages:
+french
+
+tags:
+medical, multiple-choice, qa, french
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _adapter(line):
+    """Map a row's question/choices/answer_index onto the question/choices/gold_idx adapter keys."""
+    question = line["question"]
+    options = line["choices"]
+    gold = line["answer_index"]
+    return {"question": question, "choices": options, "gold_idx": gold}
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"mediqal:{suffix}",
+        prompt_function=get_mcq_prompt_function(Language.FRENCH, _adapter, formulation=formulation),
+        hf_repo="titaneval_local",
+        hf_subset="mediqal",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # few-shot examples come from the same split that is evaluated
+        few_shots_select="random_sampling",
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, formulation, metrics, gen_size in [  # one config per suffix
+        ("cf",     CFFormulation(),  _CF_METRICS,              -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,             -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],     1),
+    ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py b/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py
index 410268f9e..b8ce1f301 100644
--- a/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py
+++ b/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py
@@ -6,7 +6,8 @@
 AYueksel/TurkishMMLU
 
 abstract:
-Turkish Mmlu multilingual benchmark.
+TurkishMMLU is a Turkish-language multiple-choice benchmark modelled after
+MMLU, covering 9 school subjects.
 
 languages:
 turkish
@@ -19,22 +20,27 @@
 
 from string import ascii_uppercase
 
-from lighteval.metrics.dynamic_metrics import (
-    LogLikelihoodAccMetric,
-)
-from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset
 from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
-from lighteval.tasks.templates.utils.formulation import (
-    CFFormulation,
-    HybridFormulation,
-    MCFFormulation,
-)
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
 from lighteval.utils.language import Language
 
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
 
-TURKISH_MMLU_SUBSET = [
+# HF subset names (capitalized); task names use lowercase.
+TURKISH_MMLU_SUBSETS = [
     "Biology",
     "Chemistry",
     "Geography",
@@ -47,35 +53,31 @@
 ]
 
 
+def _adapter(line):
+    """Map a row onto the question/choices/gold_idx adapter keys."""
+    question, options = line["question"], line["choices"]
+    # The gold index is the position of the "answer" label within ascii_uppercase.
+    gold = ascii_uppercase.index(line["answer"])
+    return {"question": question, "choices": options, "gold_idx": gold}
+
+
 TASKS_TABLE = [
     LightevalTaskConfig(
-        name=f"mmlu_{Language.TURKISH.value}_{formulation.name.lower()}:{normalize_subset(subset)}",
-        prompt_function=get_mcq_prompt_function(
-            Language.TURKISH,
-            lambda line: {
-                "question": line["question"],
-                "choices": line["choices"],
-                "gold_idx": ascii_uppercase.index(line["answer"]),
-            },
-            formulation=formulation,
-        ),
+        name=f"turkishmmlu:{subset.lower()}:{suffix}",
+        prompt_function=get_mcq_prompt_function(Language.TURKISH, _adapter, formulation=formulation),
         hf_repo="AYueksel/TurkishMMLU",
         hf_subset=subset,
         evaluation_splits=("test",),
         few_shots_split="dev",
-        metrics=get_metrics_for_formulation(
-            formulation,
-            [
-                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
-                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
-                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
-            ],
-        ),
+        generation_size=gen_size,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=1,
     )
-    for subset in TURKISH_MMLU_SUBSET
-    for formulation in [
-        MCFFormulation(),
-        CFFormulation(),
-        HybridFormulation(),
+    for subset in TURKISH_MMLU_SUBSETS
+    for suffix, formulation, metrics, gen_size in [
+        ("cf",     CFFormulation(),  _CF_METRICS,             -1),
+        ("mcf",    MCFFormulation(), _MCF_METRICS,            -1),
+        ("mcf_em", MCFFormulation(), [Metrics.exact_match],    1),
     ]
 ]
diff --git a/src/lighteval/tasks/tasks/boolq.py b/src/lighteval/tasks/tasks/boolq.py
index f927f9491..115915cd6 100644
--- a/src/lighteval/tasks/tasks/boolq.py
+++ b/src/lighteval/tasks/tasks/boolq.py
@@ -18,102 +18,94 @@
 https://arxiv.org/abs/1905.11946
 """
 
-from string import ascii_uppercase
-
-from inspect_ai.dataset import Sample
-from inspect_ai.scorer import choice
-from inspect_ai.solver import multiple_choice
-
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
 from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _question(line) -> str:
+    """Return the question text, dropping one trailing "?" when the text ends in "??"."""
+    return line["question"][:-1] if line["question"][-2:] == "??" else line["question"]
 
-def boolq_prompt(line, task_name: str = None):
-    question = line["question"][:-1] if line["question"][-2:] == "??" else line["question"]
+
+def _gold(line) -> int:
+    return ("Yes", "No").index(line["answer"])  # ValueError on an unknown label instead of silently scoring it as "No"
+
+
+def boolq_cf_prompt(line, task_name: str = None):
+    """CF variant: score full answer text via logprobs."""
     return Doc(
         task_name=task_name,
-        query=f"Passage: {line['passage']}\nQuestion: {question}\nAnswer:",
+        query=f"Passage: {line['passage']}\nQuestion: {_question(line)}\nAnswer:",
         choices=[" Yes", " No"],
-        gold_index=["Yes", "No"].index(line["answer"]),
+        gold_index=_gold(line),
     )
 
 
-def boolq_contrastset_prompt(line, task_name: str = None):
-    if line["contrast_inputs"] in [None, ""]:
-        return boolq_prompt(line)
-
-    return [
-        Doc(
-            task_name=task_name,
-            query=f"{passage}\nQuestion: {question}\nAnswer:",
-            choices=["Yes", "No"],
-            gold_index=["No", "Yes"].index(line["answer"]),
-        )
-        for passage, question in zip(line["contrast_inputs"]["passage"], line["contrast_inputs"]["question"])
-    ][0]
-
-
-def record_to_sample(record):
-    choices = ["Yes", "No"]
-    query = f"{record['passage']}\n{record['question']}"
-    target = ascii_uppercase[choices.index(record["answer"])]
-    return Sample(input=query, target=target, choices=choices)
-
-
-def record_to_sample_contrastset(record):
-    if record["contrast_inputs"] in [None, ""]:
-        return record_to_sample(record)
-
-    choices = ["Yes", "No"]
-    query = f"{record['contrast_inputs']['passage']}\n{record['contrast_inputs']['question']}"
-    target = ascii_uppercase[choices.index(record["answer"])]
-
-    return Sample(input=query, target=target, choices=choices)
-
-
-boolq = LightevalTaskConfig(
-    name="boolq",
-    prompt_function=boolq_prompt,
-    hf_repo="lighteval/boolq_helm",
-    hf_subset="default",
-    hf_avail_splits=["train", "validation"],
-    evaluation_splits=["validation"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=5,
-    metrics=[
-        Metrics.exact_match,
-    ],
-    stop_sequence=["\n"],
-    version=0,
-    sample_fields=record_to_sample,
-    solver=[multiple_choice(cache=True)],
-    scorer=choice(),
-)
-
-
-boolq_contrastset = LightevalTaskConfig(
-    name="boolq:contrastset",
-    prompt_function=boolq_contrastset_prompt,
-    hf_repo="lighteval/boolq_helm",
-    hf_subset="default",
-    hf_avail_splits=["validation"],
-    evaluation_splits=["validation"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=5,
-    metrics=[
-        Metrics.exact_match,
-    ],
-    stop_sequence=["\n"],
-    version=0,
-    sample_fields=record_to_sample_contrastset,
-    solver=[multiple_choice(cache=True)],
-    scorer=choice(),
-)
+def boolq_mcf_prompt(line, task_name: str = None):
+    """MCF variant: "Yes"/"No" are listed as options A/B and the " A"/" B" labels are the choices."""
+    passage = line["passage"]
+    prompt = f"Passage: {passage}\nQuestion: {_question(line)}\n A. Yes\n B. No\nAnswer:"
+    labels = [" A", " B"]
+    return Doc(
+        task_name=task_name, query=prompt, choices=labels, gold_index=_gold(line)
+    )
+
 
 TASKS_TABLE = [
-    boolq,
-    boolq_contrastset,
+    LightevalTaskConfig(
+        name="boolq:cf",
+        prompt_function=boolq_cf_prompt,
+        hf_repo="lighteval/boolq_helm",
+        hf_subset="default",
+        hf_avail_splits=["train", "validation"],
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        few_shots_select="random_sampling_from_train",
+        generation_size=-1,
+        metrics=_CF_METRICS,
+        stop_sequence=["\n"],
+        version=1,
+    ),
+    LightevalTaskConfig(
+        name="boolq:mcf",
+        prompt_function=boolq_mcf_prompt,
+        hf_repo="lighteval/boolq_helm",
+        hf_subset="default",
+        hf_avail_splits=["train", "validation"],
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        few_shots_select="random_sampling_from_train",
+        generation_size=-1,
+        metrics=_MCF_METRICS,
+        stop_sequence=["\n"],
+        version=1,
+    ),
+    LightevalTaskConfig(
+        name="boolq:mcf_em",
+        prompt_function=boolq_mcf_prompt,
+        hf_repo="lighteval/boolq_helm",
+        hf_subset="default",
+        hf_avail_splits=["train", "validation"],
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        few_shots_select="random_sampling_from_train",
+        generation_size=1,
+        metrics=[Metrics.exact_match],
+        stop_sequence=["\n"],
+        version=1,
+    ),
 ]
diff --git a/src/lighteval/tasks/tasks/chembench.py b/src/lighteval/tasks/tasks/chembench.py
new file mode 100644
index 000000000..340b70761
--- /dev/null
+++ b/src/lighteval/tasks/tasks/chembench.py
@@ -0,0 +1,78 @@
+"""
+name:
+ChemBench Chemistry QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/chembench.parquet)
+
+abstract:
+Chemistry multiple-choice benchmark covering analytical, organic, and physical
+chemistry. 2,542 questions from TitanEval-MCQ, 0-shot.
+
+languages:
+english
+
+tags:
+chemistry, multiple-choice, qa, science
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def chembench_cf_prompt(line, task_name: str = None):
+    """CF prompt: the bare question, with every answer text (space-prefixed) as a candidate continuation."""
+    prompt = f"Question: {line['question']}\nAnswer:"
+    continuations = [" " + text for text in line["choices"]]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=line["answer_index"]
+    )
+
+
+def chembench_mcf_prompt(line, task_name: str = None):
+    """MCF prompt: answers are listed under letter labels and the " <label>" strings are the choices."""
+    answers = line["choices"]
+    letters = list("ABCDEFGHIJ"[: len(answers)])
+    listing = "\n".join(f"{letter}. {answer}" for letter, answer in zip(letters, answers))
+    prompt = f"Question: {line['question']}\n{listing}\nAnswer:"
+    continuations = [f" {letter}" for letter in letters]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=line["answer_index"]
+    )
+
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"chembench:{suffix}",
+        prompt_function=fn,
+        hf_repo="titaneval_local",
+        hf_subset="chembench",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # few-shot examples come from the same split that is evaluated
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, fn, metrics, gen in [  # one config per suffix; mcf and mcf_em share a prompt function
+        ("cf",     chembench_cf_prompt,  _CF_METRICS,          -1),
+        ("mcf",    chembench_mcf_prompt, _MCF_METRICS,         -1),
+        ("mcf_em", chembench_mcf_prompt, [Metrics.exact_match],  1),
+    ]
+]
diff --git a/src/lighteval/tasks/tasks/coqa.py b/src/lighteval/tasks/tasks/coqa.py
index f1bd3bad6..e1748ea73 100644
--- a/src/lighteval/tasks/tasks/coqa.py
+++ b/src/lighteval/tasks/tasks/coqa.py
@@ -122,7 +122,7 @@ def coqa_bpb_prompt(line, task_name: str = None):
         few_shots_select=None,
         generation_size=50,
         stop_sequence=["\n\n"],
-        metrics=[Metrics.f1_score, Metrics.exact_match],
+        metrics=[Metrics.qa_f1, Metrics.qa_em],
         version=1,
     ),
 ]
diff --git a/src/lighteval/tasks/tasks/cybermetric.py b/src/lighteval/tasks/tasks/cybermetric.py
new file mode 100644
index 000000000..5bec7d71c
--- /dev/null
+++ b/src/lighteval/tasks/tasks/cybermetric.py
@@ -0,0 +1,204 @@
+"""
+name:
+CyberMetric + SecQA
+
+datasets:
+tihanyin/CyberMetric
+zefang-liu/secqa (secqa_v1, secqa_v2)
+
+abstract:
+Cybersecurity multiple-choice benchmarks. CyberMetric covers cybersecurity
+concepts. SecQA (Security QA) consists of expert-written 4-choice security
+questions in two versions of increasing difficulty.
+
+languages:
+english
+
+tags:
+cybersecurity, multiple-choice, qa
+
+paper:
+https://arxiv.org/abs/2411.02228 (CyberMetric)
+"""
+
+import ast
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+# ---- CyberMetric ----
+
+def _cybermetric_parse(line):
+    """Unpack the nested "questions" record into (question, choices, gold index)."""
+    record = line["questions"]
+    if isinstance(record, str):
+        record = ast.literal_eval(record)  # the record may arrive as a Python-literal string
+    answers = record["answers"]
+    labels = sorted(answers)
+    solution = record.get("correct_solution") or record.get("solution")
+    gold = labels.index(solution)
+    return record["question"], [answers[label] for label in labels], gold
+
+
+def cybermetric_cf_prompt(line, task_name: str = None):
+    """CF prompt: the parsed question, with every answer text (space-prefixed) as a candidate continuation."""
+    question, answers, gold = _cybermetric_parse(line)
+    prompt = f"Question: {question}\nAnswer:"
+    continuations = [" " + answer for answer in answers]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=gold
+    )
+
+
+def cybermetric_mcf_prompt(line, task_name: str = None):
+    """MCF prompt: parsed answers are listed under A-D labels and the " <label>" strings are the choices."""
+    question, answers, gold = _cybermetric_parse(line)
+    letters = list("ABCD")[: len(answers)]
+    listing = "\n".join(f" {letter}. {answer}" for letter, answer in zip(letters, answers))
+    prompt = f"Question: {question}\n{listing}\nAnswer:"
+    continuations = [f" {letter}" for letter in letters]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=gold
+    )
+
+
+# ---- SecQA ----
+
+def secqa_cf_prompt(line, task_name: str = None):
+    """CF prompt: the question, with the A-D column texts (space-prefixed) as candidate continuations."""
+    answers = [line[letter] for letter in "ABCD"]
+    prompt = f"Question: {line['Question']}\nAnswer:"
+    continuations = [" " + text for text in answers]
+    # The "Answer" column is looked up among the letters A-D to get the gold index.
+    gold = list("ABCD").index(line["Answer"])
+    return Doc(task_name=task_name, query=prompt, choices=continuations, gold_index=gold)
+
+
+def secqa_mcf_prompt(line, task_name: str = None):
+    """MCF prompt: the A-D column texts are listed under their letters and " A".." D" are the choices."""
+    answers = [line[letter] for letter in "ABCD"]
+    listing = "\n".join(f" {letter}. {text}" for letter, text in zip("ABCD", answers))
+    prompt = f"Question: {line['Question']}\n{listing}\nAnswer:"
+    gold = list("ABCD").index(line["Answer"])
+    return Doc(
+        task_name=task_name, query=prompt, choices=[" A", " B", " C", " D"], gold_index=gold
+    )
+
+
+def _cybermetric_configs() -> list:  # NOTE: yields the same task names as _cybermetric_titaneval_configs()
+    return [
+        LightevalTaskConfig(
+            name=f"cybermetric:{suffix}",
+            prompt_function=fn,
+            hf_repo="tihanyin/CyberMetric",
+            hf_subset="default",
+            hf_avail_splits=["train"],
+            evaluation_splits=["train"],
+            few_shots_split="train",  # few-shot examples come from the same split that is evaluated
+            few_shots_select="random_sampling",
+            generation_size=gen,
+            metrics=metrics,
+            stop_sequence=["\n"],
+            version=0,
+        )
+        for suffix, fn, metrics, gen in [  # one config per suffix; mcf and mcf_em share a prompt function
+            ("cf",     cybermetric_cf_prompt,  _CF_METRICS,             -1),
+            ("mcf",    cybermetric_mcf_prompt, _MCF_METRICS,            -1),
+            ("mcf_em", cybermetric_mcf_prompt, [Metrics.exact_match],    1),
+        ]
+    ]
+
+
+def _secqa_configs(version: str) -> list:  # version is spliced into both the task name and the HF subset
+    return [
+        LightevalTaskConfig(
+            name=f"secqa:{version}:{suffix}",
+            prompt_function=fn,
+            hf_repo="zefang-liu/secqa",
+            hf_subset=f"secqa_{version}",
+            hf_avail_splits=["test"],
+            evaluation_splits=["test"],
+            few_shots_split="test",  # few-shot examples come from the same split that is evaluated
+            few_shots_select="random_sampling",
+            generation_size=gen,
+            metrics=metrics,
+            stop_sequence=["\n"],
+            version=0,
+        )
+        for suffix, fn, metrics, gen in [  # one config per suffix; mcf and mcf_em share a prompt function
+            ("cf",     secqa_cf_prompt,  _CF_METRICS,             -1),
+            ("mcf",    secqa_mcf_prompt, _MCF_METRICS,            -1),
+            ("mcf_em", secqa_mcf_prompt, [Metrics.exact_match],    1),
+        ]
+    ]
+
+
+# ── cybermetric: titaneval_local (cached parquet, pre-flattened) ──
+
+def _cybermetric_titaneval_cf_prompt(line, task_name: str = None):
+    """CF prompt over the flattened row: every answer text (space-prefixed) is a candidate continuation."""
+    prompt = f"Question: {line['question']}\nAnswer:"
+    continuations = [" " + text for text in line["choices"]]
+    gold = line["answer_index"]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=gold
+    )
+
+
+def _cybermetric_titaneval_mcf_prompt(line, task_name: str = None):
+    """MCF prompt over the flattened row: answers are listed under A-D labels; " <label>" are the choices."""
+    answers = line["choices"]
+    letters = list("ABCD")[: len(answers)]
+    listing = "\n".join(f" {letter}. {answer}" for letter, answer in zip(letters, answers))
+    prompt = f"Question: {line['question']}\n{listing}\nAnswer:"
+    continuations = [f" {letter}" for letter in letters]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=line["answer_index"]
+    )
+
+
+def _cybermetric_titaneval_configs() -> list:  # NOTE: yields the same task names as _cybermetric_configs()
+    return [
+        LightevalTaskConfig(
+            name=f"cybermetric:{suffix}",
+            prompt_function=fn,
+            hf_repo="titaneval_local",
+            hf_subset="cybermetric",
+            hf_avail_splits=["test"],
+            evaluation_splits=["test"],
+            few_shots_split="test",  # few-shot examples come from the same split that is evaluated
+            few_shots_select="random_sampling",
+            generation_size=gen,
+            metrics=metrics,
+            stop_sequence=["\n"],
+            version=1,
+        )
+        for suffix, fn, metrics, gen in [  # one config per suffix; mcf and mcf_em share a prompt function
+            ("cf",     _cybermetric_titaneval_cf_prompt,  _CF_METRICS,             -1),
+            ("mcf",    _cybermetric_titaneval_mcf_prompt, _MCF_METRICS,            -1),
+            ("mcf_em", _cybermetric_titaneval_mcf_prompt, [Metrics.exact_match],    1),
+        ]
+    ]
+
+
+TASKS_TABLE = (
+    # _cybermetric_configs() is left out: it yields the same "cybermetric:*" names as the titaneval configs.
+    _secqa_configs("v1")
+    + _secqa_configs("v2")
+    + _cybermetric_titaneval_configs()
+)
diff --git a/src/lighteval/tasks/tasks/esgenius.py b/src/lighteval/tasks/tasks/esgenius.py
new file mode 100644
index 000000000..8d4d5a0e0
--- /dev/null
+++ b/src/lighteval/tasks/tasks/esgenius.py
@@ -0,0 +1,78 @@
+"""
+name:
+ESGenius ESG/Sustainability QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/esgenius.parquet)
+
+abstract:
+ESG (Environmental, Social, Governance) and sustainability multiple-choice
+benchmark. 1,136 questions from TitanEval-MCQ, 0-shot.
+
+languages:
+english
+
+tags:
+esg, sustainability, multiple-choice, qa
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def esgenius_cf_prompt(line, task_name: str = None):
+    """CF prompt: the bare question, with every answer text (space-prefixed) as a candidate continuation."""
+    prompt = f"Question: {line['question']}\nAnswer:"
+    continuations = [" " + text for text in line["choices"]]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=line["answer_index"]
+    )
+
+
+def esgenius_mcf_prompt(line, task_name: str = None):
+    """MCF prompt: answers are listed under letter labels and the " <label>" strings are the choices."""
+    answers = line["choices"]
+    letters = list("ABCDEFGHIJ"[: len(answers)])
+    listing = "\n".join(f"{letter}. {answer}" for letter, answer in zip(letters, answers))
+    prompt = f"Question: {line['question']}\n{listing}\nAnswer:"
+    continuations = [f" {letter}" for letter in letters]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=line["answer_index"]
+    )
+
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"esgenius:{suffix}",
+        prompt_function=fn,
+        hf_repo="titaneval_local",
+        hf_subset="esgenius",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # few-shot examples come from the same split that is evaluated
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, fn, metrics, gen in [  # one config per suffix; mcf and mcf_em share a prompt function
+        ("cf",     esgenius_cf_prompt,  _CF_METRICS,          -1),
+        ("mcf",    esgenius_mcf_prompt, _MCF_METRICS,         -1),
+        ("mcf_em", esgenius_mcf_prompt, [Metrics.exact_match],  1),
+    ]
+]
diff --git a/src/lighteval/tasks/tasks/formationeval.py b/src/lighteval/tasks/tasks/formationeval.py
new file mode 100644
index 000000000..a760c6c98
--- /dev/null
+++ b/src/lighteval/tasks/tasks/formationeval.py
@@ -0,0 +1,85 @@
+"""
+name:
+FormationEval
+
+dataset:
+AlmazErmilov/FormationEval
+
+abstract:
+FormationEval is a multiple-choice benchmark for formation evaluation and
+petroleum engineering knowledge.
+
+languages:
+english
+
+tags:
+multiple-choice, petroleum, qa
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+_LABELS = list("ABCDE")
+
+
+def formationeval_cf_prompt(line, task_name: str = None):
+    """CF prompt: every answer text (space-prefixed) is a candidate continuation of the question."""
+    import ast
+    raw = line["choices"]
+    answers = ast.literal_eval(raw) if isinstance(raw, str) else raw  # choices may be a literal string
+    prompt = f"Question: {line['question']}\nAnswer:"
+    continuations = [" " + text for text in answers]
+    gold = int(line["answer_index"])
+    return Doc(task_name=task_name, query=prompt, choices=continuations, gold_index=gold)
+
+
+def formationeval_mcf_prompt(line, task_name: str = None):
+    """MCF prompt: answers are listed under labels from _LABELS and the " <label>" strings are the choices."""
+    import ast
+    raw = line["choices"]
+    answers = ast.literal_eval(raw) if isinstance(raw, str) else raw  # choices may be a literal string
+    letters = _LABELS[: len(answers)]
+    listing = "\n".join(f" {letter}. {answer}" for letter, answer in zip(letters, answers))
+    prompt = f"Question: {line['question']}\n{listing}\nAnswer:"
+    continuations = [f" {letter}" for letter in letters]
+    gold = int(line["answer_index"])
+    return Doc(task_name=task_name, query=prompt, choices=continuations, gold_index=gold)
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"formationeval:{suffix}",
+        prompt_function=fn,
+        hf_repo="AlmazErmilov/FormationEval",
+        hf_subset="default",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # few-shot examples come from the same split that is evaluated
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, fn, metrics, gen in [  # one config per suffix; mcf and mcf_em share a prompt function
+        ("cf",     formationeval_cf_prompt,  _CF_METRICS,             -1),
+        ("mcf",    formationeval_mcf_prompt, _MCF_METRICS,            -1),
+        ("mcf_em", formationeval_mcf_prompt, [Metrics.exact_match],    1),
+    ]
+]
diff --git a/src/lighteval/tasks/tasks/geobench.py b/src/lighteval/tasks/tasks/geobench.py
new file mode 100644
index 000000000..75d8358b3
--- /dev/null
+++ b/src/lighteval/tasks/tasks/geobench.py
@@ -0,0 +1,78 @@
+"""
+name:
+GeoBench Geoscience QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/geobench.parquet)
+
+abstract:
+Geoscience multiple-choice benchmark. 1,390 questions from TitanEval-MCQ
+(originally GeoBench-VLM, text-only subset used here), 0-shot.
+
+languages:
+english
+
+tags:
+geoscience, multiple-choice, qa, science
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def geobench_cf_prompt(line, task_name: str = None):
+    """CF prompt: the bare question, with every answer text (space-prefixed) as a candidate continuation."""
+    prompt = f"Question: {line['question']}\nAnswer:"
+    continuations = [" " + text for text in line["choices"]]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=line["answer_index"]
+    )
+
+
+def geobench_mcf_prompt(line, task_name: str = None):
+    """MCF prompt: answers are listed under letter labels and the " <label>" strings are the choices."""
+    answers = line["choices"]
+    letters = list("ABCDEFGHIJ"[: len(answers)])
+    listing = "\n".join(f"{letter}. {answer}" for letter, answer in zip(letters, answers))
+    prompt = f"Question: {line['question']}\n{listing}\nAnswer:"
+    continuations = [f" {letter}" for letter in letters]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=line["answer_index"]
+    )
+
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"geobench:{suffix}",
+        prompt_function=fn,
+        hf_repo="titaneval_local",
+        hf_subset="geobench",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # few-shot examples come from the same split that is evaluated
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, fn, metrics, gen in [  # one config per suffix; mcf and mcf_em share a prompt function
+        ("cf",     geobench_cf_prompt,  _CF_METRICS,          -1),
+        ("mcf",    geobench_mcf_prompt, _MCF_METRICS,         -1),
+        ("mcf_em", geobench_mcf_prompt, [Metrics.exact_match],  1),
+    ]
+]
diff --git a/src/lighteval/tasks/tasks/gpqa.py b/src/lighteval/tasks/tasks/gpqa.py
index 88a31c97c..31b955c66 100644
--- a/src/lighteval/tasks/tasks/gpqa.py
+++ b/src/lighteval/tasks/tasks/gpqa.py
@@ -23,6 +23,7 @@
 https://arxiv.org/abs/2311.12022
 """
 
+import hashlib
 import random
 from string import ascii_uppercase
 
@@ -30,10 +31,38 @@
 from inspect_ai.scorer import choice
 from inspect_ai.solver import multiple_choice
 
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
 from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _stable_choices(line):
+    """Shuffle the four answers with an RNG seeded from the question's MD5; return (answers, gold index)."""
+    correct = line["Correct Answer"]
+    answers = [
+        correct,
+        line["Incorrect Answer 1"],
+        line["Incorrect Answer 2"],
+        line["Incorrect Answer 3"],
+    ]
+    digest = hashlib.md5(line["Question"].encode()).hexdigest()
+    # The digest is reduced modulo 2**32 and used as the seed, so the order is fixed per question text.
+    random.Random(int(digest, 16) % (2**32)).shuffle(answers)
+    return answers, answers.index(correct)
+
 
 def record_to_sample(record):
     gold_index = random.randint(0, 3)
@@ -178,9 +207,74 @@ def gpqa_instruct_prompt(line, task_name: str = None):
     version=0,
 )
 
+def gpqa_cf_prompt(line, task_name: str = None):
+    """CF variant: the deterministically shuffled answer texts (space-prefixed) are the choices."""
+    answers, gold = _stable_choices(line)
+    question = line["Question"].strip()
+    prompt = f"Question: {question}\nAnswer:"
+    continuations = [" " + answer for answer in answers]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=gold
+    )
+
+
+def gpqa_mcf_prompt(line, task_name: str = None):
+    """MCF variant: the shuffled answers are listed under A/B/C/D and the " <label>" strings are the choices."""
+    answers, gold = _stable_choices(line)
+    listing = "\n".join(f" {letter}. {answer}" for letter, answer in zip(ascii_uppercase, answers))
+    question = line["Question"].strip()
+    prompt = f"Question: {question}\n{listing}\nAnswer:"
+    continuations = [f" {letter}" for letter in ascii_uppercase[:4]]
+    return Doc(
+        task_name=task_name, query=prompt, choices=continuations, gold_index=gold
+    )
+
+
+_GPQA_COMMON = dict(  # keyword arguments shared by the gpqa:diamond cf/mcf/mcf_em configs
+    hf_repo="Idavidrein/gpqa",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split="train",  # few-shot examples come from the same split that is evaluated
+    few_shots_select="random_sampling",
+    stop_sequence=["\n"],
+)
+
+gpqa_diamond_cf = LightevalTaskConfig(
+    name="gpqa:diamond:cf",
+    prompt_function=gpqa_cf_prompt,
+    hf_subset="gpqa_diamond",
+    generation_size=-1,
+    metrics=_CF_METRICS,
+    version=0,
+    **_GPQA_COMMON,
+)
+
+gpqa_diamond_mcf = LightevalTaskConfig(
+    name="gpqa:diamond:mcf",
+    prompt_function=gpqa_mcf_prompt,
+    hf_subset="gpqa_diamond",
+    generation_size=-1,
+    metrics=_MCF_METRICS,
+    version=0,
+    **_GPQA_COMMON,
+)
+
+gpqa_diamond_mcf_em = LightevalTaskConfig(
+    name="gpqa:diamond:mcf_em",
+    prompt_function=gpqa_mcf_prompt,
+    hf_subset="gpqa_diamond",
+    generation_size=1,
+    metrics=[Metrics.exact_match],
+    version=0,
+    **_GPQA_COMMON,
+)
+
 TASKS_TABLE = [
     gpqa,
     gpqa_diamond_instruct,
     gpqa_extended_instruct,
     gpqa_main_instruct,
+    gpqa_diamond_cf,
+    gpqa_diamond_mcf,
+    gpqa_diamond_mcf_em,
 ]
diff --git a/src/lighteval/tasks/tasks/gsm_plus.py b/src/lighteval/tasks/tasks/gsm_plus.py
index 27a1dca27..ddb958e23 100644
--- a/src/lighteval/tasks/tasks/gsm_plus.py
+++ b/src/lighteval/tasks/tasks/gsm_plus.py
@@ -76,8 +76,8 @@ def gsm_plus_prompt(line, task_name: str = None):
     hf_subset="default",
     hf_avail_splits=["test", "testmini"],
     evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
+    few_shots_split="testmini",
+    few_shots_select="random_sampling",
     generation_size=512,
     metrics=[Metrics.expr_gold_metric],
     stop_sequence=["Question:", "\n####", "####"],
diff --git a/src/lighteval/tasks/tasks/gsm_symbolic.py b/src/lighteval/tasks/tasks/gsm_symbolic.py
index 464f51e11..e3c3595d8 100644
--- a/src/lighteval/tasks/tasks/gsm_symbolic.py
+++ b/src/lighteval/tasks/tasks/gsm_symbolic.py
@@ -66,7 +66,7 @@ def gsm_symbolic_prompt(line, task_name: str = None):
         hf_avail_splits=["test"],
         evaluation_splits=["test"],
         few_shots_split="test",
-        few_shots_select="random_sampling_from_train",
+        few_shots_select="random_sampling",
         generation_size=512,
         metrics=[Metrics.expr_gold_metric],
         stop_sequence=["Problem:", "\n####", "####"],
diff --git a/src/lighteval/tasks/tasks/headqa.py b/src/lighteval/tasks/tasks/headqa.py
index 85c025c5f..83f99bd8a 100644
--- a/src/lighteval/tasks/tasks/headqa.py
+++ b/src/lighteval/tasks/tasks/headqa.py
@@ -22,12 +22,26 @@
 https://arxiv.org/abs/1906.04701
 """
 
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
 from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
 
-def headqa_prompt(line, task_name: str = None):
+def headqa_cf_prompt(line, task_name: str = None):
+    """CF variant: score full answer texts via logprobs."""
     return Doc(
         task_name=task_name,
         query=f"Question: {line['qtext']}\nAnswer:",
@@ -36,42 +50,146 @@ def headqa_prompt(line, task_name: str = None):
     )
 
 
-headqa_en = LightevalTaskConfig(
-    name="headqa:en",
-    prompt_function=headqa_prompt,
-    hf_repo="lighteval/headqa_harness",
-    hf_subset="en",
-    hf_avail_splits=["train", "test", "validation"],
+def headqa_mcf_prompt(line, task_name: str = None):
+    """MCF variant: labeled options, score label tokens via logprobs."""
+    labels = list("ABCDE")[: len(line["answers"])]
+    options = "\n".join(f" {l}. {a['atext']}" for l, a in zip(labels, line["answers"]))
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['qtext']}\n{options}\nAnswer:",
+        choices=[f" {l}" for l in labels],
+        gold_index=int(line["ra"]) - 1,
+    )
+
+
+def _configs(lang: str):  # NOTE(review): builds the :cf/:mcf/:mcf_em configs for one hf_subset, but is not referenced by the TASKS_TABLE in this hunk — confirm whether these tasks should be registered
+    return [
+        LightevalTaskConfig(
+            name=f"headqa:{lang}:cf",
+            prompt_function=headqa_cf_prompt,
+            hf_repo="lighteval/headqa_harness",
+            hf_subset=lang,
+            hf_avail_splits=["train", "test", "validation"],
+            evaluation_splits=["test"],
+            few_shots_split="train",
+            few_shots_select="random_sampling_from_train",
+            generation_size=-1,
+            metrics=_CF_METRICS,
+            stop_sequence=["\n"],
+            version=1,
+        ),
+        LightevalTaskConfig(
+            name=f"headqa:{lang}:mcf",
+            prompt_function=headqa_mcf_prompt,
+            hf_repo="lighteval/headqa_harness",
+            hf_subset=lang,
+            hf_avail_splits=["train", "test", "validation"],
+            evaluation_splits=["test"],
+            few_shots_split="train",
+            few_shots_select="random_sampling_from_train",
+            generation_size=-1,
+            metrics=_MCF_METRICS,
+            stop_sequence=["\n"],
+            version=1,
+        ),
+        LightevalTaskConfig(
+            name=f"headqa:{lang}:mcf_em",
+            prompt_function=headqa_mcf_prompt,
+            hf_repo="lighteval/headqa_harness",
+            hf_subset=lang,
+            hf_avail_splits=["train", "test", "validation"],
+            evaluation_splits=["test"],
+            few_shots_split="train",
+            few_shots_select="random_sampling_from_train",
+            generation_size=1,  # single generated token, compared by exact match
+            metrics=[Metrics.exact_match],
+            stop_sequence=["\n"],
+            version=1,
+        ),
+    ]
+
+
+# ── headqa: titaneval_local (cached parquet, en+es combined) ──
+
+def _headqa_en_filter(row) -> bool:
+    """Keep a row only when its 'domain' value ends with '/en'; a missing or empty domain is rejected."""
+    tag = row.get("domain")
+    return tag.endswith("/en") if tag else False
+
+
+def _headqa_titaneval_cf_prompt(line, task_name: str = None):
+    """CF variant: completion-style, logprob on full answer text."""
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\nAnswer:",
+        choices=[" " + c for c in line["choices"]],
+        gold_index=line["answer_index"],
+    )
+
+
+def _headqa_titaneval_mcf_prompt(line, task_name: str = None):
+    """MCF variant: labeled options, score label tokens via logprobs."""
+    labels = list("ABCDE")[: len(line["choices"])]
+    options = "\n".join(f" {l}. {c}" for l, c in zip(labels, line["choices"]))
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\n{options}\nAnswer:",
+        choices=[f" {l}" for l in labels],
+        gold_index=line["answer_index"],
+    )
+
+
+headqa_titaneval_cf = LightevalTaskConfig(
+    name="headqa:cf",
+    prompt_function=_headqa_titaneval_cf_prompt,
+    hf_repo="titaneval_local",
+    hf_subset="headqa",
+    hf_avail_splits=["test"],
     evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
+    few_shots_split="test",
+    few_shots_select="random_sampling",
+    hf_filter=_headqa_en_filter,
     generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-    ],
+    metrics=_CF_METRICS,
     stop_sequence=["\n"],
-    version=0,
+    version=1,
 )
 
-
-headqa_es = LightevalTaskConfig(
-    name="headqa:es",
-    prompt_function=headqa_prompt,
-    hf_repo="lighteval/headqa_harness",
-    hf_subset="es",
-    hf_avail_splits=["train", "test", "validation"],
+headqa_titaneval_mcf = LightevalTaskConfig(
+    name="headqa:mcf",
+    prompt_function=_headqa_titaneval_mcf_prompt,
+    hf_repo="titaneval_local",
+    hf_subset="headqa",
+    hf_avail_splits=["test"],
     evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
+    few_shots_split="test",
+    few_shots_select="random_sampling",
+    hf_filter=_headqa_en_filter,
     generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-    ],
+    metrics=_MCF_METRICS,
     stop_sequence=["\n"],
-    version=0,
+    version=1,
 )
 
+headqa_titaneval_mcf_em = LightevalTaskConfig(
+    name="headqa:mcf_em",
+    prompt_function=_headqa_titaneval_mcf_prompt,
+    hf_repo="titaneval_local",
+    hf_subset="headqa",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split="test",
+    few_shots_select="random_sampling",
+    hf_filter=_headqa_en_filter,
+    generation_size=1,
+    metrics=[Metrics.exact_match],
+    stop_sequence=["\n"],
+    version=1,
+)
+
+
 TASKS_TABLE = [
-    headqa_en,
-    headqa_es,
+    headqa_titaneval_cf,
+    headqa_titaneval_mcf,
+    headqa_titaneval_mcf_em,
 ]
diff --git a/src/lighteval/tasks/tasks/jeopardy.py b/src/lighteval/tasks/tasks/jeopardy.py
index 67cf40360..21d7ba996 100644
--- a/src/lighteval/tasks/tasks/jeopardy.py
+++ b/src/lighteval/tasks/tasks/jeopardy.py
@@ -125,7 +125,7 @@ def jeopardy_mc_mcf_prompt(line, task_name: str = None):
         few_shots_select="random_sampling_from_train",
         generation_size=50,
         stop_sequence=["\n\n", "Question:", "Category:"],
-        metrics=[Metrics.f1_score, Metrics.exact_match],
+        metrics=[Metrics.qa_f1, Metrics.qa_em],
         version=0,
     ),
     LightevalTaskConfig(
diff --git a/src/lighteval/tasks/tasks/labbench.py b/src/lighteval/tasks/tasks/labbench.py
new file mode 100644
index 000000000..f37be5480
--- /dev/null
+++ b/src/lighteval/tasks/tasks/labbench.py
@@ -0,0 +1,99 @@
+"""
+name:
+LAB-Bench (TableQA)
+
+dataset:
+futurehouse/LAB-Bench (TableQA config)
+
+abstract:
+LAB-Bench is a biology laboratory capabilities benchmark. The TableQA subset
+tests reading comprehension of scientific tables with 4-choice MCQ. Note: the
+original tables are provided as images; this text-only variant evaluates
+without visual context.
+
+languages:
+english
+
+tags:
+biology, multiple-choice, qa, science
+
+paper:
+https://arxiv.org/abs/2407.10362
+"""
+
+import ast
+import hashlib
+import random
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _stable_choices(line):
+    """Return (options, gold_index): the ideal answer plus distractors, shuffled with a seed hashed from the question."""
+    correct, wrong = line["ideal"], line["distractors"]
+    if isinstance(wrong, str):
+        wrong = ast.literal_eval(wrong)  # distractors may arrive serialized as a Python literal
+    options = [correct, *wrong]
+    digest = hashlib.md5(line["question"].encode()).hexdigest()
+    random.Random(int(digest, 16) % (2**32)).shuffle(options)  # same question -> same order on every run
+    return options, options.index(correct)
+
+
+def labbench_cf_prompt(line, task_name: str = None):
+    choices, gold = _stable_choices(line)
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\nAnswer:",
+        choices=[" " + c for c in choices],
+        gold_index=gold,
+    )
+
+
+def labbench_mcf_prompt(line, task_name: str = None):  # MCF variant: lettered options in the query, label tokens as the scored choices
+    choices, gold = _stable_choices(line)
+    labels = list("ABCD")[: len(choices)]  # NOTE(review): at most four labels; with more than 3 distractors zip() drops options and gold may exceed the labels — confirm TableQA rows always have 3
+    options = "\n".join(f" {l}. {c}" for l, c in zip(labels, choices))
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\n{options}\nAnswer:",
+        choices=[f" {l}" for l in labels],
+        gold_index=gold,
+    )
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"labbench:{suffix}",
+        prompt_function=fn,
+        hf_repo="futurehouse/LAB-Bench",
+        hf_subset="TableQA",
+        hf_avail_splits=["train"],
+        evaluation_splits=["train"],
+        few_shots_split="train",
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, fn, metrics, gen in [
+        ("cf",     labbench_cf_prompt,  _CF_METRICS,             -1),
+        ("mcf",    labbench_mcf_prompt, _MCF_METRICS,            -1),
+        ("mcf_em", labbench_mcf_prompt, [Metrics.exact_match],    1),
+    ]
+]
diff --git a/src/lighteval/tasks/tasks/mascqa.py b/src/lighteval/tasks/tasks/mascqa.py
new file mode 100644
index 000000000..c122c05d5
--- /dev/null
+++ b/src/lighteval/tasks/tasks/mascqa.py
@@ -0,0 +1,98 @@
+"""
+name:
+MaScQA
+
+dataset:
+heegyu/mascqa
+
+abstract:
+MaScQA (Materials Science Question Answering) is a multiple-choice benchmark
+for materials science knowledge, sourced from standardized exams. Answer choices
+are embedded in the question text in (A) / (B) / (C) / (D) format.
+
+languages:
+english
+
+tags:
+materials-science, multiple-choice, qa, science
+
+paper:
+https://arxiv.org/abs/2308.09115
+"""
+
+import re
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+_LABELS = list("ABCDE")
+
+
+def _parse(line):
+    """Split 'stem (A) c1 (B) c2 ...' into (question, choices, labels, gold); gold falls back to 0 when the row's label was not parsed."""
+    text = line["questions"].strip()
+    m = re.search(r'\s*\(A\)', text)
+    question = text[: m.start()].strip() if m else text
+    raw = re.findall(r'\(([A-E])\)\s*(.*?)(?=\s*\([A-E]\)|$)', text, flags=re.DOTALL)  # DOTALL: keep options whose text spans several lines
+    choices = [v.strip() for _, v in raw]
+    labels = [k for k, _ in raw]
+    gold = labels.index(line["label"]) if line["label"] in labels else 0
+    return question, choices, labels, gold
+
+
+def mascqa_cf_prompt(line, task_name: str = None):
+    question, choices, _, gold = _parse(line)
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {question}\nAnswer:",
+        choices=[" " + c for c in choices],
+        gold_index=gold,
+    )
+
+
+def mascqa_mcf_prompt(line, task_name: str = None):
+    question, choices, labels, gold = _parse(line)
+    options = "\n".join(f" {l}. {c}" for l, c in zip(labels, choices))
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {question}\n{options}\nAnswer:",
+        choices=[f" {l}" for l in labels],
+        gold_index=gold,
+    )
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"mascqa:{suffix}",
+        prompt_function=fn,
+        hf_repo="heegyu/mascqa",
+        hf_subset="default",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, fn, metrics, gen in [
+        ("cf",     mascqa_cf_prompt,  _CF_METRICS,             -1),
+        ("mcf",    mascqa_mcf_prompt, _MCF_METRICS,            -1),
+        ("mcf_em", mascqa_mcf_prompt, [Metrics.exact_match],    1),
+    ]
+]
diff --git a/src/lighteval/tasks/tasks/math_500.py b/src/lighteval/tasks/tasks/math_500.py
index 1ca2e2aa6..5299c8817 100644
--- a/src/lighteval/tasks/tasks/math_500.py
+++ b/src/lighteval/tasks/tasks/math_500.py
@@ -64,8 +64,8 @@ def record_to_sample(record):
     hf_subset="default",
     hf_avail_splits=["test"],
     evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
+    few_shots_split="test",
+    few_shots_select="random_sampling",
     generation_size=1024,
     metrics=[
         # Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}),
diff --git a/src/lighteval/tasks/tasks/med.py b/src/lighteval/tasks/tasks/med.py
index fa4f2f19f..b94568948 100644
--- a/src/lighteval/tasks/tasks/med.py
+++ b/src/lighteval/tasks/tasks/med.py
@@ -82,6 +82,16 @@ def med_qa_prompt(line, task_name: str = None):
     )
 
 
+def med_qa_cf_prompt(line, task_name: str = None):
+    """CF variant: completion-style prompt with full answer texts as choices."""
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\nAnswer:",
+        choices=[" " + opt["value"] for opt in line["options"]],
+        gold_index=list(ascii_uppercase).index(line["answer_idx"]),
+    )
+
+
 def med_qa_mcf_em_prompt(line, task_name: str = None):
     """Greedy variant: generate 1 token, compare with gold letter."""
     query = f"Give a letter answer among A, B, C or D.\nQuestion: {line['question']}\n"
@@ -187,10 +197,89 @@ def med_mcqa_cf_prompt(line, task_name: str = None):
     version=0,
 )
 
+# ── med_qa: titaneval_local (cached parquet) ──
+
+def _med_qa_titaneval_cf_prompt(line, task_name: str = None):
+    """CF variant: completion-style, logprob on full answer text."""
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\nAnswer:",
+        choices=[" " + c for c in line["choices"]],
+        gold_index=line["answer_index"],
+    )
+
+
+def _med_qa_titaneval_mcf_prompt(line, task_name: str = None):
+    """MCF variant: options labeled A, B, C, ... (one per entry in `choices`), score label tokens via logprobs."""
+    labels = list(ascii_uppercase)[: len(line["choices"])]
+    options = "\n".join(f" {l}. {c}" for l, c in zip(labels, line["choices"]))
+    return Doc(
+        task_name=task_name,
+        query=f"Give a letter answer among A, B, C or D.\nQuestion: {line['question']}\n{options}\nAnswer:",
+        choices=[" " + l for l in labels],
+        gold_index=line["answer_index"],
+        instruction="Give a letter answer among A, B, C or D.\n",  # NOTE(review): wording fixes the range at A-D although labels follow len(choices) — confirm every row has four options
+    )
+
+
+med_qa_titaneval_cf = LightevalTaskConfig(
+    name="med_qa:cf",
+    prompt_function=_med_qa_titaneval_cf_prompt,
+    hf_repo="titaneval_local",
+    hf_subset="med_qa",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split="test",
+    few_shots_select="random_sampling",
+    generation_size=-1,
+    metrics=_CF_METRICS,
+    stop_sequence=["\n"],
+    version=1,
+)
+
+med_qa_titaneval_mcf = LightevalTaskConfig(
+    name="med_qa:mcf",
+    prompt_function=_med_qa_titaneval_mcf_prompt,
+    hf_repo="titaneval_local",
+    hf_subset="med_qa",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split="test",
+    few_shots_select="random_sampling",
+    generation_size=-1,
+    metrics=_MCF_METRICS,
+    stop_sequence=["\n"],
+    version=1,
+)
+
+med_qa_titaneval_mcf_em = LightevalTaskConfig(
+    name="med_qa:mcf_em",
+    prompt_function=_med_qa_titaneval_mcf_prompt,
+    hf_repo="titaneval_local",
+    hf_subset="med_qa",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split="test",
+    few_shots_select="random_sampling",
+    generation_size=1,
+    metrics=[Metrics.exact_match],
+    stop_sequence=["\n"],
+    version=1,
+)
+
+# ── original bigbio/med_qa configs (broken: script-based dataset) ──
+# med_qa_cf = LightevalTaskConfig(
+#     name="med_qa:cf",
+#     prompt_function=med_qa_cf_prompt,
+#     hf_repo="bigbio/med_qa",
+#     ...
+# )
+
 TASKS_TABLE = [
     med_mcqa_mcf_em,
     med_mcqa_mcf,
     med_mcqa_cf,
-    med_qa_mcf,
-    med_qa_mcf_em,
+    med_qa_titaneval_cf,
+    med_qa_titaneval_mcf,
+    med_qa_titaneval_mcf_em,
 ]
diff --git a/src/lighteval/tasks/tasks/mmlu_pro.py b/src/lighteval/tasks/tasks/mmlu_pro.py
index 2ede01f50..66378cbfb 100644
--- a/src/lighteval/tasks/tasks/mmlu_pro.py
+++ b/src/lighteval/tasks/tasks/mmlu_pro.py
@@ -88,8 +88,19 @@ def record_to_sample(record):
     return Sample(input=record["question"], target=record["answer"], choices=record["options"])
 
 
-mmlu_pro = LightevalTaskConfig(
-    name="mmlu_pro",
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+mmlu_pro_cot = LightevalTaskConfig(
+    name="mmlu_pro:cot",
     prompt_function=mmlu_pro_prompt_function,
     sample_fields=record_to_sample,
     solver=[multiple_choice(cache=True)],
@@ -102,8 +113,8 @@ def record_to_sample(record):
     metrics=[Metrics.gpqa_instruct_metric],
 )
 
-mmlu_pro_mcf = LightevalTaskConfig(
-    name="mmlu_pro_mcf",
+mmlu_pro_mcf_em = LightevalTaskConfig(
+    name="mmlu_pro:mcf_em",
     prompt_function=mmlu_pro_mcf_prompt_function,
     sample_fields=record_to_sample,
     solver=[multiple_choice(cache=True)],
@@ -113,16 +124,28 @@ def record_to_sample(record):
     hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
     evaluation_splits=("test",),
     few_shots_split="validation",
-    metrics=[
-        Metrics.exact_match,
-        # Metrics.gpqa_instruct_metric
-        ],
     generation_size=1,
-    stop_sequence=["\n", "\n\n"],
+    stop_sequence=["\n"],
+    metrics=[Metrics.exact_match],
+)
+
+mmlu_pro_mcf = LightevalTaskConfig(
+    name="mmlu_pro:mcf",
+    prompt_function=mmlu_pro_mcf_prompt_function,
+    sample_fields=record_to_sample,
+    solver=[multiple_choice(cache=True)],
+    scorer=choice(),
+    hf_repo="TIGER-Lab/MMLU-Pro",
+    hf_subset="default",
+    hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
+    evaluation_splits=("test",),
+    few_shots_split="validation",
+    generation_size=-1,
+    metrics=_MCF_METRICS,
 )
 
 mmlu_pro_cf = LightevalTaskConfig(
-    name="mmlu_pro_cf",
+    name="mmlu_pro:cf",
     prompt_function=mmlu_pro_cf_prompt_function,
     sample_fields=record_to_sample,
     solver=[multiple_choice(cache=True)],
@@ -132,11 +155,8 @@ def record_to_sample(record):
     hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
     evaluation_splits=("test",),
     few_shots_split="validation",
-    metrics=[
-        LogLikelihoodAccMetric(),
-        LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
-        Metrics.target_bits_per_byte,
-        ],
+    generation_size=-1,
+    metrics=_CF_METRICS,
 )
 
-TASKS_TABLE = [mmlu_pro, mmlu_pro_mcf, mmlu_pro_cf]
+TASKS_TABLE = [mmlu_pro_cot, mmlu_pro_cf, mmlu_pro_mcf, mmlu_pro_mcf_em]
diff --git a/src/lighteval/tasks/tasks/natural_questions.py b/src/lighteval/tasks/tasks/natural_questions.py
index 5a39fa8ff..b532c51c8 100644
--- a/src/lighteval/tasks/tasks/natural_questions.py
+++ b/src/lighteval/tasks/tasks/natural_questions.py
@@ -81,7 +81,7 @@ def nq_bpb_prompt(line, task_name: str = None):
         few_shots_select="random_sampling_from_train",
         generation_size=50,
         stop_sequence=["Question:", "Q:", "\n\n"],
-        metrics=[Metrics.f1_score, Metrics.exact_match],
+        metrics=[Metrics.qa_f1, Metrics.qa_em],
         version=1,
     ),
 ]
diff --git a/src/lighteval/tasks/tasks/popqa.py b/src/lighteval/tasks/tasks/popqa.py
index ce44c8de3..11f74f582 100644
--- a/src/lighteval/tasks/tasks/popqa.py
+++ b/src/lighteval/tasks/tasks/popqa.py
@@ -67,7 +67,7 @@ def popqa_bpb_prompt(line, task_name: str = None):
         few_shots_split="test",
         few_shots_select="random_sampling",
         generation_size=8,
-        metrics=[Metrics.f1_score, Metrics.exact_match],
+        metrics=[Metrics.qa_f1, Metrics.qa_em],
         stop_sequence=["\n"],
         version=1,
     ),
diff --git a/src/lighteval/tasks/tasks/preflight.py b/src/lighteval/tasks/tasks/preflight.py
new file mode 100644
index 000000000..b4ea34fba
--- /dev/null
+++ b/src/lighteval/tasks/tasks/preflight.py
@@ -0,0 +1,78 @@
+"""
+name:
+Preflight Aviation Safety QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/preflight.parquet)
+
+abstract:
+Aviation safety multiple-choice questions derived from international airport ground
+operations manuals and FAA/ICAO regulations. 300 questions, 0-shot.
+
+languages:
+english
+
+tags:
+aviation, multiple-choice, qa, safety
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def preflight_cf_prompt(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\nAnswer:",
+        choices=[" " + c for c in line["choices"]],
+        gold_index=line["answer_index"],
+    )
+
+
+def preflight_mcf_prompt(line, task_name: str = None):  # MCF variant: lettered options in the query, label tokens as the scored choices
+    choices = line["choices"]
+    labels = list("ABCDEFGHIJ"[: len(choices)])
+    options = "\n".join(f" {l}. {c}" for l, c in zip(labels, choices))  # leading space so the listed label matches the scored " {l}" choice, as in the other MCF prompts
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\n{options}\nAnswer:",
+        choices=[f" {l}" for l in labels],
+        gold_index=line["answer_index"],
+    )
+
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"preflight:{suffix}",
+        prompt_function=fn,
+        hf_repo="titaneval_local",
+        hf_subset="preflight",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, fn, metrics, gen in [
+        ("cf",     preflight_cf_prompt,  _CF_METRICS,          -1),
+        ("mcf",    preflight_mcf_prompt, _MCF_METRICS,         -1),
+        ("mcf_em", preflight_mcf_prompt, [Metrics.exact_match],  1),
+    ]
+]
diff --git a/src/lighteval/tasks/tasks/pubmedqa.py b/src/lighteval/tasks/tasks/pubmedqa.py
index 326789bb6..17654760a 100644
--- a/src/lighteval/tasks/tasks/pubmedqa.py
+++ b/src/lighteval/tasks/tasks/pubmedqa.py
@@ -3,10 +3,11 @@
 Pubmedqa
 
 dataset:
-pubmed_qa
+qiaojin/PubMedQA
 
 abstract:
 PubMedQA is a dataset for biomedical research question answering.
+Each question is answerable with yes, no, or maybe based on a PubMed abstract.
 
 languages:
 english
@@ -18,37 +19,91 @@
 https://pubmedqa.github.io/
 """
 
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
 from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 
+_CHOICES = ["yes", "no", "maybe"]
+_GOLD = {c: i for i, c in enumerate(_CHOICES)}
 
-def pubmed_qa_prompt(line, task_name: str = None):
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def pubmedqa_cf_prompt(line, task_name: str = None):
+    """CF variant: score full answer text (yes/no/maybe) via logprobs."""
+    ctx = " ".join(line["CONTEXTS"]) if isinstance(line["CONTEXTS"], list) else line["CONTEXTS"]
     return Doc(
         task_name=task_name,
-        query=f"{line['QUESTION']}\n{line['CONTEXTS']}\nAnswer: ",
-        choices=[line["final_decision"]],
-        gold_index=0,
+        query=f"Abstract: {ctx}\nQuestion: {line['QUESTION']}\nAnswer:",
+        choices=[f" {c}" for c in _CHOICES],
+        gold_index=_GOLD[line["final_decision"].lower()],
     )
 
 
-pubmedqa = LightevalTaskConfig(
-    name="pubmedqa",
-    prompt_function=pubmed_qa_prompt,
-    hf_repo="pubmed_qa",
-    hf_subset="pqa_labeled",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=1,
-    metrics=[
-        Metrics.exact_match,
-    ],
-    stop_sequence=["\n"],
-    version=0,
-)
+def pubmedqa_mcf_prompt(line, task_name: str = None):
+    """MCF variant: labeled A/B/C options, score label tokens via logprobs."""
+    ctx = " ".join(line["CONTEXTS"]) if isinstance(line["CONTEXTS"], list) else line["CONTEXTS"]  # a list of passages is joined with spaces; any other value is used as-is
+    options = "\n".join(f" {l}. {c}" for l, c in zip("ABC", _CHOICES))
+    return Doc(
+        task_name=task_name,
+        query=f"Abstract: {ctx}\nQuestion: {line['QUESTION']}\n{options}\nAnswer:",  # NOTE(review): reads upper-case QUESTION/CONTEXTS columns — confirm qiaojin/PubMedQA (pqa_labeled) exposes them under these names
+        choices=[" A", " B", " C"],
+        gold_index=_GOLD[line["final_decision"].lower()],  # KeyError if the decision is not one of _CHOICES
+    )
+
 
 TASKS_TABLE = [
-    pubmedqa,
+    LightevalTaskConfig(
+        name="pubmedqa:cf",
+        prompt_function=pubmedqa_cf_prompt,
+        hf_repo="qiaojin/PubMedQA",
+        hf_subset="pqa_labeled",
+        hf_avail_splits=["train"],
+        evaluation_splits=["train"],
+        few_shots_split="train",
+        few_shots_select="random_sampling",
+        generation_size=-1,
+        metrics=_CF_METRICS,
+        stop_sequence=["\n"],
+        version=1,
+    ),
+    LightevalTaskConfig(
+        name="pubmedqa:mcf",
+        prompt_function=pubmedqa_mcf_prompt,
+        hf_repo="qiaojin/PubMedQA",
+        hf_subset="pqa_labeled",
+        hf_avail_splits=["train"],
+        evaluation_splits=["train"],
+        few_shots_split="train",
+        few_shots_select="random_sampling",
+        generation_size=-1,
+        metrics=_MCF_METRICS,
+        stop_sequence=["\n"],
+        version=1,
+    ),
+    LightevalTaskConfig(
+        name="pubmedqa:mcf_em",
+        prompt_function=pubmedqa_mcf_prompt,
+        hf_repo="qiaojin/PubMedQA",
+        hf_subset="pqa_labeled",
+        hf_avail_splits=["train"],
+        evaluation_splits=["train"],
+        few_shots_split="train",
+        few_shots_select="random_sampling",
+        generation_size=1,
+        metrics=[Metrics.exact_match],
+        stop_sequence=["\n"],
+        version=1,
+    ),
 ]
diff --git a/src/lighteval/tasks/tasks/qasc.py b/src/lighteval/tasks/tasks/qasc.py
new file mode 100644
index 000000000..bb229f9a0
--- /dev/null
+++ b/src/lighteval/tasks/tasks/qasc.py
@@ -0,0 +1,110 @@
+"""
+name:
+Qasc
+
+dataset:
+allenai/qasc
+
+abstract:
+QASC is a question-and-answer dataset that focuses on sentence composition.
+It consists of 8-way multiple choice questions requiring combining two facts
+from a large corpus to derive an answer.
+
+languages:
+english
+
+tags:
+multiple-choice, qa, reasoning, science
+
+paper:
+https://arxiv.org/abs/1910.11473
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+_LABELS = list("ABCDEFGH")
+
+
+def qasc_cf_prompt(line, task_name: str = None):
+    """CF variant: score full answer texts via logprobs."""
+    choices = line["choices"]["text"]
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\nAnswer:",
+        choices=[" " + c for c in choices],
+        gold_index=_LABELS.index(line["answerKey"]),
+    )
+
+
+def qasc_mcf_prompt(line, task_name: str = None):
+    """MCF variant: labeled A-H options, score label tokens via logprobs."""
+    choices = line["choices"]["text"]
+    labels = _LABELS[: len(choices)]
+    options = "\n".join(f" {l}. {c}" for l, c in zip(labels, choices))
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\n{options}\nAnswer:",
+        choices=[f" {l}" for l in labels],
+        gold_index=_LABELS.index(line["answerKey"]),
+    )
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name="qasc:cf",
+        prompt_function=qasc_cf_prompt,
+        hf_repo="allenai/qasc",
+        hf_subset="default",
+        hf_avail_splits=["train", "validation"],
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        few_shots_select="random_sampling_from_train",
+        generation_size=-1,
+        metrics=_CF_METRICS,
+        stop_sequence=["\n"],
+        version=0,
+    ),
+    LightevalTaskConfig(
+        name="qasc:mcf",
+        prompt_function=qasc_mcf_prompt,
+        hf_repo="allenai/qasc",
+        hf_subset="default",
+        hf_avail_splits=["train", "validation"],
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        few_shots_select="random_sampling_from_train",
+        generation_size=-1,
+        metrics=_MCF_METRICS,
+        stop_sequence=["\n"],
+        version=0,
+    ),
+    LightevalTaskConfig(
+        name="qasc:mcf_em",
+        prompt_function=qasc_mcf_prompt,
+        hf_repo="allenai/qasc",
+        hf_subset="default",
+        hf_avail_splits=["train", "validation"],
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        few_shots_select="random_sampling_from_train",
+        generation_size=1,
+        metrics=[Metrics.exact_match],
+        stop_sequence=["\n"],
+        version=0,
+    ),
+]
diff --git a/src/lighteval/tasks/tasks/simpleqa.py b/src/lighteval/tasks/tasks/simpleqa.py
index 4493fc322..6e7c1628f 100644
--- a/src/lighteval/tasks/tasks/simpleqa.py
+++ b/src/lighteval/tasks/tasks/simpleqa.py
@@ -31,6 +31,14 @@
 from lighteval.tasks.requests import Doc
 
 
+def record_to_sample(record):
+    # Build a Sample whose input is the record's "problem" and whose target is its "answer".
+    question, gold = record["problem"], record["answer"]
+    return Sample(input=question, target=gold)
+
+
+# ---- Original graded variant (kept for compatibility) ----
+
 def simpleqa_prompt(line, task_name: str = None):
     query = f"Question: {line['question']}\n"
     query += "".join(
@@ -45,12 +53,6 @@ def simpleqa_prompt(line, task_name: str = None):
     )
 
 
-def record_to_sample(record):
-    query = record["problem"]
-    target = record["answer"]
-    return Sample(input=query, target=target)
-
-
 simpleqa = LightevalTaskConfig(
     name="simpleqa",
     prompt_function=simpleqa_prompt,
@@ -69,6 +71,67 @@ def record_to_sample(record):
     scorer=model_graded_fact(),
 )
 
+# ---- GenQA variants (our convention: gen{em,f1} + decoupled bpb) ----
+
+def simpleqa_gen_prompt(line, task_name: str = None):
+    """GenQA variant: short-answer prompt; the gold answer is space-prefixed only in few-shot docs."""
+    lead = " " if line.get("__few_shots", False) else ""
+    gold = f"{lead}{line['answer']}"
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['problem']}\nAnswer:",
+        choices=[gold],
+        gold_index=0,
+    )
+
+
+def simpleqa_bpb_prompt(line, task_name: str = None):
+    """BPB variant: the gold answer is the single scored continuation; returns None if it is empty."""
+    gold = line["answer"]
+    if not gold:
+        return None
+    # Only answers that do not already start with whitespace get a separating space.
+    target = gold if gold[0].isspace() else " " + gold
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['problem']}\nAnswer:",
+        choices=[target],
+        gold_index=0,
+    )
+
+
+simpleqa_gen = LightevalTaskConfig(
+    name="simpleqa:gen",  # generative variant, scored with qa_f1 / qa_em
+    prompt_function=simpleqa_gen_prompt,
+    hf_repo="lighteval/SimpleQA",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split="few_shot",  # NOTE(review): not listed in hf_avail_splits - confirm this split is loaded
+    few_shots_select="random_sampling",
+    generation_size=50,  # generation budget for the short answer
+    metrics=[Metrics.qa_f1, Metrics.qa_em],
+    stop_sequence=["\n"],
+    version=1,
+)
+
+simpleqa_bpb = LightevalTaskConfig(
+    name="simpleqa:bpb",  # bits-per-byte variant over the gold answer continuation
+    prompt_function=simpleqa_bpb_prompt,  # may return None for rows with an empty answer
+    hf_repo="lighteval/SimpleQA",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split="few_shot",  # NOTE(review): not listed in hf_avail_splits - confirm this split is loaded
+    few_shots_select="random_sampling",
+    generation_size=-1,  # -1: logprob mode, nothing is generated
+    metrics=[Metrics.target_bits_per_byte],
+    stop_sequence=["\n"],
+    version=1,
+)
+
 TASKS_TABLE = [
     simpleqa,
+    simpleqa_gen,  # "simpleqa:gen" - generated answer scored with qa_f1 / qa_em
+    simpleqa_bpb,  # "simpleqa:bpb" - target_bits_per_byte on the gold answer
 ]
diff --git a/src/lighteval/tasks/tasks/squad.py b/src/lighteval/tasks/tasks/squad.py
index 6baf16d6b..ff1ba8acf 100644
--- a/src/lighteval/tasks/tasks/squad.py
+++ b/src/lighteval/tasks/tasks/squad.py
@@ -82,7 +82,7 @@ def squad_bpb_prompt(line, task_name: str = None):
         few_shots_select="random_sampling_from_train",
         generation_size=50,
         stop_sequence=["Title:", "\n\n"],
-        metrics=[Metrics.f1_score, Metrics.exact_match],
+        metrics=[Metrics.qa_f1, Metrics.qa_em],
         version=1,
     ),
 ]
diff --git a/src/lighteval/tasks/tasks/squad_v2.py b/src/lighteval/tasks/tasks/squad_v2.py
index 6891fb709..2c912843c 100644
--- a/src/lighteval/tasks/tasks/squad_v2.py
+++ b/src/lighteval/tasks/tasks/squad_v2.py
@@ -79,7 +79,7 @@ def squad_v2_bpb_prompt(line, task_name: str = None):
     few_shots_split="train",
     few_shots_select="random_sampling_from_train",
     generation_size=200,
-    metrics=[Metrics.f1_score, Metrics.exact_match],
+    metrics=[Metrics.qa_f1, Metrics.qa_em],
     stop_sequence=["\n", "Question:", "question:"],
     version=1,
 )
diff --git a/src/lighteval/tasks/tasks/swag.py b/src/lighteval/tasks/tasks/swag.py
index 67febe8ec..d0f7a4399 100644
--- a/src/lighteval/tasks/tasks/swag.py
+++ b/src/lighteval/tasks/tasks/swag.py
@@ -25,36 +25,91 @@
 https://arxiv.org/abs/1808.05326
 """
 
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
 from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 
+_CF_METRICS = [  # metrics for the "swag:cf" task
+    LogLikelihoodAccMetric(),  # log-likelihood accuracy, no normalization argument
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm applied
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [  # metrics for the "swag:mcf" task (no bits-per-byte entry)
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+_ENDINGS = ["ending0", "ending1", "ending2", "ending3"]  # row keys of the candidate endings, in choice order
+
 
-def swag_prompt(line, task_name: str = None):
-    choices = [line["ending0"], line["ending1"], line["ending2"], line["ending3"]]
+def swag_cf_prompt(line, task_name: str = None):
+    """CF variant: the endings, each space-prefixed, are scored as continuations of `startphrase`."""
+    choices = [line[e] for e in _ENDINGS]
     return Doc(
         task_name=task_name,
         query=line["startphrase"],
-        choices=choices,
+        choices=[" " + c for c in choices],  # leading space separates the ending from the start phrase
         gold_index=int(line["label"]),
     )
 
 
-swag = LightevalTaskConfig(
-    name="swag",
-    prompt_function=swag_prompt,
-    hf_repo="allenai/swag",
-    hf_subset="regular",
-    hf_avail_splits=["train", "validation"],
-    evaluation_splits=["validation"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[Metrics.loglikelihood_acc],
-    stop_sequence=["\n"],
-    version=0,
-)
+def swag_mcf_prompt(line, task_name: str = None):
+    """MCF variant: list the endings as lettered options A-D and score the letter tokens."""
+    letters = "ABCD"
+    listing = "\n".join(f" {letter}. {line[key]}" for letter, key in zip(letters, _ENDINGS))
+    return Doc(
+        task_name=task_name,
+        query=f"{line['startphrase']}\n{listing}\nAnswer:",
+        choices=[f" {letter}" for letter in letters],
+        # `label` is used directly as the index of the gold option.
+        gold_index=int(line["label"]),
+    )
+
 
 TASKS_TABLE = [
-    swag,
+    LightevalTaskConfig(
+        name="swag:cf",  # :cf suffix: logprob scoring over the full ending texts
+        prompt_function=swag_cf_prompt,
+        hf_repo="allenai/swag",
+        hf_subset="regular",
+        hf_avail_splits=["train", "validation"],
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        few_shots_select="random_sampling_from_train",
+        generation_size=-1,  # -1: logprob mode, the listed choices are scored
+        metrics=_CF_METRICS,
+        stop_sequence=["\n"],
+        version=1,
+    ),
+    LightevalTaskConfig(
+        name="swag:mcf",  # :mcf suffix: logprob scoring over the " A".." D" label tokens
+        prompt_function=swag_mcf_prompt,
+        hf_repo="allenai/swag",
+        hf_subset="regular",
+        hf_avail_splits=["train", "validation"],
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        few_shots_select="random_sampling_from_train",
+        generation_size=-1,  # -1: logprob mode, the listed choices are scored
+        metrics=_MCF_METRICS,
+        stop_sequence=["\n"],
+        version=1,
+    ),
+    LightevalTaskConfig(
+        name="swag:mcf_em",  # :mcf_em suffix: same MCF prompt, scored with exact_match
+        prompt_function=swag_mcf_prompt,
+        hf_repo="allenai/swag",
+        hf_subset="regular",
+        hf_avail_splits=["train", "validation"],
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        few_shots_select="random_sampling_from_train",
+        generation_size=1,  # presumably a single generated token (the option letter) - TODO confirm unit
+        metrics=[Metrics.exact_match],
+        stop_sequence=["\n"],
+        version=1,
+    ),
 ]
diff --git a/src/lighteval/tasks/tasks/teleqna.py b/src/lighteval/tasks/tasks/teleqna.py
new file mode 100644
index 000000000..4cf686943
--- /dev/null
+++ b/src/lighteval/tasks/tasks/teleqna.py
@@ -0,0 +1,90 @@
+"""
+name:
+TeleQnA
+
+dataset:
+netop/TeleQnA (gated — must be pre-cached)
+
+abstract:
+TeleQnA is a multiple-choice benchmark covering telecommunications standards
+from 3GPP, IEEE, and other telecom bodies.
+
+languages:
+english
+
+tags:
+multiple-choice, qa, telecom
+
+paper:
+https://arxiv.org/abs/2310.15051
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [  # metrics for the "teleqna:cf" task
+    LogLikelihoodAccMetric(),  # log-likelihood accuracy, no normalization argument
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm applied
+    Metrics.target_bits_per_byte,
+]
+
+_MCF_METRICS = [  # metrics for the "teleqna:mcf" task (no bits-per-byte entry)
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def _choices(line):
+    raw = line["choices"]
+    if not isinstance(raw, str):
+        return raw  # not a string: handed back unchanged
+    import ast  # local import, only needed when the choices arrive as a Python-literal string
+    return ast.literal_eval(raw)
+
+
+def teleqna_cf_prompt(line, task_name: str = None):
+    choices = _choices(line)  # choices field, parsed from its string form when necessary
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\nAnswer:",
+        choices=[" " + c for c in choices],  # CF variant: each answer text, space-prefixed, is scored
+        gold_index=int(line["answer"]),  # "answer" is used as the index of the gold choice
+    )
+
+
+def teleqna_mcf_prompt(line, task_name: str = None):
+    texts = _choices(line)
+    letters = "ABCDE"[: len(texts)]  # one label per choice, at most five
+    listing = "\n".join(f" {letter}. {text}" for letter, text in zip(letters, texts))
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\n{listing}\nAnswer:",
+        choices=[f" {letter}" for letter in letters],
+        gold_index=int(line["answer"]),
+    )
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"teleqna:{suffix}",
+        prompt_function=fn,
+        hf_repo="netop/TeleQnA",
+        hf_subset="default",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # few-shot examples come from the same split that is evaluated
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, fn, metrics, gen in [  # one config per (suffix, prompt fn, metrics, generation_size) row
+        ("cf",     teleqna_cf_prompt,  _CF_METRICS,             -1),  # logprob over answer texts
+        ("mcf",    teleqna_mcf_prompt, _MCF_METRICS,            -1),  # logprob over label tokens
+        ("mcf_em", teleqna_mcf_prompt, [Metrics.exact_match],    1),  # generated label, exact match
+    ]
+]
diff --git a/src/lighteval/tasks/tasks/triviaqa.py b/src/lighteval/tasks/tasks/triviaqa.py
index 94891a09c..4979a5890 100644
--- a/src/lighteval/tasks/tasks/triviaqa.py
+++ b/src/lighteval/tasks/tasks/triviaqa.py
@@ -76,7 +76,7 @@ def triviaqa_bpb_prompt(line, task_name: str = None):
         few_shots_split="train",
         few_shots_select="random_sampling_from_train",
         generation_size=20,
-        metrics=[Metrics.f1_score, Metrics.exact_match],
+        metrics=[Metrics.qa_f1, Metrics.qa_em],
         stop_sequence=["\n", ".", ","],
         version=1,
     ),
diff --git a/src/lighteval/tasks/tasks/wikifact.py b/src/lighteval/tasks/tasks/wikifact.py
index c22d16852..b5328509d 100644
--- a/src/lighteval/tasks/tasks/wikifact.py
+++ b/src/lighteval/tasks/tasks/wikifact.py
@@ -146,7 +146,7 @@ def _gen_config(subset: str) -> LightevalTaskConfig:
         # num_fewshot (not num_fewshot+1), so 5-shot doesn't overflow those tiny pools.
         few_shots_select="random_sampling_from_train",
         generation_size=8,
-        metrics=[Metrics.f1_score, Metrics.exact_match],
+        metrics=[Metrics.qa_f1, Metrics.qa_em],
         stop_sequence=["\n"],
         version=1,
     )
diff --git a/src/lighteval/tasks/tasks/winogrande.py b/src/lighteval/tasks/tasks/winogrande.py
index 72c2099c2..9373ad1ca 100644
--- a/src/lighteval/tasks/tasks/winogrande.py
+++ b/src/lighteval/tasks/tasks/winogrande.py
@@ -164,6 +164,8 @@ def winogrande_rc_prompt(line, task_name: str = None):
     generation_size=-1,
     metrics=[
         LogLikelihoodAccMetric(),
+        LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+        Metrics.target_bits_per_byte,
     ],
     stop_sequence=["\n"],
     version=0,
diff --git a/src/lighteval/tasks/tasks/xfinbench.py b/src/lighteval/tasks/tasks/xfinbench.py
new file mode 100644
index 000000000..0ccaa5e5b
--- /dev/null
+++ b/src/lighteval/tasks/tasks/xfinbench.py
@@ -0,0 +1,86 @@
+"""
+name:
+XFinBench Cross-lingual Finance QA
+
+dataset:
+titaneval_local (local parquet — data/titaneval/xfinbench.parquet)
+
+abstract:
+Cross-lingual finance multiple-choice benchmark (English subset). 588 valid
+questions from TitanEval-MCQ, 0-shot.
+
+languages:
+english
+
+tags:
+finance, multiple-choice, qa
+"""
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+_CF_METRICS = [  # metrics for the "xfinbench:cf" task
+    LogLikelihoodAccMetric(),  # log-likelihood accuracy, no normalization argument
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),  # same metric with LogProbCharNorm applied
+    Metrics.target_bits_per_byte,
+]
+_MCF_METRICS = [  # metrics for the "xfinbench:mcf" task (no bits-per-byte entry)
+    LogLikelihoodAccMetric(),
+    LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+]
+
+
+def xfinbench_cf_prompt(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\nAnswer:",
+        choices=[" " + c for c in line["choices"]],  # CF variant: each answer text, space-prefixed, is scored
+        gold_index=line["answer_index"],  # passed through as-is (no int() cast)
+    )
+
+
+def xfinbench_mcf_prompt(line, task_name: str = None):
+    """MCF variant: options are listed as " A. text" (leading space) and the " A"-style labels are scored."""
+    labels = "ABCDEFGHIJ"[: len(line["choices"])]  # one label per choice, at most ten
+    options = "\n".join(f" {label}. {text}" for label, text in zip(labels, line["choices"]))
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\n{options}\nAnswer:",
+        choices=[f" {label}" for label in labels],
+        gold_index=line["answer_index"],
+    )
+
+
+def _valid_row(row) -> bool:
+    """Keep a row only if some choice is non-blank and answer_index is a valid position in choices."""
+    options, gold = row.get("choices") or [], row.get("answer_index")
+    has_text = any(text.strip() for text in options)  # also False when there are no choices at all
+    return has_text and gold is not None and 0 <= gold < len(options)
+
+
+
+TASKS_TABLE = [
+    LightevalTaskConfig(
+        name=f"xfinbench:{suffix}",
+        prompt_function=fn,
+        hf_repo="titaneval_local",
+        hf_subset="xfinbench",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split="test",  # few-shot examples come from the same split that is evaluated
+        few_shots_select="random_sampling",
+        generation_size=gen,
+        metrics=metrics,
+        hf_filter=_valid_row,  # rows for which _valid_row returns False are filtered out
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for suffix, fn, metrics, gen in [  # one config per (suffix, prompt fn, metrics, generation_size) row
+        ("cf",     xfinbench_cf_prompt,  _CF_METRICS,          -1),  # logprob over answer texts
+        ("mcf",    xfinbench_mcf_prompt, _MCF_METRICS,         -1),  # logprob over label tokens
+        ("mcf_em", xfinbench_mcf_prompt, [Metrics.exact_match],  1),  # generated label, exact match
+    ]
+]