Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions rdagent/scenarios/rl/autorl_bench/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class BenchmarkConfig:
"max_steps": 50,
"env_num": 134, # 完整评测集(valid_unseen),之前调试时设为 1
},
expose_files=["eval.py", "react_prompts.json"],
expose_files=["eval.py"],
),
"webshop": BenchmarkConfig(
id="webshop",
Expand All @@ -97,7 +97,7 @@ class BenchmarkConfig:
data_module="rdagent.scenarios.rl.autorl_bench.benchmarks.deepsearchqa.data",
description="DeepSearchQA - Google DeepMind 多步信息检索基准(900题,17领域)",
eval_config={
"num_samples": 100, # 快速评测用 100,完整评测用 900
"num_samples": 200, # fixed held-out evaluation split after 100/200 train/eval partition
"max_steps": 6, # ReAct 最大搜索轮次
# api_key": "...", # 可选,不填则用 DuckDuckGo
},
Expand Down
52 changes: 46 additions & 6 deletions rdagent/scenarios/rl/autorl_bench/benchmarks/deepsearchqa/data.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,54 @@
# benchmarks/deepsearchqa/data.py
import json
import shutil
from pathlib import Path

from datasets import load_dataset
from datasets import Dataset, load_dataset

# HuggingFace dataset id for the official DeepSearchQA release.
DATASET_NAME = "google/deepsearchqa"
# The release ships a single split named "eval"; we re-partition it locally.
SOURCE_SPLIT = "eval"
# Fixed shuffle seed so the train/eval partition is reproducible across runs.
SPLIT_SEED = 42
# Held-in samples exposed to agents for training.
TRAIN_SIZE = 100
# Held-out samples reserved for evaluation (see split_dataset).
DEFAULT_EVAL_SIZE = 200
# Size of the official release; the remainder after train+eval is left unused.
TOTAL_SIZE = 900
UNUSED_SIZE = TOTAL_SIZE - TRAIN_SIZE - DEFAULT_EVAL_SIZE


def load_source_dataset() -> Dataset:
    """Fetch the official DeepSearchQA release (its single ``eval`` split)."""
    source = load_dataset(DATASET_NAME, split=SOURCE_SPLIT)
    return source


def split_dataset(dataset: Dataset) -> tuple[Dataset, Dataset]:
    """Deterministically partition the 900-item source into train/eval subsets.

    The dataset is shuffled with the fixed ``SPLIT_SEED`` and the first
    ``TRAIN_SIZE`` items become the train split, the next
    ``DEFAULT_EVAL_SIZE`` the eval split.  Both boundaries are clamped to
    the dataset length, so a short dataset yields a short (possibly
    empty) eval split rather than raising.

    Args:
        dataset: The full source dataset to partition.

    Returns:
        A ``(train, eval)`` pair of disjoint ``Dataset`` views.
    """
    ordered = dataset.shuffle(seed=SPLIT_SEED)
    total = len(ordered)
    train_end = min(TRAIN_SIZE, total)
    eval_end = min(TRAIN_SIZE + DEFAULT_EVAL_SIZE, total)
    train_part = ordered.select(range(train_end))
    eval_part = ordered.select(range(train_end, eval_end))
    return train_part, eval_part


def download_train_data(target_dir: Path):
    """Download and persist the held-in 100-sample training split for agents.

    Loads the official 900-item DeepSearchQA release, applies the
    deterministic seed-42 train/eval partition, and saves ONLY the
    training portion to ``target_dir / "deepsearchqa"`` so agents never
    see the held-out evaluation samples.  A ``split_meta.json`` file is
    written next to it recording how the partition was produced.

    Args:
        target_dir: Directory that receives the saved dataset and the
            split metadata file.  Created (with parents) if missing.
    """
    target_dir.mkdir(parents=True, exist_ok=True)

    dataset = load_source_dataset()
    train, eval_set = split_dataset(dataset)

    # Remove any stale copy first so re-runs cannot mix old and new shards.
    output_dir = target_dir / "deepsearchqa"
    if output_dir.exists():
        shutil.rmtree(output_dir)
    train.save_to_disk(str(output_dir))

    # Record the exact partition parameters for reproducibility/auditing.
    split_meta = {
        "dataset": DATASET_NAME,
        "source_split": SOURCE_SPLIT,
        "shuffle_seed": SPLIT_SEED,
        "train_size": len(train),
        "eval_size": len(eval_set),
        "unused_size": max(0, len(dataset) - len(train) - len(eval_set)),
        "total_size": len(dataset),
    }
    (target_dir / "split_meta.json").write_text(json.dumps(split_meta, indent=2), encoding="utf-8")
    print(f"DeepSearchQA train split saved to {output_dir} ({len(train)} train / {len(eval_set)} eval)")
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
## 数据集
- 来源: google/deepsearchqa (HuggingFace)
- 规模: 900 题
- 本地协议: 固定随机种子切分为 100 题训练 / 200 题评测(其余样本保留不用)
- 答案类型: Single Answer (35%) / Set Answer (65%)

## Rollout 流程
Expand Down
83 changes: 61 additions & 22 deletions rdagent/scenarios/rl/autorl_bench/benchmarks/deepsearchqa/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,20 @@
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List, Optional, Tuple

import requests
from datasets import load_dataset

from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.rl.autorl_bench.benchmarks.deepsearchqa.data import (
DATASET_NAME,
DEFAULT_EVAL_SIZE,
SOURCE_SPLIT,
TRAIN_SIZE,
load_source_dataset,
split_dataset,
)
from rdagent.scenarios.rl.autorl_bench.core.evaluator import BaseEvaluator

REACT_SYSTEM_PROMPT = """You are a research assistant that answers questions by searching the web.
Expand Down Expand Up @@ -64,15 +72,15 @@ def run_eval(self, model_path: str, workspace_path: str, **kwargs) -> Dict[str,
result["error"] = f"Model not found: {model_path}"
return result

# load datasets
num_samples = self.eval_config.get("num_samples", 100)
dataset = load_dataset(
"google/deepsearchqa",
split="eval",
# 如果已下载到本地可用 data_dir 参数
        # Deterministic held-out evaluation split: 100 train / 200 eval (DEFAULT_EVAL_SIZE).
num_samples = self.eval_config.get("num_samples", DEFAULT_EVAL_SIZE)
dataset = load_source_dataset()
_, eval_dataset = split_dataset(dataset)
samples = list(eval_dataset.select(range(min(num_samples, len(eval_dataset)))))
logger.info(
f"DeepSearchQA held-out eval: {len(samples)} samples "
f"(train={TRAIN_SIZE}, eval={len(eval_dataset)}, source={DATASET_NAME}/{SOURCE_SPLIT})"
)
samples = list(dataset.select(range(min(num_samples, len(dataset)))))
logger.info(f"DeepSearchQA: {len(samples)} samples")

# load model (vLLM)
logger.info(f"Loading model: {model_path}")
Expand All @@ -92,8 +100,7 @@ def run_eval(self, model_path: str, workspace_path: str, **kwargs) -> Dict[str,
search_fn = self._get_search_function()

# evaluation loop
correct = 0
results_detail = []
generated_records = []

for i, sample in enumerate(samples):
question = sample["problem"]
Expand All @@ -111,25 +118,54 @@ def run_eval(self, model_path: str, workspace_path: str, **kwargs) -> Dict[str,
answer_type,
)

# LLM Judge score
score = self._judge_answer(predicted, gold_answer, answer_type)
if score:
correct += 1

results_detail.append(
generated_records.append(
{
"idx": i,
"question": question[:100],
"gold": gold_answer,
"predicted": predicted,
"answer_type": answer_type,
"correct": score,
}
)
logger.info(f" Predicted: {predicted[:80]}")
logger.info(f" Gold: {gold_answer[:80]}")
logger.info(f" Correct: {score}")
running_acc = correct / (i + 1)
logger.info(f" Running accuracy: {correct}/{i+1} = {running_acc:.2%}")

judge_workers = int(self.eval_config.get("judge_workers", 8))
logger.info(f"Running parallel answer judging with {judge_workers} workers")

results_detail = [None] * len(generated_records)
correct = 0
completed = 0

with ThreadPoolExecutor(max_workers=max(1, judge_workers)) as executor:
future_to_record = {
executor.submit(
self._judge_answer,
record["predicted"],
record["gold"],
record["answer_type"],
): record
for record in generated_records
}

for future in as_completed(future_to_record):
record = future_to_record[future]
score = future.result()
if score:
correct += 1
completed += 1

results_detail[record["idx"]] = {
"question": record["question"],
"gold": record["gold"],
"predicted": record["predicted"],
"answer_type": record["answer_type"],
"correct": score,
}
logger.info(
f" Judge {completed}/{len(generated_records)} | "
f"Correct={score} | Running accuracy: {correct}/{completed} = {correct / completed:.2%}"
)

accuracy = correct / len(samples) if samples else 0.0
result["score"] = accuracy * 100
Expand All @@ -155,6 +191,8 @@ def _react_loop(
answer_type: str,
) -> str:
"""ReAct multi-step reasoning loop, return final answer string"""
from vllm import SamplingParams

max_steps = self.eval_config.get("max_steps", 6)

conversation = f"Question: {question}\n" f"Answer type: {answer_type}\n\n" "Thought:"
Expand Down Expand Up @@ -314,7 +352,8 @@ def _judge_answer(
.strip()
.lower()
)
return "correct" in response
normalized = response.splitlines()[0].strip().strip(".!,;: \t\r\n").lower()
return normalized == "correct"
except Exception as e:
logger.warning(f"Judge failed: {e}, falling back to string match")
return self._string_match(predicted, gold, answer_type)
Expand Down
Loading
Loading