wip

xzrderek · xzrderek · commit b6a6ec5d293a · 2025-08-11T22:49:45.000-07:00
diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py
@@ -117,10 +117,15 @@ async def default_agent_rollout_processor(
 ) -> List[EvaluationRow]:
     dataset: Dataset = []
     for row in rows:
-        agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger)
-        await agent.setup()
-        await agent.call_agent()
-        dataset.append(agent.evaluation_row)
-        if agent.mcp_client:
-            await agent.mcp_client.cleanup()
+        try:
+            agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger)
+            await agent.setup()
+            await agent.call_agent()
+            dataset.append(agent.evaluation_row)
+            if agent.mcp_client:
+                await agent.mcp_client.cleanup()
+        except Exception as e:
+            row.rollout_status.status = "error"
+            row.rollout_status.error_message = str(e)
+            dataset.append(row)
     return dataset
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -87,7 +87,12 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
     async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
         async with semaphore:
-            return await process_row(r)
+            try:
+                return await process_row(r)
+            except Exception as e:
+                r.rollout_status.status = "error"
+                r.rollout_status.error_message = str(e)
+                return r
 
     tasks = [_sem_wrapper(row) for row in rows]
     dataset = list(await asyncio.gather(*tasks))
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -401,133 +401,132 @@ def _log_eval_error(
                         logger=active_logger,
                     )
 
+                    max_retry = int(os.getenv("EP_MAX_RETRY", "0"))
+
                     for i in range(num_runs):
-                        # Regenerate outputs each run by deep-copying the pristine dataset
-                        # so model responses are not reused across runs.
                         run_id = generate_id()
-                        fresh_dataset = [r.model_copy(deep=True) for r in data]
-
-                        # apply new run_id to fresh_dataset
-                        for row in fresh_dataset:
-                            row.run_id = run_id
-
-                        # generate new rollout_id for each row
-                        for row in fresh_dataset:
-                            row.rollout_id = generate_id()
-
-                        # log the fresh_dataset
-                        for row in fresh_dataset:
-                            active_logger.log(row)
-
-                        # filter out rows that already have completed rollouts via checkpointing
-                        rows_to_process = []
-                        completed_rollout_ids = set()
-
-                        finished_logs = active_logger.read()
-
-                        for finished_row in finished_logs:
-                            # need to add finished rows to all_results so that we can aggregate them later.
-                            all_results.append(finished_row)
-                            # TODO: need to also add the num_run to track which run the row belongs to.
-                            # TODO: ask why we made row_id optional in the first place. checkpointing won't work without some ID.
-                            if finished_row.input_metadata and finished_row.input_metadata.row_id:
-                                completed_rollout_ids.add(finished_row.input_metadata.row_id)
-
-                        for row in fresh_dataset:
-                            row_id = row.input_metadata.row_id if row.input_metadata else None
-                            if row_id not in completed_rollout_ids:
-                                rows_to_process.append(row)
-
-                        if len(rows_to_process) < len(fresh_dataset):
-                            print(
-                                f"Checkpointing: Found {len(fresh_dataset) - len(rows_to_process)} completed rows, processing {len(rows_to_process)} remaining rows"
-                            )
-
-                        if rows_to_process:
-                            processed_dataset = execute_function(
-                                rollout_processor, rows=rows_to_process, config=config
-                            )
-
-                        if mode == "pointwise":
-                            # Pointwise mode: apply the evaluator function to each row
-                            for row in processed_dataset:
-                                result = execute_with_params(
+                        retry_attempt = 0
+                        current_data = data
+
+                        while retry_attempt <= max_retry:
+                            if retry_attempt > 0:
+                                logged_rows = active_logger.read()
+                                failed_rows = [
+                                    row
+                                    for row in logged_rows
+                                    if row.rollout_status
+                                    and row.rollout_status.status == "error"
+                                    and row.run_id == run_id
+                                ]
+                                if not failed_rows:
+                                    break
+                                current_data = failed_rows
+
+                            # Deep-copy the rows for this attempt (the full dataset first, then only the
+                            # failed rows on a retry) so model responses are not reused across runs.
+                            fresh_dataset = [r.model_copy(deep=True) for r in current_data]
+
+                            # apply new run_id to fresh_dataset
+                            for row in fresh_dataset:
+                                row.run_id = run_id
+
+                            # generate new rollout_id for each row
+                            for row in fresh_dataset:
+                                row.rollout_id = generate_id()
+
+                            # log the fresh_dataset
+                            for row in fresh_dataset:
+                                active_logger.log(row)
+
+                            processed_dataset = execute_function(rollout_processor, rows=fresh_dataset, config=config)
+
+                            if mode == "pointwise":
+                                # Pointwise mode: apply the evaluator function to each row
+                                for row in processed_dataset:
+                                    result = execute_with_params(
+                                        test_func,
+                                        processed_row=row,
+                                        evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
+                                    )
+                                    if result is None or not isinstance(result, EvaluationRow):
+                                        raise ValueError(
+                                            f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
+                                        )
+                                    # TODO: this is not that simple; only append results whose rollout did not error.
+                                    all_results[i].append(result)
+                            else:
+                                # Batch mode: call the test function with the full dataset
+                                results = execute_with_params(
                                     test_func,
-                                    processed_row=row,
+                                    processed_dataset=processed_dataset,
                                     evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
                                 )
-                                if result is None or not isinstance(result, EvaluationRow):
+                                if results is None:
                                     raise ValueError(
                                         f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
                                     )
-                                all_results[i].append(result)
-                        else:
-                            # Batch mode: call the test function with the full dataset
-                            results = execute_with_params(
-                                test_func,
-                                processed_dataset=processed_dataset,
-                                evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
-                            )
-                            if results is None:
-                                raise ValueError(
-                                    f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
-                                )
-                            if not isinstance(results, list):
-                                raise ValueError(
-                                    f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
-                                )
-                            if not results:
-                                raise ValueError(
-                                    f"Test function {test_func.__name__} returned an empty list. You must return a non-empty list of EvaluationRow instances from your test function decorated with @evaluation_test."
-                                )
-                            if not all(isinstance(r, EvaluationRow) for r in results):
-                                raise ValueError(
-                                    f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
+                                if not isinstance(results, list):
+                                    raise ValueError(
+                                        f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
+                                    )
+                                if not results:
+                                    raise ValueError(
+                                        f"Test function {test_func.__name__} returned an empty list. You must return a non-empty list of EvaluationRow instances from your test function decorated with @evaluation_test."
+                                    )
+                                if not all(isinstance(r, EvaluationRow) for r in results):
+                                    raise ValueError(
+                                        f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
+                                    )
+                                # TODO: this is not that simple; only keep results whose rollout did not error.
+                                all_results[i] = results
+
+                            retry_attempt += 1
+
+                        scores = [
+                            sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
+                            for result in all_results
+                        ]
+                        agg_score = aggregate(scores, aggregation_method)
+                        score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
+
+                        # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
+                        ci_low: float | None = None
+                        ci_high: float | None = None
+                        if aggregation_method == "mean":
+                            try:
+                                result_ci = compute_fixed_set_mu_ci(
+                                    [item for sublist in all_results for item in sublist]
                                 )
-                            all_results[i] = results
-
-                    scores = [
-                        sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
-                        for result in all_results
-                    ]
-                    agg_score = aggregate(scores, aggregation_method)
-                    score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
-
-                    # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
-                    ci_low: float | None = None
-                    ci_high: float | None = None
-                    if aggregation_method == "mean":
-                        try:
-                            result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
-                            mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
-                            if mu_ci_low is not None and mu_ci_high is not None:
-                                ci_low = float(mu_ci_low)
-                                ci_high = float(mu_ci_high)
-                                # Keep agg_score as-is (mean over scores). For equal repeats per question these match.
-                        except Exception:
-                            ci_low = None
-                            ci_high = None
-
-                    # Determine if the evaluation passed based on threshold
-                    passed = None
-
-                    if threshold is not None:
-                        success_passed, std_passed = True, True
-
-                        success_passed = agg_score >= threshold.success
+                                mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
+                                if mu_ci_low is not None and mu_ci_high is not None:
+                                    ci_low = float(mu_ci_low)
+                                    ci_high = float(mu_ci_high)
+                                    # Keep agg_score as-is (mean over scores). For equal repeats per question these match.
+                            except Exception:
+                                ci_low = None
+                                ci_high = None
 
-                        if threshold.standard_deviation is not None:
-                            std_passed = score_std <= threshold.standard_deviation
+                        # Determine if the evaluation passed based on threshold
+                        passed = None
+
+                        if threshold is not None:
+                            success_passed, std_passed = True, True
+
+                            success_passed = agg_score >= threshold.success
+
+                            if threshold.standard_deviation is not None:
+                                std_passed = score_std <= threshold.standard_deviation
 
-                        passed = success_passed and std_passed
+                            passed = success_passed and std_passed
 
                     # Update eval metadata status and passed field for all results
                     for result in all_results:
                         for r in result:
-                            if r.eval_metadata is not None:
-                                r.eval_metadata.status = "finished"
-                                r.eval_metadata.passed = passed
-                        default_logger.log(r)
+                            if r.rollout_status is not None:
+                                if r.rollout_status.status != "error":
+                                    r.rollout_status.status = "finished"
+                                r.rollout_status.passed = passed
+                            active_logger.log(r)
 
                     # Optional: print and/or persist a summary artifact for CI
                     try:
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
@@ -12,8 +12,8 @@
     max_dataset_rows value set in the decorator).
 """
 
-import os
 import logging
+import os
 from typing import Optional
 
 
@@ -32,17 +32,13 @@ def pytest_addoption(parser) -> None:
         "--ep-print-summary",
         action="store_true",
         default=False,
-        help=(
-            "Print a concise summary line (suite/model/effort/agg score) at the end of each evaluation_test."
-        ),
+        help=("Print a concise summary line (suite/model/effort/agg score) at the end of each evaluation_test."),
     )
     group.addoption(
         "--ep-summary-json",
         action="store",
         default=None,
-        help=(
-            "Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)."
-        ),
+        help=("Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)."),
     )
     group.addoption(
         "--ep-input-param",
@@ -63,6 +59,13 @@ def pytest_addoption(parser) -> None:
             "Values: low|medium|high"
         ),
     )
+    group.addoption(
+        "--ep-max-retry",
+        action="store",
+        type=int,
+        default=None,
+        help=("Failed rollouts (with rollout_status.status == 'error') will be retried up to this many times."),
+    )
 
 
 def _normalize_max_rows(val: Optional[str]) -> Optional[str]:
@@ -104,10 +107,15 @@ def pytest_configure(config) -> None:
     if summary_json_path:
         os.environ["EP_SUMMARY_JSON"] = summary_json_path
 
+    max_retry = config.getoption("--ep-max-retry")
+    if max_retry is not None:
+        os.environ["EP_MAX_RETRY"] = str(max_retry)
+
     # Allow ad-hoc overrides of input params via CLI flags
     try:
         import json as _json
         import pathlib as _pathlib
+
         merged: dict = {}
         input_params_opts = config.getoption("--ep-input-param")
         if input_params_opts:
@@ -140,5 +148,3 @@ def pytest_configure(config) -> None:
     except Exception:
         # best effort, do not crash pytest session
         pass
-
-
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
@@ -58,7 +58,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
 
         rows.append(eval_row)
 
-    return rows
+    return rows[0:1]
 
 
 @evaluation_test(
@@ -68,7 +68,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
     rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "low"}],
     rollout_processor=default_mcp_gym_rollout_processor,
     passed_threshold={"success": 0.4, "standard_deviation": 0.1},
-    num_runs=8,
+    num_runs=1,
     mode="pointwise",
     max_concurrent_rollouts=50,
     server_script_path="examples/tau2_mcp/server.py",
diff --git a/tests/test_rollout_error_handling.py b/tests/test_rollout_error_handling.py