File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 55and comparing the output against expected results in a pointwise manner.
66"""
77
8- import logging
9- import time
108from typing import Any , Dict , List
119
1210from eval_protocol .models import EvaluateResult , EvaluationRow , Message
1311from eval_protocol .pytest import default_single_turn_rollout_processor , evaluation_test
1412from eval_protocol .rewards .code_execution import execute_python_code , extract_code_blocks
1513
16- logger = logging .getLogger (__name__ )
17-
1814
1915def coding_dataset_to_evaluation_row (data : List [Dict [str , Any ]]) -> List [EvaluationRow ]:
2016 """
@@ -43,22 +39,18 @@ def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
4339 """
4440 Evaluation function that tests code correctness by executing it locally.
4541
46-
4742 This function:
4843 1. Extracts Python code from the assistant's response
4944 2. Executes the code locally with timeout=10
5045 3. Compares the output to ground_truth
5146 4. Returns a score of 1.0 if output matches, 0.0 otherwise
5247
53-
5448 Args:
5549 row: EvaluationRow containing the conversation messages and expected_output in ground_truth
5650
57-
5851 Returns:
5952 EvaluationRow with the evaluation result
6053 """
61- logger .info (f"STARTING TO EVALUATE ROW: { row .input_metadata .row_id } at time { time .time ()} " )
6254 # Check if we have an assistant response
6355 if len (row .messages ) < 2 or row .messages [- 1 ].role != "assistant" :
6456 row .evaluation_result = EvaluateResult (score = 0.0 , reason = "No assistant response found" )
You can’t perform that action at this time.
0 commit comments