diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index f48529ec..3b8de40a 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -129,7 +129,7 @@ def test_evaluator(): with open(eval_set_file_path, "w") as f: json.dump(EVAL_SET_DATA, f) - base_evaluator.generate_eval_data(file_path=eval_set_file_path) + base_evaluator.build_eval_set(file_path=eval_set_file_path) assert len(base_evaluator.invocation_list) == 1 assert len(base_evaluator.invocation_list[0].invocations) == 1 @@ -149,7 +149,7 @@ def test_tracing_file_to_evalset(): with open(tracing_file_path, "w") as f: json.dump(TRACE_SET_DATA, f) - base_evaluator.generate_eval_data(file_path=tracing_file_path) + base_evaluator.build_eval_set(file_path=tracing_file_path) assert len(base_evaluator.invocation_list) == 1 assert len(base_evaluator.invocation_list[0].invocations) == 1 diff --git a/veadk/evaluation/adk_evaluator/__init__.py b/veadk/evaluation/adk_evaluator/__init__.py index 7f463206..d5159d15 100644 --- a/veadk/evaluation/adk_evaluator/__init__.py +++ b/veadk/evaluation/adk_evaluator/__init__.py @@ -11,3 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .adk_evaluator import ADKEvaluator + +__all__ = ["ADKEvaluator"] diff --git a/veadk/evaluation/adk_evaluator/adk_evaluator.py b/veadk/evaluation/adk_evaluator/adk_evaluator.py index e152ec6e..e9798a49 100644 --- a/veadk/evaluation/adk_evaluator/adk_evaluator.py +++ b/veadk/evaluation/adk_evaluator/adk_evaluator.py @@ -16,30 +16,24 @@ import time import uuid from os import path -from typing import Any, Optional -from google.adk import Runner -from google.adk.agents.base_agent import BaseAgent -from google.adk.artifacts import BaseArtifactService, InMemoryArtifactService from google.adk.evaluation.agent_evaluator import ( - NUM_RUNS, RESPONSE_MATCH_SCORE_KEY, TOOL_TRAJECTORY_SCORE_KEY, AgentEvaluator, ) -from google.adk.evaluation.eval_case import IntermediateData, Invocation, SessionInput -from google.adk.evaluation.eval_set import EvalSet -from google.adk.evaluation.evaluation_generator import ( - EvalCaseResponses, - EvaluationGenerator, -) -from google.adk.evaluation.evaluator import EvalStatus, EvaluationResult -from google.adk.sessions import BaseSessionService, InMemorySessionService +from google.adk.evaluation.eval_case import IntermediateData, Invocation +from google.adk.evaluation.evaluator import EvalStatus from typing_extensions import override +from veadk.evaluation.base_evaluator import BaseEvaluator +from types import SimpleNamespace +from google.genai import types as genai_types -from veadk.agent import Agent - -from ..base_evaluator import BaseEvaluator +from google.adk.evaluation.eval_metrics import EvalMetric +from google.adk.evaluation.metric_evaluator_registry import ( + DEFAULT_METRIC_EVALUATOR_REGISTRY, +) +import inspect def formatted_timestamp(): @@ -47,186 +41,6 @@ def formatted_timestamp(): return time.strftime("%Y%m%d%H%M%S", time.localtime()) -class VeEvaluationGenerator(EvaluationGenerator): - @staticmethod - async def _ve_process_query( # done - invocations: list[Invocation], - agent: Agent, - agent_name: Optional[str] = None, - initial_session: Optional[SessionInput] = None, - ): - agent_to_evaluate = agent - if agent_name: - agent_to_evaluate = agent.find_agent(agent_name) - assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found." - - return await VeEvaluationGenerator._ve_generate_inferences_from_root_agent( - invocations, agent_to_evaluate, None, initial_session - ) - - @staticmethod - async def ve_generate_responses( # done - eval_set: EvalSet, - agent: Agent, - repeat_num: int = 3, - agent_name: str | None = None, - ): - results = [] - - for eval_case in eval_set.eval_cases: - responses = [] - for _ in range(repeat_num): - response_invocations = await VeEvaluationGenerator._ve_process_query( - invocations=eval_case.conversation, - agent=agent, - agent_name=agent_name, - initial_session=eval_case.session_input, - ) - responses.append(response_invocations) - - results.append(EvalCaseResponses(eval_case=eval_case, responses=responses)) - - return results - - @staticmethod - async def _ve_generate_inferences_from_root_agent( - invocations: list[Invocation], - root_agent: BaseAgent, - reset_func: Any, - initial_session: Optional[SessionInput] = None, - session_id: Optional[str] = None, - session_service: Optional[BaseSessionService] = None, - artifact_service: Optional[BaseArtifactService] = None, - ) -> list[Invocation]: - """Scrapes the root agent given the list of Invocations.""" - if not session_service: - session_service = InMemorySessionService() - - app_name = ( - initial_session.app_name if initial_session else "EvaluationGenerator" - ) - user_id = initial_session.user_id if initial_session else "test_user_id" - session_id = session_id if session_id else str(uuid.uuid4()) - - _ = await session_service.create_session( - app_name=app_name, - user_id=user_id, - state=initial_session.state if initial_session else {}, - session_id=session_id, - ) - - if not artifact_service: - artifact_service = InMemoryArtifactService() - - runner = Runner( - app_name=app_name, - agent=root_agent, - artifact_service=artifact_service, - session_service=session_service, - memory_service=root_agent.long_term_memory - if isinstance(root_agent, Agent) - else None, - ) - - # Reset agent state for each query - if callable(reset_func): - reset_func() - - response_invocations = [] - - for invocation in invocations: - final_response = None - user_content = invocation.user_content - tool_uses = [] - invocation_id = "" - - async for event in runner.run_async( - user_id=user_id, session_id=session_id, new_message=user_content - ): - invocation_id = ( - event.invocation_id if not invocation_id else invocation_id - ) - - if event.is_final_response() and event.content and event.content.parts: - final_response = event.content - elif event.get_function_calls(): - for call in event.get_function_calls(): - tool_uses.append(call) - - response_invocations.append( - Invocation( - invocation_id=invocation_id, - user_content=user_content, - final_response=final_response, - intermediate_data=IntermediateData(tool_uses=tool_uses), - ) - ) - - return response_invocations - - -class VeAgentEvaluator(AgentEvaluator): - def __init__( - self, - ): - super().__init__() - - @staticmethod - async def ve_evaluate_eval_set( - agent: Agent, - eval_set: EvalSet, - criteria: dict[str, float], - num_runs=NUM_RUNS, - agent_name=None, - print_detailed_results: bool = True, - ): - eval_case_responses_list = await VeEvaluationGenerator.ve_generate_responses( - eval_set=eval_set, - agent=agent, - repeat_num=num_runs, - agent_name=agent_name, - ) - failures = [] - evaluation_result_list = [] - - for eval_case_responses in eval_case_responses_list: - actual_invocations = [ - invocation - for invocations in eval_case_responses.responses - for invocation in invocations - ] - expected_invocations = eval_case_responses.eval_case.conversation * num_runs - - for metric_name, threshold in criteria.items(): - metric_evaluator = AgentEvaluator._get_metric_evaluator( - metric_name=metric_name, threshold=threshold - ) - - evaluation_result: EvaluationResult = ( - metric_evaluator.evaluate_invocations( - actual_invocations=actual_invocations, - expected_invocations=expected_invocations, - ) - ) - - if print_detailed_results: - AgentEvaluator._print_details( - evaluation_result=evaluation_result, - metric_name=metric_name, - threshold=threshold, - ) - - # Gather all the failures. - if evaluation_result.overall_eval_status != EvalStatus.PASSED: - failures.append( - f"{metric_name} for {agent.name} Failed. Expected {threshold}," - f" but got {evaluation_result.overall_score}." - ) - evaluation_result_list.append(evaluation_result) - - return evaluation_result_list, failures - - class ADKEvaluator(BaseEvaluator): def __init__( self, @@ -235,10 +49,8 @@ def __init__( ): super().__init__(agent=agent, name=name) - # TODO: implement - @override - async def eval( + async def evaluate( self, eval_set_file_path: str, eval_id: str = f"test_{formatted_timestamp()}", @@ -247,6 +59,26 @@ async def eval( num_runs: int = 2, print_detailed_results: bool = True, ): + """ + End-to-end evaluation flow: + 1) Discover test files (.test.json) or accept a single path. + 2) Build metric criteria (metric_name -> threshold). + 3) For each file, build in-memory eval cases via BaseEvaluator. + 4) For each eval case, construct expected ADK Invocations from expected data. + 5) Repeat for num_runs: + - Reset all session_ids to isolate state. + - Generate actual outputs via BaseEvaluator and convert to ADK Invocations. + 6) Repeat expected invocations to match num_runs for 1:1 alignment. + 7) For each metric: + - Create EvalMetric and get the evaluator from ADK's registry. + - Call evaluate_invocations (await if async) to get EvaluationResult with: + overall_score/overall_eval_status + per_invocation_results. + - Optionally pretty print via AgentEvaluator._print_details. + - Record failure if overall status != PASSED. + 8) Return (all evaluation_result objects, failures) to the caller. + """ + + # Resolve eval files: accept a directory (scan *.test.json) or a single file test_files = [] eval_dataset_file_path_or_dir = eval_set_file_path if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir( @@ -259,28 +91,149 @@ async def eval( else: test_files = [eval_dataset_file_path_or_dir] - initial_session = AgentEvaluator._get_initial_session() + # Build metric criteria (metric_name -> threshold) + criteria = { + TOOL_TRAJECTORY_SCORE_KEY: tool_score_threshold, # 1-point scale; 1.0 means perfect tool call trajectory + RESPONSE_MATCH_SCORE_KEY: response_match_score_threshold, # Rouge-1 text match; 0.8 default threshold + } + # Aggregate all evaluation results and failures across files result = [] failures = [] + + # Iterate each test file and evaluate per-case, per-metric for test_file in test_files: - criteria = { - TOOL_TRAJECTORY_SCORE_KEY: tool_score_threshold, # 1-point scale; 1.0 is perfect. - RESPONSE_MATCH_SCORE_KEY: response_match_score_threshold, # Rouge-1 text match; 0.8 is default. - } - eval_set = AgentEvaluator._load_eval_set_from_file( - test_file, criteria, initial_session - ) + # Build in-memory evaluation cases via BaseEvaluator from the provided file + self.build_eval_set(test_file) + + evaluation_result_list = [] + + # For each eval case, generate actual outputs num_runs times using BaseEvaluator + for case_idx, eval_case_data in enumerate(self.invocation_list): + # Convert BaseEvaluator's expected data into ADK Invocation list + expected_invocations: list[Invocation] = [] + for inv in eval_case_data.invocations: + user_content = genai_types.Content( + role="user", + parts=[genai_types.Part(text=inv.input or "")], + ) + expected_final = genai_types.Content( + role=None, + parts=[genai_types.Part(text=inv.expected_output or "")], + ) + expected_tool_calls = [ + SimpleNamespace(name=t.get("name"), args=t.get("args", {})) + for t in (inv.expected_tool or []) + ] + # Pack a full expected Invocation for ADK metrics + expected_invocations.append( + Invocation( + invocation_id=inv.invocation_id, + user_content=user_content, + final_response=expected_final, + intermediate_data=IntermediateData( + tool_uses=expected_tool_calls + ), + ) + ) + + # Collect actual invocations across runs + actual_invocations_all_runs: list[Invocation] = [] + for _ in range(num_runs): + for agent_information in self.agent_information_list: + agent_information["session_id"] = str(uuid.uuid4()) + + # Generate actual outputs for all cases in this run via BaseEvaluator + await self.generate_actual_outputs() + + # Convert BaseEvaluator's actual data into ADK Invocation list + for inv in eval_case_data.invocations: + user_content = genai_types.Content( + role="user", + parts=[genai_types.Part(text=inv.input or "")], + ) + actual_final = genai_types.Content( + role=None, + parts=[genai_types.Part(text=inv.actual_output or "")], + ) + # Collect the tool calls observed during actual execution + actual_tool_calls = [ + SimpleNamespace(name=t.get("name"), args=t.get("args", {})) + for t in (inv.actual_tool or []) + ] + # Pack a full actual Invocation for ADK metrics + actual_invocations_all_runs.append( + Invocation( + invocation_id=inv.invocation_id, + user_content=user_content, + final_response=actual_final, + intermediate_data=IntermediateData( + tool_uses=actual_tool_calls + ), + ) + ) + + # Repeat expected invocations to align with num_runs + expected_invocations_repeated = expected_invocations * num_runs + + # Evaluate per metric via ADK metric evaluators obtained from the registry + for metric_name, threshold in criteria.items(): + eval_metric = EvalMetric( + metric_name=metric_name, threshold=threshold + ) + metric_evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator( + eval_metric=eval_metric + ) - res, fail = await VeAgentEvaluator.ve_evaluate_eval_set( - agent=self.agent, - eval_set=eval_set, - criteria=criteria, - num_runs=num_runs, - agent_name=self.agent.name, - print_detailed_results=print_detailed_results, - ) - result.append(res) - failures.extend(fail) + if inspect.iscoroutinefunction( + metric_evaluator.evaluate_invocations + ): + evaluation_result = await metric_evaluator.evaluate_invocations( + actual_invocations=actual_invocations_all_runs, + expected_invocations=expected_invocations_repeated, + ) + else: + evaluation_result = metric_evaluator.evaluate_invocations( + actual_invocations=actual_invocations_all_runs, + expected_invocations=expected_invocations_repeated, + ) + + if print_detailed_results: + per_items = [] + for i, per in enumerate( + getattr(evaluation_result, "per_invocation_results", []) + or [] + ): + per_items.append( + SimpleNamespace( + actual_invocation=actual_invocations_all_runs[i], + expected_invocation=expected_invocations_repeated[ + i + ], + eval_metric_result=SimpleNamespace( + eval_status=per.eval_status, + score=per.score, + threshold=threshold, + ), + ) + ) + + AgentEvaluator._print_details( + eval_metric_result_with_invocations=per_items, + overall_eval_status=evaluation_result.overall_eval_status, + overall_score=evaluation_result.overall_score, + metric_name=metric_name, + threshold=threshold, + ) + + if evaluation_result.overall_eval_status != EvalStatus.PASSED: + failures.append( + f"{metric_name} for {self.agent.name} Failed. Expected {threshold}," + f" but got {evaluation_result.overall_score}." + ) + + evaluation_result_list.append(evaluation_result) + + result.append(evaluation_result_list) return result, failures diff --git a/veadk/evaluation/base_evaluator.py b/veadk/evaluation/base_evaluator.py index d02f5eaa..53348027 100644 --- a/veadk/evaluation/base_evaluator.py +++ b/veadk/evaluation/base_evaluator.py @@ -28,7 +28,13 @@ from veadk.utils.misc import formatted_timestamp -class InvocationTestData(BaseModel): +class ToolInvocation(BaseModel): + tool_name: str + tool_args: dict[str, Any] = {} + tool_result: Any = None + + +class Invocation(BaseModel): invocation_id: str = "" input: str actual_output: str @@ -38,8 +44,8 @@ class InvocationTestData(BaseModel): latency: str = "" # ms -class EvalCaseData(BaseModel): - invocations: list[InvocationTestData] +class EvalTestCase(BaseModel): + invocations: list[Invocation] class MetricResult(BaseModel): @@ -78,23 +84,23 @@ def __init__( ): self.name = name self.agent = agent - self.invocation_list: list[EvalCaseData] = [] + self.invocation_list: list[EvalTestCase] = [] self.result_list: list[EvalResultData] = [] self.agent_information_list: list[dict] = [] - def _load_eval_set(self, eval_set_file: str) -> EvalSet: - from .eval_set_file_loader import load_eval_set_from_file + def _build_eval_set_from_eval_json(self, eval_json_path: str) -> EvalSet: + from veadk.evaluation.eval_set_file_loader import load_eval_set_from_file - return load_eval_set_from_file(eval_set_file) + return load_eval_set_from_file(eval_json_path) - def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet: + def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet: try: - with open(tracing_file, "r") as f: + with open(tracing_json_path, "r") as f: tracing_data = json.load(f) except json.JSONDecodeError as e: - raise ValueError(f"Invalid JSON format in file {tracing_file}: {e}") + raise ValueError(f"Invalid JSON format in file {tracing_json_path}: {e}") except Exception as e: - raise ValueError(f"Error reading file {tracing_file}: {e}") + raise ValueError(f"Error reading file {tracing_json_path}: {e}") # Group spans by trace_id trace_groups = {} @@ -188,9 +194,9 @@ def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet: return evalset - def generate_eval_data(self, file_path: str): + def build_eval_set(self, file_path: str): """Generate evaluation data from a given file and assign it to the class attribute `invocation_list`.""" - eval_case_data_list: list[EvalCaseData] = [] + eval_case_data_list: list[EvalTestCase] = [] try: with open(file_path, "r") as f: @@ -201,7 +207,7 @@ def generate_eval_data(self, file_path: str): raise ValueError(f"Error reading file {file_path}: {e}") if isinstance(file_content, dict) and "eval_cases" in file_content: - eval_cases = self._load_eval_set(file_path).eval_cases + eval_cases = self._build_eval_set_from_eval_json(file_path).eval_cases elif ( isinstance(file_content, list) and len(file_content) > 0 @@ -209,14 +215,14 @@ def generate_eval_data(self, file_path: str): isinstance(span, dict) and "trace_id" in span for span in file_content ) ): - eval_cases = self._load_eval_set_from_tracing(file_path).eval_cases + eval_cases = self._build_eval_set_from_tracing_json(file_path).eval_cases else: raise ValueError( f"Unsupported file format in {file_path}. Please provide a valid file." ) for eval_case in eval_cases: - eval_case_data = EvalCaseData(invocations=[]) + eval_case_data = EvalTestCase(invocations=[]) if eval_case.session_input: self.agent_information_list.append( { @@ -247,7 +253,7 @@ def generate_eval_data(self, file_path: str): ) eval_case_data.invocations.append( - InvocationTestData( + Invocation( invocation_id=invocation.invocation_id, input=_input, actual_output="", @@ -261,7 +267,7 @@ def generate_eval_data(self, file_path: str): eval_case_data_list.append(eval_case_data) self.invocation_list = eval_case_data_list - async def _run_agent_for_actual_data(self): + async def generate_actual_outputs(self): for eval_case_data, agent_information in zip( self.invocation_list, self.agent_information_list ): @@ -333,7 +339,7 @@ async def _run_agent_for_actual_data(self): invocation.actual_tool = _actual_tool invocation.latency = _latency - def get_data(self) -> list[list[dict[str, Any]]]: + def get_eval_set_information(self) -> list[list[dict[str, Any]]]: """Merge the evaluation data and return it in the format of list[list[dict]]""" result = [] for i, eval_case in enumerate(self.invocation_list): @@ -360,7 +366,7 @@ def get_data(self) -> list[list[dict[str, Any]]]: return result @abstractmethod - async def eval( + async def evaluate( self, eval_set_file_path: str, metrics: list[Any], diff --git a/veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py b/veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py index ed656aa0..3f60bc82 100644 --- a/veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +++ b/veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py @@ -27,8 +27,11 @@ from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata from veadk.utils.logger import get_logger -from ..base_evaluator import BaseEvaluator, EvalResultData, MetricResult -from ..utils.prometheus import PrometheusPushgatewayConfig, push_to_prometheus +from veadk.evaluation.base_evaluator import BaseEvaluator, EvalResultData, MetricResult +from veadk.evaluation.utils.prometheus import ( + PrometheusPushgatewayConfig, + push_to_prometheus, +) logger = get_logger(__name__) @@ -66,7 +69,7 @@ def __init__( self.prometheus_config = prometheus_config @override - async def eval( + async def evaluate( self, eval_set_file_path: str, metrics: list[BaseMetric], @@ -74,11 +77,11 @@ async def eval( ): """Target to Google ADK, we will use the same evaluation case format as Google ADK.""" # Get evaluation data by parsing eval set file - self.generate_eval_data(eval_set_file_path) + self.build_eval_set(eval_set_file_path) # Get actual data by running agent logger.info("Start to run agent for actual data.") - await self._run_agent_for_actual_data() + await self.generate_actual_outputs() eval_case_data_list = self.invocation_list # Build test cases in Deepeval format diff --git a/veadk_tutorial.ipynb b/veadk_tutorial.ipynb index 0289432f..d36053d6 100644 --- a/veadk_tutorial.ipynb +++ b/veadk_tutorial.ipynb @@ -1788,7 +1788,7 @@ " ToolCorrectnessMetric(threshold=0.5),\n", "]\n", "\n", - "await evaluator.eval(eval_set_file_path=eval_set_path, metrics=metrics)" + "await evaluator.evaluate(eval_set_file_path=eval_set_path, metrics=metrics)" ] }, {