diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py
index f48529ec..3b8de40a 100644
--- a/tests/test_evaluator.py
+++ b/tests/test_evaluator.py
@@ -129,7 +129,7 @@ def test_evaluator():
     with open(eval_set_file_path, "w") as f:
         json.dump(EVAL_SET_DATA, f)
 
-    base_evaluator.generate_eval_data(file_path=eval_set_file_path)
+    base_evaluator.build_eval_set(file_path=eval_set_file_path)
 
     assert len(base_evaluator.invocation_list) == 1
     assert len(base_evaluator.invocation_list[0].invocations) == 1
@@ -149,7 +149,7 @@ def test_tracing_file_to_evalset():
     with open(tracing_file_path, "w") as f:
         json.dump(TRACE_SET_DATA, f)
 
-    base_evaluator.generate_eval_data(file_path=tracing_file_path)
+    base_evaluator.build_eval_set(file_path=tracing_file_path)
 
     assert len(base_evaluator.invocation_list) == 1
     assert len(base_evaluator.invocation_list[0].invocations) == 1
diff --git a/veadk/evaluation/adk_evaluator/__init__.py b/veadk/evaluation/adk_evaluator/__init__.py
index 7f463206..d5159d15 100644
--- a/veadk/evaluation/adk_evaluator/__init__.py
+++ b/veadk/evaluation/adk_evaluator/__init__.py
@@ -11,3 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .adk_evaluator import ADKEvaluator
+
+__all__ = ["ADKEvaluator"]
diff --git a/veadk/evaluation/adk_evaluator/adk_evaluator.py b/veadk/evaluation/adk_evaluator/adk_evaluator.py
index e152ec6e..e9798a49 100644
--- a/veadk/evaluation/adk_evaluator/adk_evaluator.py
+++ b/veadk/evaluation/adk_evaluator/adk_evaluator.py
@@ -16,30 +16,24 @@
 import time
 import uuid
 from os import path
-from typing import Any, Optional
 
-from google.adk import Runner
-from google.adk.agents.base_agent import BaseAgent
-from google.adk.artifacts import BaseArtifactService, InMemoryArtifactService
 from google.adk.evaluation.agent_evaluator import (
-    NUM_RUNS,
     RESPONSE_MATCH_SCORE_KEY,
     TOOL_TRAJECTORY_SCORE_KEY,
     AgentEvaluator,
 )
-from google.adk.evaluation.eval_case import IntermediateData, Invocation, SessionInput
-from google.adk.evaluation.eval_set import EvalSet
-from google.adk.evaluation.evaluation_generator import (
-    EvalCaseResponses,
-    EvaluationGenerator,
-)
-from google.adk.evaluation.evaluator import EvalStatus, EvaluationResult
-from google.adk.sessions import BaseSessionService, InMemorySessionService
+from google.adk.evaluation.eval_case import IntermediateData, Invocation
+from google.adk.evaluation.evaluator import EvalStatus
 from typing_extensions import override
+from veadk.evaluation.base_evaluator import BaseEvaluator
+from types import SimpleNamespace
+from google.genai import types as genai_types
 
-from veadk.agent import Agent
-
-from ..base_evaluator import BaseEvaluator
+from google.adk.evaluation.eval_metrics import EvalMetric
+from google.adk.evaluation.metric_evaluator_registry import (
+    DEFAULT_METRIC_EVALUATOR_REGISTRY,
+)
+import inspect
 
 
 def formatted_timestamp():
@@ -47,186 +41,6 @@ def formatted_timestamp():
     return time.strftime("%Y%m%d%H%M%S", time.localtime())
 
 
-class VeEvaluationGenerator(EvaluationGenerator):
-    @staticmethod
-    async def _ve_process_query(  # done
-        invocations: list[Invocation],
-        agent: Agent,
-        agent_name: Optional[str] = None,
-        initial_session: Optional[SessionInput] = None,
-    ):
-        agent_to_evaluate = agent
-        if agent_name:
-            agent_to_evaluate = agent.find_agent(agent_name)
-            assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."
-
-        return await VeEvaluationGenerator._ve_generate_inferences_from_root_agent(
-            invocations, agent_to_evaluate, None, initial_session
-        )
-
-    @staticmethod
-    async def ve_generate_responses(  # done
-        eval_set: EvalSet,
-        agent: Agent,
-        repeat_num: int = 3,
-        agent_name: str | None = None,
-    ):
-        results = []
-
-        for eval_case in eval_set.eval_cases:
-            responses = []
-            for _ in range(repeat_num):
-                response_invocations = await VeEvaluationGenerator._ve_process_query(
-                    invocations=eval_case.conversation,
-                    agent=agent,
-                    agent_name=agent_name,
-                    initial_session=eval_case.session_input,
-                )
-                responses.append(response_invocations)
-
-            results.append(EvalCaseResponses(eval_case=eval_case, responses=responses))
-
-        return results
-
-    @staticmethod
-    async def _ve_generate_inferences_from_root_agent(
-        invocations: list[Invocation],
-        root_agent: BaseAgent,
-        reset_func: Any,
-        initial_session: Optional[SessionInput] = None,
-        session_id: Optional[str] = None,
-        session_service: Optional[BaseSessionService] = None,
-        artifact_service: Optional[BaseArtifactService] = None,
-    ) -> list[Invocation]:
-        """Scrapes the root agent given the list of Invocations."""
-        if not session_service:
-            session_service = InMemorySessionService()
-
-        app_name = (
-            initial_session.app_name if initial_session else "EvaluationGenerator"
-        )
-        user_id = initial_session.user_id if initial_session else "test_user_id"
-        session_id = session_id if session_id else str(uuid.uuid4())
-
-        _ = await session_service.create_session(
-            app_name=app_name,
-            user_id=user_id,
-            state=initial_session.state if initial_session else {},
-            session_id=session_id,
-        )
-
-        if not artifact_service:
-            artifact_service = InMemoryArtifactService()
-
-        runner = Runner(
-            app_name=app_name,
-            agent=root_agent,
-            artifact_service=artifact_service,
-            session_service=session_service,
-            memory_service=root_agent.long_term_memory
-            if isinstance(root_agent, Agent)
-            else None,
-        )
-
-        # Reset agent state for each query
-        if callable(reset_func):
-            reset_func()
-
-        response_invocations = []
-
-        for invocation in invocations:
-            final_response = None
-            user_content = invocation.user_content
-            tool_uses = []
-            invocation_id = ""
-
-            async for event in runner.run_async(
-                user_id=user_id, session_id=session_id, new_message=user_content
-            ):
-                invocation_id = (
-                    event.invocation_id if not invocation_id else invocation_id
-                )
-
-                if event.is_final_response() and event.content and event.content.parts:
-                    final_response = event.content
-                elif event.get_function_calls():
-                    for call in event.get_function_calls():
-                        tool_uses.append(call)
-
-            response_invocations.append(
-                Invocation(
-                    invocation_id=invocation_id,
-                    user_content=user_content,
-                    final_response=final_response,
-                    intermediate_data=IntermediateData(tool_uses=tool_uses),
-                )
-            )
-
-        return response_invocations
-
-
-class VeAgentEvaluator(AgentEvaluator):
-    def __init__(
-        self,
-    ):
-        super().__init__()
-
-    @staticmethod
-    async def ve_evaluate_eval_set(
-        agent: Agent,
-        eval_set: EvalSet,
-        criteria: dict[str, float],
-        num_runs=NUM_RUNS,
-        agent_name=None,
-        print_detailed_results: bool = True,
-    ):
-        eval_case_responses_list = await VeEvaluationGenerator.ve_generate_responses(
-            eval_set=eval_set,
-            agent=agent,
-            repeat_num=num_runs,
-            agent_name=agent_name,
-        )
-        failures = []
-        evaluation_result_list = []
-
-        for eval_case_responses in eval_case_responses_list:
-            actual_invocations = [
-                invocation
-                for invocations in eval_case_responses.responses
-                for invocation in invocations
-            ]
-            expected_invocations = eval_case_responses.eval_case.conversation * num_runs
-
-            for metric_name, threshold in criteria.items():
-                metric_evaluator = AgentEvaluator._get_metric_evaluator(
-                    metric_name=metric_name, threshold=threshold
-                )
-
-                evaluation_result: EvaluationResult = (
-                    metric_evaluator.evaluate_invocations(
-                        actual_invocations=actual_invocations,
-                        expected_invocations=expected_invocations,
-                    )
-                )
-
-                if print_detailed_results:
-                    AgentEvaluator._print_details(
-                        evaluation_result=evaluation_result,
-                        metric_name=metric_name,
-                        threshold=threshold,
-                    )
-
-                # Gather all the failures.
-                if evaluation_result.overall_eval_status != EvalStatus.PASSED:
-                    failures.append(
-                        f"{metric_name} for {agent.name} Failed. Expected {threshold},"
-                        f" but got {evaluation_result.overall_score}."
-                    )
-                evaluation_result_list.append(evaluation_result)
-
-        return evaluation_result_list, failures
-
-
 class ADKEvaluator(BaseEvaluator):
     def __init__(
         self,
@@ -235,10 +49,8 @@ def __init__(
     ):
         super().__init__(agent=agent, name=name)
 
-    # TODO: implement
-
     @override
-    async def eval(
+    async def evaluate(
         self,
         eval_set_file_path: str,
         eval_id: str = f"test_{formatted_timestamp()}",
@@ -247,6 +59,26 @@ async def eval(
         num_runs: int = 2,
         print_detailed_results: bool = True,
     ):
+        """
+        End-to-end evaluation flow:
+        1) Discover test files (.test.json) or accept a single path.
+        2) Build metric criteria (metric_name -> threshold).
+        3) For each file, build in-memory eval cases via BaseEvaluator.
+        4) For each eval case, construct expected ADK Invocations from expected data.
+        5) Repeat for num_runs:
+           - Reset all session_ids to isolate state.
+           - Generate actual outputs via BaseEvaluator and convert to ADK Invocations.
+        6) Repeat expected invocations to match num_runs for 1:1 alignment.
+        7) For each metric:
+           - Create EvalMetric and get the evaluator from ADK's registry.
+           - Call evaluate_invocations (await if async) to get EvaluationResult with:
+             overall_score/overall_eval_status + per_invocation_results.
+           - Optionally pretty print via AgentEvaluator._print_details.
+           - Record failure if overall status != PASSED.
+        8) Return (all evaluation_result objects, failures) to the caller.
+        """
+
+        # Resolve eval files: accept a directory (scan *.test.json) or a single file
         test_files = []
         eval_dataset_file_path_or_dir = eval_set_file_path
         if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir(
@@ -259,28 +91,149 @@ async def eval(
         else:
             test_files = [eval_dataset_file_path_or_dir]
 
-        initial_session = AgentEvaluator._get_initial_session()
+        # Build metric criteria (metric_name -> threshold)
+        criteria = {
+            TOOL_TRAJECTORY_SCORE_KEY: tool_score_threshold,  # 1-point scale; 1.0 means perfect tool call trajectory
+            RESPONSE_MATCH_SCORE_KEY: response_match_score_threshold,  # Rouge-1 text match; 0.8 default threshold
+        }
 
+        # Aggregate all evaluation results and failures across files
         result = []
         failures = []
+
+        # Iterate each test file and evaluate per-case, per-metric
         for test_file in test_files:
-            criteria = {
-                TOOL_TRAJECTORY_SCORE_KEY: tool_score_threshold,  # 1-point scale; 1.0 is perfect.
-                RESPONSE_MATCH_SCORE_KEY: response_match_score_threshold,  # Rouge-1 text match; 0.8 is default.
-            }
-            eval_set = AgentEvaluator._load_eval_set_from_file(
-                test_file, criteria, initial_session
-            )
+            # Build in-memory evaluation cases via BaseEvaluator from the provided file
+            self.build_eval_set(test_file)
+
+            evaluation_result_list = []
+
+            # For each eval case, generate actual outputs num_runs times using BaseEvaluator
+            for case_idx, eval_case_data in enumerate(self.invocation_list):
+                # Convert BaseEvaluator's expected data into ADK Invocation list
+                expected_invocations: list[Invocation] = []
+                for inv in eval_case_data.invocations:
+                    user_content = genai_types.Content(
+                        role="user",
+                        parts=[genai_types.Part(text=inv.input or "")],
+                    )
+                    expected_final = genai_types.Content(
+                        role=None,
+                        parts=[genai_types.Part(text=inv.expected_output or "")],
+                    )
+                    expected_tool_calls = [
+                        SimpleNamespace(name=t.get("name"), args=t.get("args", {}))
+                        for t in (inv.expected_tool or [])
+                    ]
+                    # Pack a full expected Invocation for ADK metrics
+                    expected_invocations.append(
+                        Invocation(
+                            invocation_id=inv.invocation_id,
+                            user_content=user_content,
+                            final_response=expected_final,
+                            intermediate_data=IntermediateData(
+                                tool_uses=expected_tool_calls
+                            ),
+                        )
+                    )
+
+                # Collect actual invocations across runs
+                actual_invocations_all_runs: list[Invocation] = []
+                for _ in range(num_runs):
+                    for agent_information in self.agent_information_list:
+                        agent_information["session_id"] = str(uuid.uuid4())
+
+                    # Generate actual outputs for all cases in this run via BaseEvaluator
+                    await self.generate_actual_outputs()
+
+                    # Convert BaseEvaluator's actual data into ADK Invocation list
+                    for inv in eval_case_data.invocations:
+                        user_content = genai_types.Content(
+                            role="user",
+                            parts=[genai_types.Part(text=inv.input or "")],
+                        )
+                        actual_final = genai_types.Content(
+                            role=None,
+                            parts=[genai_types.Part(text=inv.actual_output or "")],
+                        )
+                        # Collect the tool calls observed during actual execution
+                        actual_tool_calls = [
+                            SimpleNamespace(name=t.get("name"), args=t.get("args", {}))
+                            for t in (inv.actual_tool or [])
+                        ]
+                        # Pack a full actual Invocation for ADK metrics
+                        actual_invocations_all_runs.append(
+                            Invocation(
+                                invocation_id=inv.invocation_id,
+                                user_content=user_content,
+                                final_response=actual_final,
+                                intermediate_data=IntermediateData(
+                                    tool_uses=actual_tool_calls
+                                ),
+                            )
+                        )
+
+                # Repeat expected invocations to align with num_runs
+                expected_invocations_repeated = expected_invocations * num_runs
+
+                # Evaluate per metric via ADK metric evaluators obtained from the registry
+                for metric_name, threshold in criteria.items():
+                    eval_metric = EvalMetric(
+                        metric_name=metric_name, threshold=threshold
+                    )
+                    metric_evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(
+                        eval_metric=eval_metric
+                    )
 
-            res, fail = await VeAgentEvaluator.ve_evaluate_eval_set(
-                agent=self.agent,
-                eval_set=eval_set,
-                criteria=criteria,
-                num_runs=num_runs,
-                agent_name=self.agent.name,
-                print_detailed_results=print_detailed_results,
-            )
-            result.append(res)
-            failures.extend(fail)
+                    if inspect.iscoroutinefunction(
+                        metric_evaluator.evaluate_invocations
+                    ):
+                        evaluation_result = await metric_evaluator.evaluate_invocations(
+                            actual_invocations=actual_invocations_all_runs,
+                            expected_invocations=expected_invocations_repeated,
+                        )
+                    else:
+                        evaluation_result = metric_evaluator.evaluate_invocations(
+                            actual_invocations=actual_invocations_all_runs,
+                            expected_invocations=expected_invocations_repeated,
+                        )
+
+                    if print_detailed_results:
+                        per_items = []
+                        for i, per in enumerate(
+                            getattr(evaluation_result, "per_invocation_results", [])
+                            or []
+                        ):
+                            per_items.append(
+                                SimpleNamespace(
+                                    actual_invocation=actual_invocations_all_runs[i],
+                                    expected_invocation=expected_invocations_repeated[
+                                        i
+                                    ],
+                                    eval_metric_result=SimpleNamespace(
+                                        eval_status=per.eval_status,
+                                        score=per.score,
+                                        threshold=threshold,
+                                    ),
+                                )
+                            )
+
+                        AgentEvaluator._print_details(
+                            eval_metric_result_with_invocations=per_items,
+                            overall_eval_status=evaluation_result.overall_eval_status,
+                            overall_score=evaluation_result.overall_score,
+                            metric_name=metric_name,
+                            threshold=threshold,
+                        )
+
+                    if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+                        failures.append(
+                            f"{metric_name} for {self.agent.name} Failed. Expected {threshold},"
+                            f" but got {evaluation_result.overall_score}."
+                        )
+
+                    evaluation_result_list.append(evaluation_result)
+
+            result.append(evaluation_result_list)
 
         return result, failures
diff --git a/veadk/evaluation/base_evaluator.py b/veadk/evaluation/base_evaluator.py
index d02f5eaa..53348027 100644
--- a/veadk/evaluation/base_evaluator.py
+++ b/veadk/evaluation/base_evaluator.py
@@ -28,7 +28,13 @@
 from veadk.utils.misc import formatted_timestamp
 
 
-class InvocationTestData(BaseModel):
+class ToolInvocation(BaseModel):
+    tool_name: str
+    tool_args: dict[str, Any] = {}
+    tool_result: Any = None
+
+
+class Invocation(BaseModel):
     invocation_id: str = ""
     input: str
     actual_output: str
@@ -38,8 +44,8 @@ class InvocationTestData(BaseModel):
     latency: str = ""  # ms
 
 
-class EvalCaseData(BaseModel):
-    invocations: list[InvocationTestData]
+class EvalTestCase(BaseModel):
+    invocations: list[Invocation]
 
 
 class MetricResult(BaseModel):
@@ -78,23 +84,23 @@ def __init__(
     ):
         self.name = name
         self.agent = agent
-        self.invocation_list: list[EvalCaseData] = []
+        self.invocation_list: list[EvalTestCase] = []
         self.result_list: list[EvalResultData] = []
         self.agent_information_list: list[dict] = []
 
-    def _load_eval_set(self, eval_set_file: str) -> EvalSet:
-        from .eval_set_file_loader import load_eval_set_from_file
+    def _build_eval_set_from_eval_json(self, eval_json_path: str) -> EvalSet:
+        from veadk.evaluation.eval_set_file_loader import load_eval_set_from_file
 
-        return load_eval_set_from_file(eval_set_file)
+        return load_eval_set_from_file(eval_json_path)
 
-    def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:
+    def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:
         try:
-            with open(tracing_file, "r") as f:
+            with open(tracing_json_path, "r") as f:
                 tracing_data = json.load(f)
         except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON format in file {tracing_file}: {e}")
+            raise ValueError(f"Invalid JSON format in file {tracing_json_path}: {e}")
         except Exception as e:
-            raise ValueError(f"Error reading file {tracing_file}: {e}")
+            raise ValueError(f"Error reading file {tracing_json_path}: {e}")
 
         # Group spans by trace_id
         trace_groups = {}
@@ -188,9 +194,9 @@ def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:
 
         return evalset
 
-    def generate_eval_data(self, file_path: str):
+    def build_eval_set(self, file_path: str):
         """Generate evaluation data from a given file and assign it to the class attribute `invocation_list`."""
-        eval_case_data_list: list[EvalCaseData] = []
+        eval_case_data_list: list[EvalTestCase] = []
 
         try:
             with open(file_path, "r") as f:
@@ -201,7 +207,7 @@ def generate_eval_data(self, file_path: str):
             raise ValueError(f"Error reading file {file_path}: {e}")
 
         if isinstance(file_content, dict) and "eval_cases" in file_content:
-            eval_cases = self._load_eval_set(file_path).eval_cases
+            eval_cases = self._build_eval_set_from_eval_json(file_path).eval_cases
         elif (
             isinstance(file_content, list)
             and len(file_content) > 0
@@ -209,14 +215,14 @@ def generate_eval_data(self, file_path: str):
                 isinstance(span, dict) and "trace_id" in span for span in file_content
             )
         ):
-            eval_cases = self._load_eval_set_from_tracing(file_path).eval_cases
+            eval_cases = self._build_eval_set_from_tracing_json(file_path).eval_cases
         else:
             raise ValueError(
                 f"Unsupported file format in {file_path}. Please provide a valid file."
             )
 
         for eval_case in eval_cases:
-            eval_case_data = EvalCaseData(invocations=[])
+            eval_case_data = EvalTestCase(invocations=[])
             if eval_case.session_input:
                 self.agent_information_list.append(
                     {
@@ -247,7 +253,7 @@ def generate_eval_data(self, file_path: str):
                         )
 
                 eval_case_data.invocations.append(
-                    InvocationTestData(
+                    Invocation(
                         invocation_id=invocation.invocation_id,
                         input=_input,
                         actual_output="",
@@ -261,7 +267,7 @@ def generate_eval_data(self, file_path: str):
             eval_case_data_list.append(eval_case_data)
         self.invocation_list = eval_case_data_list
 
-    async def _run_agent_for_actual_data(self):
+    async def generate_actual_outputs(self):
         for eval_case_data, agent_information in zip(
             self.invocation_list, self.agent_information_list
         ):
@@ -333,7 +339,7 @@ async def _run_agent_for_actual_data(self):
                 invocation.actual_tool = _actual_tool
                 invocation.latency = _latency
 
-    def get_data(self) -> list[list[dict[str, Any]]]:
+    def get_eval_set_information(self) -> list[list[dict[str, Any]]]:
         """Merge the evaluation data and return it in the format of list[list[dict]]"""
         result = []
         for i, eval_case in enumerate(self.invocation_list):
@@ -360,7 +366,7 @@ def get_data(self) -> list[list[dict[str, Any]]]:
         return result
 
     @abstractmethod
-    async def eval(
+    async def evaluate(
         self,
         eval_set_file_path: str,
         metrics: list[Any],
diff --git a/veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py b/veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py
index ed656aa0..3f60bc82 100644
--- a/veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py
+++ b/veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py
@@ -27,8 +27,11 @@
 from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata
 from veadk.utils.logger import get_logger
 
-from ..base_evaluator import BaseEvaluator, EvalResultData, MetricResult
-from ..utils.prometheus import PrometheusPushgatewayConfig, push_to_prometheus
+from veadk.evaluation.base_evaluator import BaseEvaluator, EvalResultData, MetricResult
+from veadk.evaluation.utils.prometheus import (
+    PrometheusPushgatewayConfig,
+    push_to_prometheus,
+)
 
 logger = get_logger(__name__)
 
@@ -66,7 +69,7 @@ def __init__(
         self.prometheus_config = prometheus_config
 
     @override
-    async def eval(
+    async def evaluate(
         self,
         eval_set_file_path: str,
         metrics: list[BaseMetric],
@@ -74,11 +77,11 @@ async def eval(
     ):
         """Target to Google ADK, we will use the same evaluation case format as Google ADK."""
         # Get evaluation data by parsing eval set file
-        self.generate_eval_data(eval_set_file_path)
+        self.build_eval_set(eval_set_file_path)
 
         # Get actual data by running agent
         logger.info("Start to run agent for actual data.")
-        await self._run_agent_for_actual_data()
+        await self.generate_actual_outputs()
         eval_case_data_list = self.invocation_list
 
         # Build test cases in Deepeval format
diff --git a/veadk_tutorial.ipynb b/veadk_tutorial.ipynb
index 0289432f..d36053d6 100644
--- a/veadk_tutorial.ipynb
+++ b/veadk_tutorial.ipynb
@@ -1788,7 +1788,7 @@
     "    ToolCorrectnessMetric(threshold=0.5),\n",
     "]\n",
     "\n",
-    "await evaluator.eval(eval_set_file_path=eval_set_path, metrics=metrics)"
+    "await evaluator.evaluate(eval_set_file_path=eval_set_path, metrics=metrics)"
    ]
   },
   {