Azure · slister1001 · Mar 16, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_4da51ba6ab"
+  "Tag": "python/evaluation/azure-ai-evaluation_802ffe01e9"
 }
@@ -43,6 +43,47 @@
 
 LOGGER = logging.getLogger(__name__)
 
+# Metric names that differ between the sync evals endpoint and the legacy annotation endpoint.
+# Key = sync endpoint metric name, Value = legacy annotation API metric name.
+# Read directly for sync→legacy lookups; _LEGACY_TO_SYNC_METRIC_NAMES (below) is its inverse for legacy→sync.
+# Note: only metrics where the API request metric name differs should be here.
+# For XPIA and ECI, the legacy API uses the annotation_task, not MetricList,
+# so the metric name doesn't need remapping — but the response key does.
+_SYNC_TO_LEGACY_METRIC_NAMES: Dict[str, str] = {
+    "hate_unfairness": "hate_fairness",
+    "groundedness": "generic_groundedness",
+}
+
+# Legacy response keys: the annotation API may return results under a different key than the sync
+# metric name, so response parsers fall back to these. Superset of _SYNC_TO_LEGACY_METRIC_NAMES.
+_SYNC_TO_LEGACY_RESPONSE_KEYS: Dict[str, str] = {
+    **_SYNC_TO_LEGACY_METRIC_NAMES,
+    "indirect_attack": "xpia",
+    "election_critical_information": "eci",
+}
+
+# Reverse mapping: legacy metric name → sync metric name. Built once at import; assumes legacy names are unique.
+_LEGACY_TO_SYNC_METRIC_NAMES: Dict[str, str] = {v: k for k, v in _SYNC_TO_LEGACY_METRIC_NAMES.items()}
+
+
+def _normalize_metric_for_endpoint(metric_name, use_legacy_endpoint, metric_display_name=None):
+    """Translate a metric name into the spelling used by the endpoint that will receive the request.
+
+    Returns a (metric_name, metric_display_name) tuple. On a legacy remap the display name defaults to
+    the original sync name so output keys are preserved; inputs with no mapping are returned unchanged.
+    """
+    metric_name_str = metric_name.value if hasattr(metric_name, "value") else metric_name  # enum or str
+    if use_legacy_endpoint:
+        legacy_name = _SYNC_TO_LEGACY_METRIC_NAMES.get(metric_name_str)
+        if legacy_name:
+            return legacy_name, (metric_display_name or metric_name_str)
+    else:
+        sync_name = _LEGACY_TO_SYNC_METRIC_NAMES.get(metric_name_str)
+        if sync_name:
+            return sync_name, metric_display_name  # display name passes through as given on the sync path
+    return metric_name, metric_display_name  # no remap: the caller's original object (enum or str) is kept
+
+
 USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
     "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
 }
@@ -453,9 +494,19 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
                 )
                 result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
             return result
+        # Check for metric_name in response; also check legacy response key name if different.
+        # Note: parse_response is only called from legacy endpoint functions (evaluate_with_rai_service
+        # and evaluate_with_rai_service_multimodal), so this fallback is inherently legacy-only.
+        response_key = metric_name
         if metric_name not in batch_response[0]:
-            return {}
-        response = batch_response[0][metric_name]
+            legacy_key = _SYNC_TO_LEGACY_RESPONSE_KEYS.get(
+                metric_name.value if hasattr(metric_name, "value") else metric_name
+            )
+            if legacy_key and legacy_key in batch_response[0]:
+                response_key = legacy_key
+            else:
+                return {}
+        response = batch_response[0][response_key]
         response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
@@ -547,13 +598,23 @@ def _parse_content_harm_response(
     }
 
     response = batch_response[0]
+    # Check for metric_name in response; also check legacy response key name if different.
+    # Note: _parse_content_harm_response is only called from parse_response, which is
+    # only called from legacy endpoint functions, so this fallback is inherently legacy-only.
+    response_key = metric_name
     if metric_name not in response:
-        return result
+        legacy_key = _SYNC_TO_LEGACY_RESPONSE_KEYS.get(
+            metric_name.value if hasattr(metric_name, "value") else metric_name
+        )
+        if legacy_key and legacy_key in response:
+            response_key = legacy_key
+        else:
+            return result
 
     try:
-        harm_response = literal_eval(response[metric_name])
+        harm_response = literal_eval(response[response_key])
     except Exception:  # pylint: disable=broad-exception-caught
-        harm_response = response[metric_name]
+        harm_response = response[response_key]
 
     total_tokens = 0
     prompt_tokens = 0
@@ -1044,6 +1105,10 @@ async def evaluate_with_rai_service_sync(
     :return: The EvalRunOutputItem containing the evaluation results (or parsed dict if legacy).
     :rtype: Union[EvalRunOutputItem, Dict[str, Union[str, float]]]
     """
+    metric_name, metric_display_name = _normalize_metric_for_endpoint(
+        metric_name, use_legacy_endpoint, metric_display_name
+    )
+
     # Route to legacy endpoint if requested
     if use_legacy_endpoint:
         return await evaluate_with_rai_service(
@@ -1261,13 +1326,16 @@ async def evaluate_with_rai_service_sync_multimodal(
     :return: The EvalRunOutputItem or legacy response payload.
     :rtype: Union[Dict, EvalRunOutputItem]
     """
+    metric_name, metric_display_name = _normalize_metric_for_endpoint(metric_name, use_legacy_endpoint)
+
     # Route to legacy endpoint if requested
     if use_legacy_endpoint:
         return await evaluate_with_rai_service_multimodal(
             messages=messages,
             metric_name=metric_name,
             project_scope=project_scope,
             credential=credential,
+            metric_display_name=metric_display_name,
         )
 
     # Sync evals endpoint implementation (default)
@@ -1316,6 +1384,7 @@ async def evaluate_with_rai_service_multimodal(
     metric_name: str,
     project_scope: Union[str, AzureAIProject],
     credential: TokenCredential,
+    metric_display_name: Optional[str] = None,
 ):
     """Evaluate the content safety of the response using Responsible AI service (legacy endpoint)
     :param messages: The normalized list of messages.
@@ -1327,6 +1396,8 @@ async def evaluate_with_rai_service_multimodal(
     :type project_scope: Union[str, AzureAIProject]
     :param credential: The Azure authentication credential.
     :type credential: ~azure.core.credentials.TokenCredential
+    :param metric_display_name: The display name for the metric in output keys. If None, uses metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: List[List[Dict]]
     """
@@ -1341,7 +1412,7 @@ async def evaluate_with_rai_service_multimodal(
         await ensure_service_availability_onedp(client, token, Tasks.CONTENT_HARM)
         operation_id = await submit_multimodal_request_onedp(client, messages, metric_name, token)
         annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
-        result = parse_response(annotation_response, metric_name)
+        result = parse_response(annotation_response, metric_name, metric_display_name)
         return result
     else:
         token = await fetch_or_reuse_token(credential)
@@ -1350,5 +1421,5 @@ async def evaluate_with_rai_service_multimodal(
         # Submit annotation request and fetch result
         operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
         annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-        result = parse_response(annotation_response, metric_name)
+        result = parse_response(annotation_response, metric_name, metric_display_name)
         return result
@@ -125,17 +125,31 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
     async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
         """Evaluates content according to this evaluator's metric.
         Evaluates each turn separately to maintain per-turn granularity.
+        When using the legacy endpoint, sends the entire conversation in a single call
+        (matching pre-sync-migration behavior) via the sync wrapper for metric normalization.
         """
         validate_conversation(conversation)
         messages = conversation["messages"]
 
         # Convert enum to string value
         metric_value = self._eval_metric.value if hasattr(self._eval_metric, "value") else self._eval_metric
 
-        # Extract conversation turns (user-assistant pairs)
+        if self._use_legacy_endpoint:
+            # Legacy path: send entire conversation in a single call (pre-sync-migration behavior)
+            # Route through evaluate_with_rai_service_sync_multimodal for metric normalization.
+            result = await evaluate_with_rai_service_sync_multimodal(
+                messages=messages,
+                metric_name=metric_value,
+                project_scope=self._azure_ai_project,
+                credential=self._credential,
+                use_legacy_endpoint=True,
+            )
+            # Wrap as single-turn result and aggregate to produce evaluation_per_turn structure
+            return self._aggregate_results([result])
+
+        # Sync path: evaluate each turn separately for per-turn granularity
         turns = self._extract_turns(messages)
 
-        # Evaluate each turn separately
         per_turn_results = []
         for turn in turns:
             turn_result = await evaluate_with_rai_service_sync_multimodal(
@@ -213,6 +227,10 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
             use_legacy_endpoint=self._use_legacy_endpoint,
         )
 
+        # Legacy endpoint returns a pre-parsed dict from parse_response(); return directly
+        if self._use_legacy_endpoint:
+            return eval_result
+
         # Parse the EvalRunOutputItem format to the expected dict format
         return self._parse_eval_result(eval_result)
 

@@ -28,7 +28,6 @@
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_sync
 from azure.ai.evaluation._common.utils import (
     get_default_threshold_for_evaluator,
-    is_onedp_project,
 )
 from azure.ai.evaluation._evaluate._utils import _write_output
 
@@ -154,15 +153,9 @@ async def evaluate_conversation(
             try:
                 self.logger.debug(f"Evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}")
 
-                use_sync_endpoint = is_onedp_project(self.azure_ai_project)
-                effective_metric_name = (
-                    "hate_unfairness" if use_sync_endpoint and metric_name == "hate_fairness" else metric_name
-                )
-
                 @retry(**self.retry_config["network_retry"])
                 async def evaluate_with_rai_service_with_retry():
                     try:
-                        # Always use sync_evals endpoint for all projects
                         return await evaluate_with_rai_service_sync(
                             data=query_response,
                             metric_name=metric_name,
@@ -171,6 +164,7 @@ async def evaluate_with_rai_service_with_retry():
                             annotation_task=annotation_task,
                             scan_session_id=self.scan_session_id,
                             use_legacy_endpoint=self._use_legacy_endpoint,
+                            evaluator_name=f"RedTeam.{risk_category.value}",
                         )
                     except (
                         httpx.ConnectTimeout,
@@ -213,7 +207,7 @@ async def evaluate_with_rai_service_with_retry():
 
                     # Find the result matching our metric/risk category
                     eval_result = None
-                    lookup_names = {metric_name, risk_cat_value, effective_metric_name}
+                    lookup_names = {metric_name, risk_cat_value}
                     for result_item in results:
                         result_dict = result_item if isinstance(result_item, dict) else result_item.__dict__
                         result_name = str(result_dict.get("name") or "")

@@ -42,6 +42,7 @@ def __init__(
         risk_category: RiskCategory,
         logger: Optional[logging.Logger] = None,
         dataset_config: Optional[Any] = None,
+        use_legacy_endpoint: bool = False,
     ) -> None:
         """Initialize the RAI Service scorer.
 
@@ -56,6 +57,8 @@ def __init__(
         :type logger: Optional[logging.Logger]
         :param dataset_config: Optional DatasetConfiguration for context lookup
         :type dataset_config: Optional[Any]
+        :param use_legacy_endpoint: Whether to use the legacy evaluation endpoint. Defaults to False.
+        :type use_legacy_endpoint: bool
         """
         # Initialize with a default validator for text scoring
         super().__init__(validator=ScorerPromptValidator(supported_data_types=["text"]))
@@ -64,6 +67,7 @@ def __init__(
         self.risk_category = risk_category
         self.logger = logger or logging.getLogger(__name__)
         self.dataset_config = dataset_config
+        self._use_legacy_endpoint = use_legacy_endpoint
         self._context_lookup: Dict[str, Dict[str, Any]] = {}
 
         # Build context lookup from dataset config if provided
@@ -160,18 +164,17 @@ async def _score_piece_async(
         metric_name = get_metric_from_risk_category(self.risk_category)
         annotation_task = get_annotation_task_from_risk_category(self.risk_category)
 
-        # The Sync API requires "hate_unfairness" instead of "hate_fairness"
-        if metric_name == "hate_fairness":
-            metric_name = "hate_unfairness"
+        # Metric name normalization (hate_fairness ↔ hate_unfairness) is handled by
+        # evaluate_with_rai_service_sync based on the use_legacy_endpoint flag.
 
         try:
-            # Call RAI service using sync_evals endpoint (matches legacy path behavior)
             eval_result = await evaluate_with_rai_service_sync(
                 eval_input,
                 metric_name=metric_name,
                 project_scope=self.azure_ai_project,
                 credential=self.credential,
                 annotation_task=annotation_task,
+                use_legacy_endpoint=self._use_legacy_endpoint,
             )
 
             self.logger.debug(f"RAI evaluation result: {eval_result}")

@@ -7,10 +7,11 @@
 from azure.ai.evaluation._constants import EvaluationMetrics, _InternalEvaluationMetrics
 from azure.ai.evaluation._common.constants import Tasks, _InternalAnnotationTasks
 
-# Mapping of risk categories to their corresponding evaluation metrics
-# Note: For HateUnfairness, the mapping defaults to HATE_FAIRNESS, but the Sync API
-# (used for all projects) requires HATE_UNFAIRNESS instead.
-# This is handled dynamically in _evaluation_processor.py.
+# Mapping of risk categories to their corresponding evaluation metrics.
+# Note: HateUnfairness maps to HATE_FAIRNESS because that is the metric name the legacy
+# annotation endpoint recognizes. The bidirectional mapping between hate_fairness and
+# hate_unfairness is handled at the routing layer in evaluate_with_rai_service_sync(),
+# which normalizes the metric name based on the endpoint being used (legacy vs sync).
 RISK_CATEGORY_METRIC_MAP = {
     RiskCategory.Violence: EvaluationMetrics.VIOLENCE,
     RiskCategory.HateUnfairness: EvaluationMetrics.HATE_FAIRNESS,

@@ -326,8 +326,9 @@ def simple_conversation():
 @pytest.fixture
 def redirect_openai_requests():
     """Route requests from the openai package to the test proxy."""
+    proxy_url = PROXY_URL() if callable(PROXY_URL) else PROXY_URL
     config = TestProxyConfig(
-        recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL()
+        recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=proxy_url
     )
 
     with TestProxyHttpxClientBase.record_with_proxy(config):