Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Bugs Fixed
- Fixed inconsistency where sample data in evaluation result items did not match the generated sample data from corresponding input rows, ensuring proper synchronization between row-level input samples and their associated evaluation output items.
- Fixed `IndirectAttackEvaluator` results not displaying correctly in AI Foundry portal by restoring the `xpia_` output key prefix (e.g. `xpia_label`, `xpia_reason`, `xpia_manipulated_content`, `xpia_intrusion`, `xpia_information_gathering`) for backward compatibility with AI Foundry's expected column names.

## 1.16.0 (2026-03-10)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,9 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
EvaluationMetrics.LOGOS_AND_BRANDS,
_InternalEvaluationMetrics.ECI,
EvaluationMetrics.XPIA,
# "xpia" is the output key prefix used by IndirectAttackEvaluator (EvaluationMetrics.XPIA
# has value "indirect_attack" for service communication, but outputs use "xpia" prefix)
"xpia",
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
]
Expand Down Expand Up @@ -318,6 +321,9 @@ def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:

# Combine all known metrics
all_known_metrics = evaluation_metrics_values + internal_metrics_values
# IndirectAttackEvaluator (EvaluationMetrics.XPIA) outputs token count columns with
# "xpia" prefix for backward compatibility, so include "xpia" explicitly.
all_known_metrics.append("xpia")

# Find token count columns that belong to known metrics
token_count_cols = [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import math
from typing import Any, Dict, List, TypeVar, Union, Optional

from typing_extensions import override
Expand Down Expand Up @@ -203,6 +204,10 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
)
input_data["context"] = context

# For XPIA, pass "xpia" as the metric display name to ensure output keys
# use the "xpia" prefix (e.g. "xpia_label") for backward compatibility.
metric_display_name = "xpia" if self._eval_metric == EvaluationMetrics.XPIA else None

eval_result = await evaluate_with_rai_service_sync( # type: ignore
metric_name=self._eval_metric,
data=input_data,
Expand All @@ -211,6 +216,7 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
annotation_task=self._get_task(),
evaluator_name=self.__class__.__name__,
use_legacy_endpoint=self._use_legacy_endpoint,
metric_display_name=metric_display_name,
)

# Parse the EvalRunOutputItem format to the expected dict format
Expand Down Expand Up @@ -271,11 +277,37 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:
# Convert string to boolean
label = label_str.lower() == "true" if isinstance(label_str, str) else bool(label_str)

# Use "xpia" as the output key prefix for XPIA to maintain backward
# compatibility with column naming expected by AI Foundry and downstream logic.
output_prefix = (
"xpia" if self._eval_metric == EvaluationMetrics.XPIA else self._eval_metric.value
)

parsed_result = {
f"{self._eval_metric.value}_label": label,
f"{self._eval_metric.value}_reason": reason,
f"{output_prefix}_label": label,
f"{output_prefix}_reason": reason,
}

# For XPIA, extract sub-metrics as flat keys (manipulated_content, intrusion,
# information_gathering) so they appear as individual columns in the results.
if self._eval_metric == EvaluationMetrics.XPIA:
for sub_metric in ["manipulated_content", "intrusion", "information_gathering"]:
# Service may return snake_case or camelCase keys; try both
camel_map = {
"manipulated_content": "manipulatedContent",
"information_gathering": "informationGathering",
}
sub_value = score_properties.get(
sub_metric, score_properties.get(camel_map.get(sub_metric, sub_metric), math.nan)
)
if isinstance(sub_value, str):
sub_value = (
sub_value.lower() == "true"
if sub_value.lower() in ["true", "false"]
else math.nan
)
parsed_result[f"{output_prefix}_{sub_metric}"] = sub_value

# For protected_material, also extract breakdown if available
if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
for component in ["fictional_characters", "logos_and_brands", "artwork"]:
Expand All @@ -295,7 +327,7 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:

# Extract details from scoreProperties
if score_properties:
parsed_result[f"{self._eval_metric. value}_details"] = _prepare_details(score_properties)
parsed_result[f"{output_prefix}_details"] = _prepare_details(score_properties)

# Extract token counts from metrics
metrics = properties.get("metrics", {})
Expand All @@ -313,15 +345,15 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:
total_tokens = ""

# Add token metadata (matching old format)
parsed_result[f"{self._eval_metric. value}_total_tokens"] = total_tokens
parsed_result[f"{self._eval_metric.value}_prompt_tokens"] = prompt_tokens
parsed_result[f"{self._eval_metric.value}_completion_tokens"] = completion_tokens
parsed_result[f"{output_prefix}_total_tokens"] = total_tokens
parsed_result[f"{output_prefix}_prompt_tokens"] = prompt_tokens
parsed_result[f"{output_prefix}_completion_tokens"] = completion_tokens

# Add empty placeholders for fields that sync_evals doesn't provide
parsed_result[f"{self._eval_metric.value}_finish_reason"] = ""
parsed_result[f"{self._eval_metric.value}_sample_input"] = ""
parsed_result[f"{self._eval_metric.value}_sample_output"] = ""
parsed_result[f"{self._eval_metric.value}_model"] = ""
parsed_result[f"{output_prefix}_finish_reason"] = ""
parsed_result[f"{output_prefix}_sample_input"] = ""
parsed_result[f"{output_prefix}_sample_output"] = ""
parsed_result[f"{output_prefix}_model"] = ""

return parsed_result

Expand Down Expand Up @@ -366,21 +398,29 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:
# check if it's already in the correct format (might be legacy response)
if isinstance(eval_result, dict):
# Check if it already has the expected keys
expected_key = (
f"{self._eval_metric.value}_label"
if self._eval_metric
in [
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.PROTECTED_MATERIAL,
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
EvaluationMetrics.XPIA,
_InternalEvaluationMetrics.ECI,
]
else self._eval_metric.value
)

if expected_key in eval_result:
return eval_result
# For XPIA, use "xpia" prefix for backward compatibility
if self._eval_metric == EvaluationMetrics.XPIA:
if "xpia_label" in eval_result:
return eval_result
# Handle legacy responses that use "indirect_attack_" prefix instead of "xpia_"
if "indirect_attack_label" in eval_result:
return {
key.replace("indirect_attack_", "xpia_"): value
for key, value in eval_result.items()
}
elif self._eval_metric in [
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.PROTECTED_MATERIAL,
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
_InternalEvaluationMetrics.ECI,
]:
expected_key = f"{self._eval_metric.value}_label"
if expected_key in eval_result:
return eval_result
else:
expected_key = self._eval_metric.value
if expected_key in eval_result:
return eval_result

# Return empty dict if we can't parse
return {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -634,3 +634,116 @@ def test_parse_eval_result_with_builtin_prefix(self):
# Token counts should be extracted from properties.metrics
assert result["violence_prompt_tokens"] == "15"
assert result["violence_completion_tokens"] == "55"

def test_parse_eval_result_xpia_uses_xpia_prefix(self):
    """Test that IndirectAttackEvaluator output uses 'xpia' prefix, not 'indirect_attack'.

    The sync_evals service responds with metric name "indirect_attack"
    (the value of EvaluationMetrics.XPIA), but the parsed output keys must
    use the legacy "xpia" prefix for backward compatibility with AI Foundry.
    """
    from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase
    from azure.ai.evaluation._common.constants import EvaluationMetrics

    # Simulate a sync_evals response for XPIA (indirect_attack)
    eval_result = {
        "results": [
            {
                "name": "indirect_attack",
                "type": "azure_ai_evaluator",
                "metric": "indirect_attack",
                "score": 0.0,
                "reason": "No indirect attack detected.",
                "properties": {
                    "scoreProperties": {
                        "label": "false",
                        "manipulated_content": "false",
                        "intrusion": "false",
                        "information_gathering": "false",
                    },
                    "metrics": {"promptTokens": "10", "completionTokens": "20"},
                },
            }
        ]
    }

    # __new__ bypasses __init__ (which requires service credentials);
    # only _eval_metric is read by _parse_eval_result.
    evaluator = RaiServiceEvaluatorBase.__new__(RaiServiceEvaluatorBase)
    evaluator._eval_metric = EvaluationMetrics.XPIA

    result = evaluator._parse_eval_result(eval_result)

    # Output keys MUST use "xpia" prefix for backward compatibility
    assert "xpia_label" in result, f"Expected 'xpia_label' in result, got keys: {list(result.keys())}"
    assert "xpia_reason" in result
    assert "xpia_manipulated_content" in result
    assert "xpia_intrusion" in result
    assert "xpia_information_gathering" in result
    # Must NOT have "indirect_attack_" prefixed keys
    assert "indirect_attack_label" not in result
    assert "indirect_attack_reason" not in result
    # Verify values; identity checks (`is False`) ensure real booleans,
    # not merely falsy values (fixes PEP 8 E712 `== False` comparisons).
    assert result["xpia_label"] is False
    assert result["xpia_reason"] == "No indirect attack detected."
    assert result["xpia_manipulated_content"] is False
    assert result["xpia_intrusion"] is False
    assert result["xpia_information_gathering"] is False
    assert result["xpia_prompt_tokens"] == "10"
    assert result["xpia_completion_tokens"] == "20"

def test_parse_eval_result_xpia_legacy_indirect_attack_keys_renamed(self):
    """Test that legacy responses with 'indirect_attack_' keys are renamed to 'xpia_'.

    Legacy (non-sync_evals) endpoints may return already-flattened dicts
    keyed with the "indirect_attack_" prefix; _parse_eval_result must
    rewrite each key to the "xpia_" prefix while preserving the values.
    """
    from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase
    from azure.ai.evaluation._common.constants import EvaluationMetrics

    # Simulate a legacy response that uses "indirect_attack_" prefix
    legacy_result = {
        "indirect_attack_label": True,
        "indirect_attack_reason": "Attack detected",
        "indirect_attack_manipulated_content": True,
        "indirect_attack_intrusion": False,
        "indirect_attack_information_gathering": False,
        "indirect_attack_total_tokens": "100",
    }

    # __new__ bypasses __init__; only _eval_metric is read by the parser.
    evaluator = RaiServiceEvaluatorBase.__new__(RaiServiceEvaluatorBase)
    evaluator._eval_metric = EvaluationMetrics.XPIA

    result = evaluator._parse_eval_result(legacy_result)

    # Legacy "indirect_attack_" keys should be renamed to "xpia_"
    assert "xpia_label" in result
    assert "xpia_reason" in result
    assert "xpia_manipulated_content" in result
    assert "indirect_attack_label" not in result
    # Identity check (`is True`) ensures a real boolean survives the rename
    # (fixes PEP 8 E712 `== True` comparison).
    assert result["xpia_label"] is True
    assert result["xpia_reason"] == "Attack detected"

def test_parse_eval_result_xpia_with_camelcase_sub_metrics(self):
    """Test that camelCase sub-metric keys from service are handled correctly.

    The service may return "manipulatedContent" / "informationGathering"
    instead of the snake_case variants; the parser must map them to the
    snake_case "xpia_*" output columns.
    """
    from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase
    from azure.ai.evaluation._common.constants import EvaluationMetrics

    # Service may return camelCase keys like "manipulatedContent"
    eval_result = {
        "results": [
            {
                "metric": "indirect_attack",
                "score": 0.0,
                "reason": "Attack detected.",
                "properties": {
                    "scoreProperties": {
                        "label": "true",
                        "manipulatedContent": "true",
                        "intrusion": "false",
                        "informationGathering": "true",
                    },
                },
            }
        ]
    }

    # __new__ bypasses __init__; only _eval_metric is read by the parser.
    evaluator = RaiServiceEvaluatorBase.__new__(RaiServiceEvaluatorBase)
    evaluator._eval_metric = EvaluationMetrics.XPIA

    result = evaluator._parse_eval_result(eval_result)

    # Identity checks ensure real booleans (fixes PEP 8 E712 comparisons);
    # the unused `import math` from the original has been removed.
    assert result["xpia_label"] is True
    assert result["xpia_manipulated_content"] is True
    assert result["xpia_intrusion"] is False
    assert result["xpia_information_gathering"] is True
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,25 @@ def test_aggregate_label_defect_metrics_with_nan_in_details(self):
assert defect_rates["evaluator.protected_material_details.detail1_defect_rate"] == 0.5
assert defect_rates["evaluator.protected_material_details.detail2_defect_rate"] == 0.5

def test_quotation_fix_test_data(self, quotation_fix_test_data):
def test_aggregate_label_defect_metrics_xpia_uses_xpia_prefix(self):
    """Verify xpia_label columns (IndirectAttackEvaluator output) aggregate correctly.

    The aggregator should recognize "indirect_attack.xpia_label" as a
    label column and emit an "indirect_attack.xpia_defect_rate" entry.
    """
    import pandas as pd

    # IndirectAttackEvaluator emits xpia_label, not indirect_attack_label;
    # two of four rows are defects, so the expected rate is 0.5.
    frame = pd.DataFrame({"indirect_attack.xpia_label": [True, False, True, False]})

    label_cols, defect_rates = _aggregate_label_defect_metrics(frame)

    # The label column should be recognized
    assert "indirect_attack.xpia_label" in label_cols
    # The defect rate should use the xpia prefix
    assert "indirect_attack.xpia_defect_rate" in defect_rates
    assert defect_rates["indirect_attack.xpia_defect_rate"] == 0.5


from test_evaluators.test_inputs_evaluators import QuotationFixEval

result = evaluate(
Expand Down
Loading