diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 813d89253e3b..4eab1a258e3f 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Bugs Fixed
 - Fixed inconsistency where sample data in evaluation result items did not match the generated sample data from corresponding input rows, ensuring proper synchronization between row-level input samples and their associated evaluation output items.
+- Fixed `IndirectAttackEvaluator` results not displaying correctly in AI Foundry portal by restoring the `xpia_` output key prefix (e.g. `xpia_label`, `xpia_reason`, `xpia_manipulated_content`, `xpia_intrusion`, `xpia_information_gathering`) for backward compatibility with AI Foundry's expected column names.
 
 ## 1.16.0 (2026-03-10)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index d46f3dd216fb..5d8693e7b14d 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -192,6 +192,9 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         EvaluationMetrics.LOGOS_AND_BRANDS,
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
+        # "xpia" is the output key prefix used by IndirectAttackEvaluator (EvaluationMetrics.XPIA
+        # has value "indirect_attack" for service communication, but outputs use "xpia" prefix)
+        "xpia",
         EvaluationMetrics.CODE_VULNERABILITY,
         EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
     ]
@@ -318,6 +321,9 @@ def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:
 
     # Combine all known metrics
     all_known_metrics = evaluation_metrics_values + internal_metrics_values
+    # IndirectAttackEvaluator (EvaluationMetrics.XPIA) outputs token count columns with
+    # "xpia" prefix for backward compatibility, so include "xpia" explicitly.
+    all_known_metrics.append("xpia")
 
     # Find token count columns that belong to known metrics
     token_count_cols = [
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
index 446ff4ad1d70..b8c224b8bac0 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -1,6 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import math
 from typing import Any, Dict, List, TypeVar, Union, Optional
 from typing_extensions import override
 
@@ -203,6 +204,10 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
             )
             input_data["context"] = context
 
+        # For XPIA, pass "xpia" as the metric display name to ensure output keys
+        # use the "xpia" prefix (e.g. "xpia_label") for backward compatibility.
+ metric_display_name = "xpia" if self._eval_metric == EvaluationMetrics.XPIA else None + eval_result = await evaluate_with_rai_service_sync( # type: ignore metric_name=self._eval_metric, data=input_data, @@ -211,6 +216,7 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]: annotation_task=self._get_task(), evaluator_name=self.__class__.__name__, use_legacy_endpoint=self._use_legacy_endpoint, + metric_display_name=metric_display_name, ) # Parse the EvalRunOutputItem format to the expected dict format @@ -271,11 +277,37 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]: # Convert string to boolean label = label_str.lower() == "true" if isinstance(label_str, str) else bool(label_str) + # Use "xpia" as the output key prefix for XPIA to maintain backward + # compatibility with column naming expected by AI Foundry and downstream logic. + output_prefix = ( + "xpia" if self._eval_metric == EvaluationMetrics.XPIA else self._eval_metric.value + ) + parsed_result = { - f"{self._eval_metric.value}_label": label, - f"{self._eval_metric.value}_reason": reason, + f"{output_prefix}_label": label, + f"{output_prefix}_reason": reason, } + # For XPIA, extract sub-metrics as flat keys (manipulated_content, intrusion, + # information_gathering) so they appear as individual columns in the results. + if self._eval_metric == EvaluationMetrics.XPIA: + for sub_metric in ["manipulated_content", "intrusion", "information_gathering"]: + # Service may return snake_case or camelCase keys; try both + camel_map = { + "manipulated_content": "manipulatedContent", + "information_gathering": "informationGathering", + } + sub_value = score_properties.get( + sub_metric, score_properties.get(camel_map.get(sub_metric, sub_metric), math.nan) + ) + if isinstance(sub_value, str): + sub_value = ( + sub_value.lower() == "true" + if sub_value.lower() in ["true", "false"] + else math.nan + ) + parsed_result[f"{output_prefix}_{sub_metric}"] = sub_value + # For protected_material, also extract breakdown if available if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL: for component in ["fictional_characters", "logos_and_brands", "artwork"]: @@ -295,7 +327,7 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]: # Extract details from scoreProperties if score_properties: - parsed_result[f"{self._eval_metric. value}_details"] = _prepare_details(score_properties) + parsed_result[f"{output_prefix}_details"] = _prepare_details(score_properties) # Extract token counts from metrics metrics = properties.get("metrics", {}) @@ -313,15 +345,15 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]: total_tokens = "" # Add token metadata (matching old format) - parsed_result[f"{self._eval_metric. 
value}_total_tokens"] = total_tokens - parsed_result[f"{self._eval_metric.value}_prompt_tokens"] = prompt_tokens - parsed_result[f"{self._eval_metric.value}_completion_tokens"] = completion_tokens + parsed_result[f"{output_prefix}_total_tokens"] = total_tokens + parsed_result[f"{output_prefix}_prompt_tokens"] = prompt_tokens + parsed_result[f"{output_prefix}_completion_tokens"] = completion_tokens # Add empty placeholders for fields that sync_evals doesn't provide - parsed_result[f"{self._eval_metric.value}_finish_reason"] = "" - parsed_result[f"{self._eval_metric.value}_sample_input"] = "" - parsed_result[f"{self._eval_metric.value}_sample_output"] = "" - parsed_result[f"{self._eval_metric.value}_model"] = "" + parsed_result[f"{output_prefix}_finish_reason"] = "" + parsed_result[f"{output_prefix}_sample_input"] = "" + parsed_result[f"{output_prefix}_sample_output"] = "" + parsed_result[f"{output_prefix}_model"] = "" return parsed_result @@ -366,21 +398,29 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]: # check if it's already in the correct format (might be legacy response) if isinstance(eval_result, dict): # Check if it already has the expected keys - expected_key = ( - f"{self._eval_metric.value}_label" - if self._eval_metric - in [ - EvaluationMetrics.CODE_VULNERABILITY, - EvaluationMetrics.PROTECTED_MATERIAL, - EvaluationMetrics.UNGROUNDED_ATTRIBUTES, - EvaluationMetrics.XPIA, - _InternalEvaluationMetrics.ECI, - ] - else self._eval_metric.value - ) - - if expected_key in eval_result: - return eval_result + # For XPIA, use "xpia" prefix for backward compatibility + if self._eval_metric == EvaluationMetrics.XPIA: + if "xpia_label" in eval_result: + return eval_result + # Handle legacy responses that use "indirect_attack_" prefix instead of "xpia_" + if "indirect_attack_label" in eval_result: + return { + key.replace("indirect_attack_", "xpia_"): value + for key, value in eval_result.items() + } + elif self._eval_metric in [ + EvaluationMetrics.CODE_VULNERABILITY, + EvaluationMetrics.PROTECTED_MATERIAL, + EvaluationMetrics.UNGROUNDED_ATTRIBUTES, + _InternalEvaluationMetrics.ECI, + ]: + expected_key = f"{self._eval_metric.value}_label" + if expected_key in eval_result: + return eval_result + else: + expected_key = self._eval_metric.value + if expected_key in eval_result: + return eval_result # Return empty dict if we can't parse return {} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py index 1bf810ef080b..dfc94f758a99 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py @@ -634,3 +634,116 @@ def test_parse_eval_result_with_builtin_prefix(self): # Token counts should be extracted from properties.metrics assert result["violence_prompt_tokens"] == "15" assert result["violence_completion_tokens"] == "55" + + def test_parse_eval_result_xpia_uses_xpia_prefix(self): + """Test that IndirectAttackEvaluator output uses 'xpia' prefix, not 'indirect_attack'.""" + from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase + from azure.ai.evaluation._common.constants import EvaluationMetrics + + # Simulate a sync_evals response for XPIA (indirect_attack) + eval_result = { + "results": [ + { + "name": "indirect_attack", + "type": "azure_ai_evaluator", + "metric": "indirect_attack", + 
"score": 0.0, + "reason": "No indirect attack detected.", + "properties": { + "scoreProperties": { + "label": "false", + "manipulated_content": "false", + "intrusion": "false", + "information_gathering": "false", + }, + "metrics": {"promptTokens": "10", "completionTokens": "20"}, + }, + } + ] + } + + evaluator = RaiServiceEvaluatorBase.__new__(RaiServiceEvaluatorBase) + evaluator._eval_metric = EvaluationMetrics.XPIA + + result = evaluator._parse_eval_result(eval_result) + + # Output keys MUST use "xpia" prefix for backward compatibility + assert "xpia_label" in result, f"Expected 'xpia_label' in result, got keys: {list(result.keys())}" + assert "xpia_reason" in result + assert "xpia_manipulated_content" in result + assert "xpia_intrusion" in result + assert "xpia_information_gathering" in result + # Must NOT have "indirect_attack_" prefixed keys + assert "indirect_attack_label" not in result + assert "indirect_attack_reason" not in result + # Verify values + assert result["xpia_label"] == False + assert result["xpia_reason"] == "No indirect attack detected." + assert result["xpia_manipulated_content"] == False + assert result["xpia_intrusion"] == False + assert result["xpia_information_gathering"] == False + assert result["xpia_prompt_tokens"] == "10" + assert result["xpia_completion_tokens"] == "20" + + def test_parse_eval_result_xpia_legacy_indirect_attack_keys_renamed(self): + """Test that legacy responses with 'indirect_attack_' keys are renamed to 'xpia_'.""" + from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase + from azure.ai.evaluation._common.constants import EvaluationMetrics + + # Simulate a legacy response that uses "indirect_attack_" prefix + legacy_result = { + "indirect_attack_label": True, + "indirect_attack_reason": "Attack detected", + "indirect_attack_manipulated_content": True, + "indirect_attack_intrusion": False, + "indirect_attack_information_gathering": False, + "indirect_attack_total_tokens": "100", + } + + evaluator = RaiServiceEvaluatorBase.__new__(RaiServiceEvaluatorBase) + evaluator._eval_metric = EvaluationMetrics.XPIA + + result = evaluator._parse_eval_result(legacy_result) + + # Legacy "indirect_attack_" keys should be renamed to "xpia_" + assert "xpia_label" in result + assert "xpia_reason" in result + assert "xpia_manipulated_content" in result + assert "indirect_attack_label" not in result + assert result["xpia_label"] == True + assert result["xpia_reason"] == "Attack detected" + + def test_parse_eval_result_xpia_with_camelcase_sub_metrics(self): + """Test that camelCase sub-metric keys from service are handled correctly.""" + from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase + from azure.ai.evaluation._common.constants import EvaluationMetrics + import math + + # Service may return camelCase keys like "manipulatedContent" + eval_result = { + "results": [ + { + "metric": "indirect_attack", + "score": 0.0, + "reason": "Attack detected.", + "properties": { + "scoreProperties": { + "label": "true", + "manipulatedContent": "true", + "intrusion": "false", + "informationGathering": "true", + }, + }, + } + ] + } + + evaluator = RaiServiceEvaluatorBase.__new__(RaiServiceEvaluatorBase) + evaluator._eval_metric = EvaluationMetrics.XPIA + + result = evaluator._parse_eval_result(eval_result) + + assert result["xpia_label"] == True + assert result["xpia_manipulated_content"] == True + assert result["xpia_intrusion"] == False + assert result["xpia_information_gathering"] == True 
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 47ef67eb4baa..15a92c6e87f8 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -758,7 +758,25 @@ def test_aggregate_label_defect_metrics_with_nan_in_details(self):
         assert defect_rates["evaluator.protected_material_details.detail1_defect_rate"] == 0.5
         assert defect_rates["evaluator.protected_material_details.detail2_defect_rate"] == 0.5
 
-    def test_quotation_fix_test_data(self, quotation_fix_test_data):
+    def test_aggregate_label_defect_metrics_xpia_uses_xpia_prefix(self):
+        """Test that xpia_label columns (IndirectAttackEvaluator output) are properly aggregated."""
+        import pandas as pd
+
+        data = {
+            # IndirectAttackEvaluator outputs xpia_label (not indirect_attack_label)
+            "indirect_attack.xpia_label": [True, False, True, False],
+        }
+        df = pd.DataFrame(data)
+
+        label_cols, defect_rates = _aggregate_label_defect_metrics(df)
+
+        # The label column should be recognized
+        assert "indirect_attack.xpia_label" in label_cols
+        # The defect rate should use xpia prefix
+        assert "indirect_attack.xpia_defect_rate" in defect_rates
+        assert defect_rates["indirect_attack.xpia_defect_rate"] == 0.5
+
+    def test_quotation_fix_test_data(self, quotation_fix_test_data):
         from test_evaluators.test_inputs_evaluators import QuotationFixEval
 
         result = evaluate(