Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Bugs Fixed
- Fixed inconsistency where sample data in evaluation result items did not match the generated sample data from corresponding input rows, ensuring proper synchronization between row-level input samples and their associated evaluation output items.
- Fixed `IndirectAttackEvaluator` results not displaying correctly in AI Foundry portal by restoring the `xpia_` output key prefix (e.g. `xpia_label`, `xpia_reason`, `xpia_manipulated_content`, `xpia_intrusion`, `xpia_information_gathering`) for backward compatibility with AI Foundry's expected column names.

## 1.16.0 (2026-03-10)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,9 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
EvaluationMetrics.LOGOS_AND_BRANDS,
_InternalEvaluationMetrics.ECI,
EvaluationMetrics.XPIA,
# "xpia" is the output key prefix used by IndirectAttackEvaluator (EvaluationMetrics.XPIA
# has value "indirect_attack" for service communication, but outputs use "xpia" prefix)
"xpia",
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
]
Expand Down Expand Up @@ -318,6 +321,9 @@ def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:

# Combine all known metrics
all_known_metrics = evaluation_metrics_values + internal_metrics_values
# IndirectAttackEvaluator (EvaluationMetrics.XPIA) outputs token count columns with
# "xpia" prefix for backward compatibility, so include "xpia" explicitly.
all_known_metrics.append("xpia")

# Find token count columns that belong to known metrics
token_count_cols = [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import math
from typing import Any, Dict, List, TypeVar, Union, Optional

from typing_extensions import override
Expand Down Expand Up @@ -203,6 +204,10 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
)
input_data["context"] = context

# For XPIA, pass "xpia" as the metric display name to ensure output keys
# use the "xpia" prefix (e.g. "xpia_label") for backward compatibility.
metric_display_name = "xpia" if self._eval_metric == EvaluationMetrics.XPIA else None

eval_result = await evaluate_with_rai_service_sync( # type: ignore
metric_name=self._eval_metric,
data=input_data,
Expand All @@ -211,6 +216,7 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
annotation_task=self._get_task(),
evaluator_name=self.__class__.__name__,
use_legacy_endpoint=self._use_legacy_endpoint,
metric_display_name=metric_display_name,
)

# Parse the EvalRunOutputItem format to the expected dict format
Expand Down Expand Up @@ -271,11 +277,37 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:
# Convert string to boolean
label = label_str.lower() == "true" if isinstance(label_str, str) else bool(label_str)

# Use "xpia" as the output key prefix for XPIA to maintain backward
# compatibility with column naming expected by AI Foundry and downstream logic.
output_prefix = (
"xpia" if self._eval_metric == EvaluationMetrics.XPIA else self._eval_metric.value
)

parsed_result = {
f"{self._eval_metric.value}_label": label,
f"{self._eval_metric.value}_reason": reason,
f"{output_prefix}_label": label,
f"{output_prefix}_reason": reason,
}

# For XPIA, extract sub-metrics as flat keys (manipulated_content, intrusion,
# information_gathering) so they appear as individual columns in the results.
if self._eval_metric == EvaluationMetrics.XPIA:
for sub_metric in ["manipulated_content", "intrusion", "information_gathering"]:
# Service may return snake_case or camelCase keys; try both
camel_map = {
"manipulated_content": "manipulatedContent",
"information_gathering": "informationGathering",
}
sub_value = score_properties.get(
sub_metric, score_properties.get(camel_map.get(sub_metric, sub_metric), math.nan)
)
if isinstance(sub_value, str):
sub_value = (
sub_value.lower() == "true"
if sub_value.lower() in ["true", "false"]
else math.nan
)
parsed_result[f"{output_prefix}_{sub_metric}"] = sub_value

# For protected_material, also extract breakdown if available
if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
for component in ["fictional_characters", "logos_and_brands", "artwork"]:
Expand All @@ -295,7 +327,7 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:

# Extract details from scoreProperties
if score_properties:
parsed_result[f"{self._eval_metric. value}_details"] = _prepare_details(score_properties)
parsed_result[f"{output_prefix}_details"] = _prepare_details(score_properties)

# Extract token counts from metrics
metrics = properties.get("metrics", {})
Expand All @@ -313,15 +345,15 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:
total_tokens = ""

# Add token metadata (matching old format)
parsed_result[f"{self._eval_metric. value}_total_tokens"] = total_tokens
parsed_result[f"{self._eval_metric.value}_prompt_tokens"] = prompt_tokens
parsed_result[f"{self._eval_metric.value}_completion_tokens"] = completion_tokens
parsed_result[f"{output_prefix}_total_tokens"] = total_tokens
parsed_result[f"{output_prefix}_prompt_tokens"] = prompt_tokens
parsed_result[f"{output_prefix}_completion_tokens"] = completion_tokens

# Add empty placeholders for fields that sync_evals doesn't provide
parsed_result[f"{self._eval_metric.value}_finish_reason"] = ""
parsed_result[f"{self._eval_metric.value}_sample_input"] = ""
parsed_result[f"{self._eval_metric.value}_sample_output"] = ""
parsed_result[f"{self._eval_metric.value}_model"] = ""
parsed_result[f"{output_prefix}_finish_reason"] = ""
parsed_result[f"{output_prefix}_sample_input"] = ""
parsed_result[f"{output_prefix}_sample_output"] = ""
parsed_result[f"{output_prefix}_model"] = ""

return parsed_result

Expand Down Expand Up @@ -366,21 +398,29 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:
# check if it's already in the correct format (might be legacy response)
if isinstance(eval_result, dict):
# Check if it already has the expected keys
expected_key = (
f"{self._eval_metric.value}_label"
if self._eval_metric
in [
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.PROTECTED_MATERIAL,
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
EvaluationMetrics.XPIA,
_InternalEvaluationMetrics.ECI,
]
else self._eval_metric.value
)

if expected_key in eval_result:
return eval_result
# For XPIA, use "xpia" prefix for backward compatibility
if self._eval_metric == EvaluationMetrics.XPIA:
if "xpia_label" in eval_result:
return eval_result
# Handle legacy responses that use "indirect_attack_" prefix instead of "xpia_"
if "indirect_attack_label" in eval_result:
return {
key.replace("indirect_attack_", "xpia_"): value
for key, value in eval_result.items()
}
elif self._eval_metric in [
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.PROTECTED_MATERIAL,
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
_InternalEvaluationMetrics.ECI,
]:
expected_key = f"{self._eval_metric.value}_label"
if expected_key in eval_result:
return eval_result
else:
expected_key = self._eval_metric.value
if expected_key in eval_result:
return eval_result

# Return empty dict if we can't parse
return {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -634,3 +634,116 @@ def test_parse_eval_result_with_builtin_prefix(self):
# Token counts should be extracted from properties.metrics
assert result["violence_prompt_tokens"] == "15"
assert result["violence_completion_tokens"] == "55"

def test_parse_eval_result_xpia_uses_xpia_prefix(self):
    """Test that IndirectAttackEvaluator output uses 'xpia' prefix, not 'indirect_attack'.

    The sync_evals service responds with metric name "indirect_attack"
    (the value of EvaluationMetrics.XPIA), but the parsed output keys must
    use the legacy "xpia" prefix for backward compatibility with AI Foundry.
    """
    from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase
    from azure.ai.evaluation._common.constants import EvaluationMetrics

    # Simulate a sync_evals response for XPIA (indirect_attack)
    eval_result = {
        "results": [
            {
                "name": "indirect_attack",
                "type": "azure_ai_evaluator",
                "metric": "indirect_attack",
                "score": 0.0,
                "reason": "No indirect attack detected.",
                "properties": {
                    "scoreProperties": {
                        "label": "false",
                        "manipulated_content": "false",
                        "intrusion": "false",
                        "information_gathering": "false",
                    },
                    "metrics": {"promptTokens": "10", "completionTokens": "20"},
                },
            }
        ]
    }

    # __new__ bypasses __init__ (which requires service credentials);
    # only _eval_metric is read by _parse_eval_result.
    evaluator = RaiServiceEvaluatorBase.__new__(RaiServiceEvaluatorBase)
    evaluator._eval_metric = EvaluationMetrics.XPIA

    result = evaluator._parse_eval_result(eval_result)

    # Output keys MUST use "xpia" prefix for backward compatibility
    assert "xpia_label" in result, f"Expected 'xpia_label' in result, got keys: {list(result.keys())}"
    assert "xpia_reason" in result
    assert "xpia_manipulated_content" in result
    assert "xpia_intrusion" in result
    assert "xpia_information_gathering" in result
    # Must NOT have "indirect_attack_" prefixed keys
    assert "indirect_attack_label" not in result
    assert "indirect_attack_reason" not in result
    # Verify values; identity checks (`is False`) ensure real booleans,
    # not merely falsy values (fixes PEP 8 E712 `== False` comparisons).
    assert result["xpia_label"] is False
    assert result["xpia_reason"] == "No indirect attack detected."
    assert result["xpia_manipulated_content"] is False
    assert result["xpia_intrusion"] is False
    assert result["xpia_information_gathering"] is False
    assert result["xpia_prompt_tokens"] == "10"
    assert result["xpia_completion_tokens"] == "20"

def test_parse_eval_result_xpia_legacy_indirect_attack_keys_renamed(self):
    """Test that legacy responses with 'indirect_attack_' keys are renamed to 'xpia_'.

    Legacy (non-sync_evals) endpoints may return already-flattened dicts
    keyed with the "indirect_attack_" prefix; _parse_eval_result must
    rewrite each key to the "xpia_" prefix while preserving the values.
    """
    from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase
    from azure.ai.evaluation._common.constants import EvaluationMetrics

    # Simulate a legacy response that uses "indirect_attack_" prefix
    legacy_result = {
        "indirect_attack_label": True,
        "indirect_attack_reason": "Attack detected",
        "indirect_attack_manipulated_content": True,
        "indirect_attack_intrusion": False,
        "indirect_attack_information_gathering": False,
        "indirect_attack_total_tokens": "100",
    }

    # __new__ bypasses __init__; only _eval_metric is read by the parser.
    evaluator = RaiServiceEvaluatorBase.__new__(RaiServiceEvaluatorBase)
    evaluator._eval_metric = EvaluationMetrics.XPIA

    result = evaluator._parse_eval_result(legacy_result)

    # Legacy "indirect_attack_" keys should be renamed to "xpia_"
    assert "xpia_label" in result
    assert "xpia_reason" in result
    assert "xpia_manipulated_content" in result
    assert "indirect_attack_label" not in result
    # Identity check (`is True`) ensures a real boolean survives the rename
    # (fixes PEP 8 E712 `== True` comparison).
    assert result["xpia_label"] is True
    assert result["xpia_reason"] == "Attack detected"

def test_parse_eval_result_xpia_with_camelcase_sub_metrics(self):
    """Test that camelCase sub-metric keys from service are handled correctly.

    The service may return "manipulatedContent" / "informationGathering"
    instead of the snake_case variants; the parser must map them to the
    snake_case "xpia_*" output columns.
    """
    from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase
    from azure.ai.evaluation._common.constants import EvaluationMetrics

    # Service may return camelCase keys like "manipulatedContent"
    eval_result = {
        "results": [
            {
                "metric": "indirect_attack",
                "score": 0.0,
                "reason": "Attack detected.",
                "properties": {
                    "scoreProperties": {
                        "label": "true",
                        "manipulatedContent": "true",
                        "intrusion": "false",
                        "informationGathering": "true",
                    },
                },
            }
        ]
    }

    # __new__ bypasses __init__; only _eval_metric is read by the parser.
    evaluator = RaiServiceEvaluatorBase.__new__(RaiServiceEvaluatorBase)
    evaluator._eval_metric = EvaluationMetrics.XPIA

    result = evaluator._parse_eval_result(eval_result)

    # Identity checks ensure real booleans (fixes PEP 8 E712 comparisons);
    # the unused `import math` from the original has been removed.
    assert result["xpia_label"] is True
    assert result["xpia_manipulated_content"] is True
    assert result["xpia_intrusion"] is False
    assert result["xpia_information_gathering"] is True
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,25 @@ def test_aggregate_label_defect_metrics_with_nan_in_details(self):
assert defect_rates["evaluator.protected_material_details.detail1_defect_rate"] == 0.5
assert defect_rates["evaluator.protected_material_details.detail2_defect_rate"] == 0.5

def test_quotation_fix_test_data(self, quotation_fix_test_data):
def test_aggregate_label_defect_metrics_xpia_uses_xpia_prefix(self):
    """Verify xpia_label columns (IndirectAttackEvaluator output) aggregate correctly.

    The aggregator should recognize "indirect_attack.xpia_label" as a
    label column and emit an "indirect_attack.xpia_defect_rate" entry.
    """
    import pandas as pd

    # IndirectAttackEvaluator emits xpia_label, not indirect_attack_label;
    # two of four rows are defects, so the expected rate is 0.5.
    frame = pd.DataFrame({"indirect_attack.xpia_label": [True, False, True, False]})

    label_cols, defect_rates = _aggregate_label_defect_metrics(frame)

    # The label column should be recognized
    assert "indirect_attack.xpia_label" in label_cols
    # The defect rate should use the xpia prefix
    assert "indirect_attack.xpia_defect_rate" in defect_rates
    assert defect_rates["indirect_attack.xpia_defect_rate"] == 0.5


from test_evaluators.test_inputs_evaluators import QuotationFixEval

result = evaluate(
Expand Down
Loading