Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_4da51ba6ab"
"Tag": "python/evaluation/azure-ai-evaluation_802ffe01e9"
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,47 @@

LOGGER = logging.getLogger(__name__)

# Request-side metric names that are spelled differently on the two endpoints.
#   key   -> name understood by the sync evals endpoint
#   value -> name understood by the legacy annotation endpoint
# The table is consulted in both directions (sync -> legacy here, legacy -> sync via
# the inverted table below), so list a metric only when the name sent in the API
# request really differs.
# XPIA and ECI are deliberately absent: for those the legacy API is driven by the
# annotation_task rather than MetricList, so the request name needs no remapping —
# only the key under which the response comes back does.
_SYNC_TO_LEGACY_METRIC_NAMES: Dict[str, str] = {
    "hate_unfairness": "hate_fairness",
    "groundedness": "generic_groundedness",
}

# Response-side lookup: the key under which the annotation API may return a result
# when it is not the sync metric name. Contains every request-side entry plus the
# metrics whose response key alone differs.
_SYNC_TO_LEGACY_RESPONSE_KEYS: Dict[str, str] = {
    **_SYNC_TO_LEGACY_METRIC_NAMES,
    "indirect_attack": "xpia",
    "election_critical_information": "eci",
}

# Inverse of the request-side table (legacy name -> sync name), computed once at import.
# Only request names are inverted, so "xpia" and "eci" have no reverse entry.
_LEGACY_TO_SYNC_METRIC_NAMES: Dict[str, str] = {
    legacy_name: sync_name for sync_name, legacy_name in _SYNC_TO_LEGACY_METRIC_NAMES.items()
}


def _normalize_metric_for_endpoint(metric_name, use_legacy_endpoint, metric_display_name=None):
    """Translate a metric name into the spelling the target endpoint expects.

    :param metric_name: The metric to evaluate, either a plain string or an object
        exposing a ``value`` attribute (in which case ``value`` is used for the lookup).
    :param use_legacy_endpoint: Truthy when the request goes to the legacy annotation
        endpoint; falsy when it goes to the sync evals endpoint.
    :param metric_display_name: Optional name to use for output keys.
    :return: A ``(metric_name, metric_display_name)`` tuple. When the name has a
        different spelling on the target endpoint, the translated name is returned.
        On the legacy path the display name then falls back to the caller's original
        metric name so output keys keep their spelling; on the sync path the display
        name is passed through untouched. When no translation applies, both inputs
        are returned exactly as given.
    """
    # Look up by the underlying string so enum-like inputs and plain strings behave alike.
    raw_name = getattr(metric_name, "value", metric_name)

    if use_legacy_endpoint:
        translated = _SYNC_TO_LEGACY_METRIC_NAMES.get(raw_name)
        if not translated:
            return metric_name, metric_display_name
        # Keep the caller-facing name for output keys unless one was supplied explicitly.
        return translated, (metric_display_name or raw_name)

    translated = _LEGACY_TO_SYNC_METRIC_NAMES.get(raw_name)
    if not translated:
        return metric_name, metric_display_name
    return translated, metric_display_name


USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
"DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
}
Expand Down Expand Up @@ -453,9 +494,19 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
)
result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
return result
# Check for metric_name in response; also check legacy response key name if different.
# Note: parse_response is only called from legacy endpoint functions (evaluate_with_rai_service
# and evaluate_with_rai_service_multimodal), so this fallback is inherently legacy-only.
response_key = metric_name
if metric_name not in batch_response[0]:
return {}
response = batch_response[0][metric_name]
legacy_key = _SYNC_TO_LEGACY_RESPONSE_KEYS.get(
metric_name.value if hasattr(metric_name, "value") else metric_name
)
if legacy_key and legacy_key in batch_response[0]:
response_key = legacy_key
else:
return {}
response = batch_response[0][response_key]
response = response.replace("false", "False")
response = response.replace("true", "True")
parsed_response = literal_eval(response)
Expand Down Expand Up @@ -547,13 +598,23 @@ def _parse_content_harm_response(
}

response = batch_response[0]
# Check for metric_name in response; also check legacy response key name if different.
# Note: _parse_content_harm_response is only called from parse_response, which is
# only called from legacy endpoint functions, so this fallback is inherently legacy-only.
response_key = metric_name
if metric_name not in response:
return result
legacy_key = _SYNC_TO_LEGACY_RESPONSE_KEYS.get(
metric_name.value if hasattr(metric_name, "value") else metric_name
)
if legacy_key and legacy_key in response:
response_key = legacy_key
else:
return result

try:
harm_response = literal_eval(response[metric_name])
harm_response = literal_eval(response[response_key])
except Exception: # pylint: disable=broad-exception-caught
harm_response = response[metric_name]
harm_response = response[response_key]

total_tokens = 0
prompt_tokens = 0
Expand Down Expand Up @@ -1044,6 +1105,10 @@ async def evaluate_with_rai_service_sync(
:return: The EvalRunOutputItem containing the evaluation results (or parsed dict if legacy).
:rtype: Union[EvalRunOutputItem, Dict[str, Union[str, float]]]
"""
metric_name, metric_display_name = _normalize_metric_for_endpoint(
metric_name, use_legacy_endpoint, metric_display_name
)

# Route to legacy endpoint if requested
if use_legacy_endpoint:
return await evaluate_with_rai_service(
Expand Down Expand Up @@ -1261,13 +1326,16 @@ async def evaluate_with_rai_service_sync_multimodal(
:return: The EvalRunOutputItem or legacy response payload.
:rtype: Union[Dict, EvalRunOutputItem]
"""
metric_name, metric_display_name = _normalize_metric_for_endpoint(metric_name, use_legacy_endpoint)

# Route to legacy endpoint if requested
if use_legacy_endpoint:
return await evaluate_with_rai_service_multimodal(
messages=messages,
metric_name=metric_name,
project_scope=project_scope,
credential=credential,
metric_display_name=metric_display_name,
)

# Sync evals endpoint implementation (default)
Expand Down Expand Up @@ -1316,6 +1384,7 @@ async def evaluate_with_rai_service_multimodal(
metric_name: str,
project_scope: Union[str, AzureAIProject],
credential: TokenCredential,
metric_display_name: Optional[str] = None,
):
"""Evaluate the content safety of the response using Responsible AI service (legacy endpoint)
:param messages: The normalized list of messages.
Expand All @@ -1327,6 +1396,8 @@ async def evaluate_with_rai_service_multimodal(
:type project_scope: Union[str, AzureAIProject]
:param credential: The Azure authentication credential.
:type credential: ~azure.core.credentials.TokenCredential
:param metric_display_name: The display name for the metric in output keys. If None, uses metric_name.
:type metric_display_name: Optional[str]
:return: The parsed annotation result.
:rtype: List[List[Dict]]
"""
Expand All @@ -1341,7 +1412,7 @@ async def evaluate_with_rai_service_multimodal(
await ensure_service_availability_onedp(client, token, Tasks.CONTENT_HARM)
operation_id = await submit_multimodal_request_onedp(client, messages, metric_name, token)
annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
result = parse_response(annotation_response, metric_name)
result = parse_response(annotation_response, metric_name, metric_display_name)
return result
else:
token = await fetch_or_reuse_token(credential)
Expand All @@ -1350,5 +1421,5 @@ async def evaluate_with_rai_service_multimodal(
# Submit annotation request and fetch result
operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
result = parse_response(annotation_response, metric_name)
result = parse_response(annotation_response, metric_name, metric_display_name)
return result
Original file line number Diff line number Diff line change
Expand Up @@ -125,17 +125,31 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
"""Evaluates content according to this evaluator's metric.
Evaluates each turn separately to maintain per-turn granularity.
When using the legacy endpoint, sends the entire conversation in a single call
(matching pre-sync-migration behavior) via the sync wrapper for metric normalization.
"""
validate_conversation(conversation)
messages = conversation["messages"]

# Convert enum to string value
metric_value = self._eval_metric.value if hasattr(self._eval_metric, "value") else self._eval_metric

# Extract conversation turns (user-assistant pairs)
if self._use_legacy_endpoint:
# Legacy path: send entire conversation in a single call (pre-sync-migration behavior)
# Route through evaluate_with_rai_service_sync_multimodal for metric normalization.
result = await evaluate_with_rai_service_sync_multimodal(
messages=messages,
metric_name=metric_value,
project_scope=self._azure_ai_project,
credential=self._credential,
use_legacy_endpoint=True,
)
# Wrap as single-turn result and aggregate to produce evaluation_per_turn structure
return self._aggregate_results([result])

# Sync path: evaluate each turn separately for per-turn granularity
turns = self._extract_turns(messages)

# Evaluate each turn separately
per_turn_results = []
for turn in turns:
turn_result = await evaluate_with_rai_service_sync_multimodal(
Expand Down Expand Up @@ -213,6 +227,10 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
use_legacy_endpoint=self._use_legacy_endpoint,
)

# Legacy endpoint returns a pre-parsed dict from parse_response(); return directly
if self._use_legacy_endpoint:
return eval_result

# Parse the EvalRunOutputItem format to the expected dict format
return self._parse_eval_result(eval_result)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_sync
from azure.ai.evaluation._common.utils import (
get_default_threshold_for_evaluator,
is_onedp_project,
)
from azure.ai.evaluation._evaluate._utils import _write_output

Expand Down Expand Up @@ -154,15 +153,9 @@ async def evaluate_conversation(
try:
self.logger.debug(f"Evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}")

use_sync_endpoint = is_onedp_project(self.azure_ai_project)
effective_metric_name = (
"hate_unfairness" if use_sync_endpoint and metric_name == "hate_fairness" else metric_name
)

@retry(**self.retry_config["network_retry"])
async def evaluate_with_rai_service_with_retry():
try:
# Always use sync_evals endpoint for all projects
return await evaluate_with_rai_service_sync(
data=query_response,
metric_name=metric_name,
Expand All @@ -171,6 +164,7 @@ async def evaluate_with_rai_service_with_retry():
annotation_task=annotation_task,
scan_session_id=self.scan_session_id,
use_legacy_endpoint=self._use_legacy_endpoint,
evaluator_name=f"RedTeam.{risk_category.value}",
)
except (
httpx.ConnectTimeout,
Expand Down Expand Up @@ -213,7 +207,7 @@ async def evaluate_with_rai_service_with_retry():

# Find the result matching our metric/risk category
eval_result = None
lookup_names = {metric_name, risk_cat_value, effective_metric_name}
lookup_names = {metric_name, risk_cat_value}
for result_item in results:
result_dict = result_item if isinstance(result_item, dict) else result_item.__dict__
result_name = str(result_dict.get("name") or "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def __init__(
risk_category: RiskCategory,
logger: Optional[logging.Logger] = None,
dataset_config: Optional[Any] = None,
use_legacy_endpoint: bool = False,
) -> None:
"""Initialize the RAI Service scorer.

Expand All @@ -56,6 +57,8 @@ def __init__(
:type logger: Optional[logging.Logger]
:param dataset_config: Optional DatasetConfiguration for context lookup
:type dataset_config: Optional[Any]
:param use_legacy_endpoint: Whether to use the legacy evaluation endpoint. Defaults to False.
:type use_legacy_endpoint: bool
"""
# Initialize with a default validator for text scoring
super().__init__(validator=ScorerPromptValidator(supported_data_types=["text"]))
Expand All @@ -64,6 +67,7 @@ def __init__(
self.risk_category = risk_category
self.logger = logger or logging.getLogger(__name__)
self.dataset_config = dataset_config
self._use_legacy_endpoint = use_legacy_endpoint
self._context_lookup: Dict[str, Dict[str, Any]] = {}

# Build context lookup from dataset config if provided
Expand Down Expand Up @@ -160,18 +164,17 @@ async def _score_piece_async(
metric_name = get_metric_from_risk_category(self.risk_category)
annotation_task = get_annotation_task_from_risk_category(self.risk_category)

# The Sync API requires "hate_unfairness" instead of "hate_fairness"
if metric_name == "hate_fairness":
metric_name = "hate_unfairness"
# Metric name normalization (hate_fairness ↔ hate_unfairness) is handled by
# evaluate_with_rai_service_sync based on the use_legacy_endpoint flag.

try:
# Call RAI service using sync_evals endpoint (matches legacy path behavior)
eval_result = await evaluate_with_rai_service_sync(
eval_input,
metric_name=metric_name,
project_scope=self.azure_ai_project,
credential=self.credential,
annotation_task=annotation_task,
use_legacy_endpoint=self._use_legacy_endpoint,
)

self.logger.debug(f"RAI evaluation result: {eval_result}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from azure.ai.evaluation._constants import EvaluationMetrics, _InternalEvaluationMetrics
from azure.ai.evaluation._common.constants import Tasks, _InternalAnnotationTasks

# Mapping of risk categories to their corresponding evaluation metrics
# Note: For HateUnfairness, the mapping defaults to HATE_FAIRNESS, but the Sync API
# (used for all projects) requires HATE_UNFAIRNESS instead.
# This is handled dynamically in _evaluation_processor.py.
# Mapping of risk categories to their corresponding evaluation metrics.
# Note: HateUnfairness maps to HATE_FAIRNESS because that is the metric name the legacy
# annotation endpoint recognizes. The bidirectional mapping between hate_fairness and
# hate_unfairness is handled at the routing layer in evaluate_with_rai_service_sync(),
# which normalizes the metric name based on the endpoint being used (legacy vs sync).
RISK_CATEGORY_METRIC_MAP = {
RiskCategory.Violence: EvaluationMetrics.VIOLENCE,
RiskCategory.HateUnfairness: EvaluationMetrics.HATE_FAIRNESS,
Expand Down
3 changes: 2 additions & 1 deletion sdk/evaluation/azure-ai-evaluation/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,8 +326,9 @@ def simple_conversation():
@pytest.fixture
def redirect_openai_requests():
"""Route requests from the openai package to the test proxy."""
proxy_url = PROXY_URL() if callable(PROXY_URL) else PROXY_URL
config = TestProxyConfig(
recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL()
recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=proxy_url
)

with TestProxyHttpxClientBase.record_with_proxy(config):
Expand Down
Loading
Loading