diff --git a/py/src/braintrust/logger.py b/py/src/braintrust/logger.py index 02457b59..8352c42b 100644 --- a/py/src/braintrust/logger.py +++ b/py/src/braintrust/logger.py @@ -2777,7 +2777,22 @@ def _helper(v: Any) -> Any: return event +def _validate_and_sanitize_metadata(metadata: object) -> dict[str, Any]: + if not isinstance(metadata, dict): + metadata = bt_safe_deep_copy(metadata) + + if not isinstance(metadata, dict): + raise ValueError("metadata must be a dictionary or serialize to a dictionary") + + for key in metadata.keys(): + if not isinstance(key, str): + raise ValueError("metadata keys must be strings") + + return metadata + + def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any]) -> dict[str, Any]: + event = dict(event) scores = event.get("scores") if scores: for name, score in scores.items(): @@ -2796,13 +2811,8 @@ def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any]) if score < 0 or score > 1: raise ValueError("score values must be between 0 and 1") - metadata = event.get("metadata") - if metadata: - if not isinstance(metadata, dict): - raise ValueError("metadata must be a dictionary") - for key in metadata.keys(): - if not isinstance(key, str): - raise ValueError("metadata keys must be strings") + if "metadata" in event and event["metadata"] is not None: + event["metadata"] = _validate_and_sanitize_metadata(event["metadata"]) metrics = event.get("metrics") if metrics: @@ -3381,7 +3391,7 @@ def _log_feedback_impl( expected: Any | None = None, tags: Sequence[str] | None = None, comment: str | None = None, - metadata: Metadata | None = None, + metadata: object | None = None, source: Literal["external", "app", "api", None] = None, ): if source is None: @@ -3818,7 +3828,7 @@ def log( error: str | None = None, tags: Sequence[str] | None = None, scores: Mapping[str, int | float] | None = None, - metadata: Metadata | None = None, + metadata: object | None = None, metrics: Mapping[str, int | float] | None = None, id: str | None = None, dataset_record_id: str | None = None, @@ -3832,7 +3842,7 @@ def log( :param expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models. :param error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception. :param scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments. - :param metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings. + :param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings. :param tags: (Optional) a list of strings that you can use to filter and group records later. :param metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end". :param id: (Optional) a unique identifier for the event. If you don't provide one, BrainTrust will generate one for you. @@ -3870,7 +3880,7 @@ def log_feedback( expected: Any | None = None, tags: Sequence[str] | None = None, comment: str | None = None, - metadata: Metadata | None = None, + metadata: object | None = None, source: Literal["external", "app", "api", None] = None, ) -> None: """ @@ -3881,7 +3891,7 @@ def log_feedback( :param expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. :param tags: (Optional) a list of strings that you can use to filter and group records later. :param comment: (Optional) an optional comment string to log about the event. - :param metadata: (Optional) a dictionary with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event. + :param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event. The values in `metadata` can be any JSON-serializable type, but its keys must be strings. :param source: (Optional) the source of the feedback. Must be one of "external" (default), "app", or "api". """ return _log_feedback_impl( @@ -5251,7 +5261,7 @@ def log( error: str | None = None, tags: Sequence[str] | None = None, scores: Mapping[str, int | float] | None = None, - metadata: Metadata | None = None, + metadata: object | None = None, metrics: Mapping[str, int | float] | None = None, id: str | None = None, allow_concurrent_with_spans: bool = False, @@ -5265,7 +5275,7 @@ def log( :param error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception. :param tags: (Optional) a list of strings that you can use to filter and group records later. :param scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs. - :param metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings. + :param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings. :param metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end". :param id: (Optional) a unique identifier for the event. If you don't provide one, BrainTrust will generate one for you. :param allow_concurrent_with_spans: (Optional) in rare cases where you need to log at the top level separately from using spans on the logger elsewhere, set this to True. @@ -5302,7 +5312,7 @@ def log_feedback( expected: Any | None = None, tags: Sequence[str] | None = None, comment: str | None = None, - metadata: Metadata | None = None, + metadata: object | None = None, source: Literal["external", "app", "api", None] = None, ) -> None: """ @@ -5313,7 +5323,7 @@ def log_feedback( :param expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. :param tags: (Optional) a list of strings that you can use to filter and group records later. :param comment: (Optional) an optional comment string to log about the event. - :param metadata: (Optional) a dictionary with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event. + :param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event. The values in `metadata` can be any JSON-serializable type, but its keys must be strings. :param source: (Optional) the source of the feedback. Must be one of "external" (default), "app", or "api". """ return _log_feedback_impl( diff --git a/py/src/braintrust/test_logger.py b/py/src/braintrust/test_logger.py index c341bead..8c5fcef4 100644 --- a/py/src/braintrust/test_logger.py +++ b/py/src/braintrust/test_logger.py @@ -22,6 +22,7 @@ init_logger, logger, ) +from braintrust.db_fields import AUDIT_METADATA_FIELD from braintrust.id_gen import OTELIDGenerator, get_id_generator from braintrust.logger import ( RemoteEvalParameters, @@ -838,6 +839,107 @@ def test_span_log_with_simple_circular_reference(with_memory_logger): assert "circular" in logged_output["self"].lower() +def test_span_log_accepts_pydantic_model_metadata(with_memory_logger): + try: + from pydantic import BaseModel + except ImportError: + pytest.skip("Pydantic not available") + + class MetadataModel(BaseModel): + foo: str = "bar" + + logger = init_test_logger(__name__) + + with logger.start_span(name="test_span") as span: + span.log(input=MetadataModel(), metadata=MetadataModel()) + + logs = with_memory_logger.pop() + assert len(logs) == 1 + assert logs[0]["input"] == {"foo": "bar"} + assert logs[0]["metadata"] == {"foo": "bar"} + + +class _ModelDumpMetadata: + def __init__(self, **values): + self.values = values + + def model_dump(self, **kwargs): + assert kwargs == {"exclude_none": True} + return dict(self.values) + + +def test_span_log_accepts_model_dump_metadata(with_memory_logger): + logger = init_test_logger(__name__) + + with logger.start_span(name="test_span") as span: + span.log(metadata=_ModelDumpMetadata(foo="bar")) + + logs = with_memory_logger.pop() + assert len(logs) == 1 + assert logs[0]["metadata"] == {"foo": "bar"} + + +def test_logger_log_accepts_model_dump_metadata(with_memory_logger): + logger = init_test_logger(__name__) + + logger.log(input="input", output="output", metadata=_ModelDumpMetadata(foo="bar")) + + logs = with_memory_logger.pop() + assert len(logs) == 1 + assert logs[0]["metadata"] == {"foo": "bar"} + + +def test_experiment_log_accepts_model_dump_metadata(with_memory_logger): + experiment = init_test_exp("test-experiment", "test-project") + + experiment.log(input="input", output="output", scores={"score": 1}, metadata=_ModelDumpMetadata(foo="bar")) + + logs = with_memory_logger.pop() + assert len(logs) == 1 + assert logs[0]["metadata"] == {"foo": "bar"} + + +def test_logger_log_feedback_accepts_model_dump_metadata(with_memory_logger): + logger = init_test_logger(__name__) + + logger.log_feedback(id="event-id", scores={"score": 1}, metadata=_ModelDumpMetadata(user_id="user-1")) + + logs = with_memory_logger.pop() + assert len(logs) == 1 + assert logs[0][AUDIT_METADATA_FIELD] == {"user_id": "user-1"} + + +def test_experiment_log_feedback_accepts_model_dump_metadata(with_memory_logger): + experiment = init_test_exp("test-experiment", "test-project") + + experiment.log_feedback(id="event-id", scores={"score": 1}, metadata=_ModelDumpMetadata(user_id="user-1")) + + logs = with_memory_logger.pop() + assert len(logs) == 1 + assert logs[0][AUDIT_METADATA_FIELD] == {"user_id": "user-1"} + + +def test_span_log_rejects_metadata_with_non_string_keys(with_memory_logger): + logger = init_test_logger(__name__) + + with logger.start_span(name="test_span") as span: + with pytest.raises(ValueError, match="metadata keys must be strings"): + span.log(metadata={1: "bad"}) + + +def test_span_log_rejects_metadata_that_serializes_to_non_dict(with_memory_logger): + class BadMetadata: + def model_dump(self, **kwargs): + assert kwargs == {"exclude_none": True} + return ["not", "metadata"] + + logger = init_test_logger(__name__) + + with logger.start_span(name="test_span") as span: + with pytest.raises(ValueError, match="metadata must be a dictionary or serialize to a dictionary"): + span.log(metadata=BadMetadata()) + + def test_span_log_with_nested_circular_reference(with_memory_logger): """Test that span.log() with nested circular reference works gracefully.""" logger = init_test_logger(__name__) @@ -2919,12 +3021,13 @@ def test_update_span_includes_span_id_and_root_span_id_from_export(with_memory_l with_memory_logger.pop() - braintrust.update_span(exported=exported, output="updated output") + braintrust.update_span(exported=exported, output="updated output", metadata=_ModelDumpMetadata(foo="bar")) logs = with_memory_logger.pop() updated_log = next(log for log in logs if log.get("output") == "updated output") assert updated_log["span_id"] == span_id assert updated_log["root_span_id"] == root_span_id + assert updated_log["metadata"] == {"foo": "bar"} def test_get_exporter_returns_v3_by_default():