42 changes: 26 additions & 16 deletions py/src/braintrust/logger.py
@@ -2777,7 +2777,22 @@ def _helper(v: Any) -> Any:
return event


def _validate_and_sanitize_metadata(metadata: object) -> dict[str, Any]:
if not isinstance(metadata, dict):
metadata = bt_safe_deep_copy(metadata)

if not isinstance(metadata, dict):
raise ValueError("metadata must be a dictionary or serialize to a dictionary")

for key in metadata.keys():
if not isinstance(key, str):
raise ValueError("metadata keys must be strings")

return metadata


def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any]) -> dict[str, Any]:
event = dict(event)
scores = event.get("scores")
if scores:
for name, score in scores.items():
@@ -2796,13 +2811,8 @@ def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any])
if score < 0 or score > 1:
raise ValueError("score values must be between 0 and 1")

metadata = event.get("metadata")
if metadata:
if not isinstance(metadata, dict):
raise ValueError("metadata must be a dictionary")
for key in metadata.keys():
if not isinstance(key, str):
raise ValueError("metadata keys must be strings")
if "metadata" in event and event["metadata"] is not None:
event["metadata"] = _validate_and_sanitize_metadata(event["metadata"])

metrics = event.get("metrics")
if metrics:
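
In effect, the new `_validate_and_sanitize_metadata` helper accepts a plain dict as-is, converts anything else through `bt_safe_deep_copy` (which, per the tests below, ends up calling `model_dump(exclude_none=True)` on objects that expose it), and then requires a dict with string keys. A minimal behavioral sketch — the error messages match the helper above, but the exact serialization path is an assumption about how `bt_safe_deep_copy` handles Pydantic-style objects, and the helper is imported here only for illustration:

from pydantic import BaseModel

from braintrust.logger import _validate_and_sanitize_metadata  # private helper, shown for illustration

class RequestMetadata(BaseModel):
    user_id: str
    region: str | None = None

_validate_and_sanitize_metadata({"user_id": "u-1"})              # plain dicts pass through unchanged
_validate_and_sanitize_metadata(RequestMetadata(user_id="u-1"))  # serialized to {"user_id": "u-1"} before validation
_validate_and_sanitize_metadata(["not", "a", "dict"])            # ValueError: metadata must be a dictionary or serialize to a dictionary
_validate_and_sanitize_metadata({1: "bad"})                      # ValueError: metadata keys must be strings
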
@@ -3381,7 +3391,7 @@ def _log_feedback_impl(
expected: Any | None = None,
tags: Sequence[str] | None = None,
comment: str | None = None,
metadata: Metadata | None = None,
metadata: object | None = None,
source: Literal["external", "app", "api", None] = None,
):
if source is None:
@@ -3818,7 +3828,7 @@ def log(
error: str | None = None,
tags: Sequence[str] | None = None,
scores: Mapping[str, int | float] | None = None,
metadata: Metadata | None = None,
metadata: object | None = None,
metrics: Mapping[str, int | float] | None = None,
id: str | None = None,
dataset_record_id: str | None = None,
@@ -3832,7 +3842,7 @@
:param expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
:param error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
:param scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
:param metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
:param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
:param tags: (Optional) a list of strings that you can use to filter and group records later.
:param metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
:param id: (Optional) a unique identifier for the event. If you don't provide one, BrainTrust will generate one for you.
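
Concretely, the widened `metadata` type means an experiment can log a Pydantic model (or any object that serializes to a dict) directly. A short usage sketch, assuming Pydantic is installed; the project name and field names are illustrative:

import braintrust
from pydantic import BaseModel

class RunMetadata(BaseModel):
    prompt: str
    temperature: float | None = None

experiment = braintrust.init(project="my-project")  # illustrative project name
experiment.log(
    input="What is 2 + 2?",
    output="4",
    scores={"accuracy": 1.0},
    # Converted to a dict (None fields dropped) and validated for string keys before logging.
    metadata=RunMetadata(prompt="math-v1"),
)
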
@@ -3870,7 +3880,7 @@ def log_feedback(
expected: Any | None = None,
tags: Sequence[str] | None = None,
comment: str | None = None,
metadata: Metadata | None = None,
metadata: object | None = None,
source: Literal["external", "app", "api", None] = None,
) -> None:
"""
@@ -3881,7 +3891,7 @@
:param expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not.
:param tags: (Optional) a list of strings that you can use to filter and group records later.
:param comment: (Optional) an optional comment string to log about the event.
:param metadata: (Optional) a dictionary with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event.
:param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
:param source: (Optional) the source of the feedback. Must be one of "external" (default), "app", or "api".
"""
return _log_feedback_impl(
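
The same object-valued `metadata` is accepted by `log_feedback` (here and on `Logger.log_feedback` below), where it is attached to the event's audit log rather than the event itself. A hedged sketch, reusing the experiment from the previous example; the event id is made up:

class FeedbackMetadata(BaseModel):
    user_id: str

experiment.log_feedback(
    id="event-id",  # id of a previously logged event (illustrative)
    scores={"quality": 1.0},
    comment="Looks correct",
    # Stored on the audit log attached to the event, not on the event itself.
    metadata=FeedbackMetadata(user_id="user-1"),
)
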
@@ -5251,7 +5261,7 @@ def log(
error: str | None = None,
tags: Sequence[str] | None = None,
scores: Mapping[str, int | float] | None = None,
metadata: Metadata | None = None,
metadata: object | None = None,
metrics: Mapping[str, int | float] | None = None,
id: str | None = None,
allow_concurrent_with_spans: bool = False,
@@ -5265,7 +5275,7 @@
:param error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
:param tags: (Optional) a list of strings that you can use to filter and group records later.
:param scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
:param metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
:param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
:param metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
:param id: (Optional) a unique identifier for the event. If you don't provide one, BrainTrust will generate one for you.
:param allow_concurrent_with_spans: (Optional) in rare cases where you need to log at the top level separately from using spans on the logger elsewhere, set this to True.
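
`Logger.log` gets the same treatment, and — as the tests below exercise — the conversion is duck-typed: any object whose `model_dump(exclude_none=True)` returns a dict with string keys is accepted, not only Pydantic models. A sketch under that assumption, using an illustrative dataclass and project name:

from dataclasses import asdict, dataclass

from braintrust import init_logger

@dataclass
class TraceMetadata:
    request_id: str
    cache_hit: bool = False

    def model_dump(self, **kwargs):
        # Mimic the duck-typed hook the SDK calls; exclude_none is irrelevant for this dataclass.
        return asdict(self)

logger = init_logger(project="my-project")  # illustrative project name
logger.log(
    input="hello",
    output="world",
    metadata=TraceMetadata(request_id="req-123"),
)
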
@@ -5302,7 +5312,7 @@ def log_feedback(
expected: Any | None = None,
tags: Sequence[str] | None = None,
comment: str | None = None,
metadata: Metadata | None = None,
metadata: object | None = None,
source: Literal["external", "app", "api", None] = None,
) -> None:
"""
@@ -5313,7 +5323,7 @@
:param expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not.
:param tags: (Optional) a list of strings that you can use to filter and group records later.
:param comment: (Optional) an optional comment string to log about the event.
:param metadata: (Optional) a dictionary with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event.
:param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
:param source: (Optional) the source of the feedback. Must be one of "external" (default), "app", or "api".
"""
return _log_feedback_impl(
105 changes: 104 additions & 1 deletion py/src/braintrust/test_logger.py
@@ -22,6 +22,7 @@
init_logger,
logger,
)
from braintrust.db_fields import AUDIT_METADATA_FIELD
from braintrust.id_gen import OTELIDGenerator, get_id_generator
from braintrust.logger import (
RemoteEvalParameters,
@@ -838,6 +839,107 @@ def test_span_log_with_simple_circular_reference(with_memory_logger):
assert "circular" in logged_output["self"].lower()


def test_span_log_accepts_pydantic_model_metadata(with_memory_logger):
try:
from pydantic import BaseModel
except ImportError:
pytest.skip("Pydantic not available")

class MetadataModel(BaseModel):
foo: str = "bar"

logger = init_test_logger(__name__)

with logger.start_span(name="test_span") as span:
span.log(input=MetadataModel(), metadata=MetadataModel())

logs = with_memory_logger.pop()
assert len(logs) == 1
assert logs[0]["input"] == {"foo": "bar"}
assert logs[0]["metadata"] == {"foo": "bar"}


class _ModelDumpMetadata:
def __init__(self, **values):
self.values = values

def model_dump(self, **kwargs):
assert kwargs == {"exclude_none": True}
return dict(self.values)


def test_span_log_accepts_model_dump_metadata(with_memory_logger):
logger = init_test_logger(__name__)

with logger.start_span(name="test_span") as span:
span.log(metadata=_ModelDumpMetadata(foo="bar"))

logs = with_memory_logger.pop()
assert len(logs) == 1
assert logs[0]["metadata"] == {"foo": "bar"}


def test_logger_log_accepts_model_dump_metadata(with_memory_logger):
logger = init_test_logger(__name__)

logger.log(input="input", output="output", metadata=_ModelDumpMetadata(foo="bar"))

logs = with_memory_logger.pop()
assert len(logs) == 1
assert logs[0]["metadata"] == {"foo": "bar"}


def test_experiment_log_accepts_model_dump_metadata(with_memory_logger):
experiment = init_test_exp("test-experiment", "test-project")

experiment.log(input="input", output="output", scores={"score": 1}, metadata=_ModelDumpMetadata(foo="bar"))

logs = with_memory_logger.pop()
assert len(logs) == 1
assert logs[0]["metadata"] == {"foo": "bar"}


def test_logger_log_feedback_accepts_model_dump_metadata(with_memory_logger):
logger = init_test_logger(__name__)

logger.log_feedback(id="event-id", scores={"score": 1}, metadata=_ModelDumpMetadata(user_id="user-1"))

logs = with_memory_logger.pop()
assert len(logs) == 1
assert logs[0][AUDIT_METADATA_FIELD] == {"user_id": "user-1"}


def test_experiment_log_feedback_accepts_model_dump_metadata(with_memory_logger):
experiment = init_test_exp("test-experiment", "test-project")

experiment.log_feedback(id="event-id", scores={"score": 1}, metadata=_ModelDumpMetadata(user_id="user-1"))

logs = with_memory_logger.pop()
assert len(logs) == 1
assert logs[0][AUDIT_METADATA_FIELD] == {"user_id": "user-1"}


def test_span_log_rejects_metadata_with_non_string_keys(with_memory_logger):
logger = init_test_logger(__name__)

with logger.start_span(name="test_span") as span:
with pytest.raises(ValueError, match="metadata keys must be strings"):
span.log(metadata={1: "bad"})


def test_span_log_rejects_metadata_that_serializes_to_non_dict(with_memory_logger):
class BadMetadata:
def model_dump(self, **kwargs):
assert kwargs == {"exclude_none": True}
return ["not", "metadata"]

logger = init_test_logger(__name__)

with logger.start_span(name="test_span") as span:
with pytest.raises(ValueError, match="metadata must be a dictionary or serialize to a dictionary"):
span.log(metadata=BadMetadata())


def test_span_log_with_nested_circular_reference(with_memory_logger):
"""Test that span.log() with nested circular reference works gracefully."""
logger = init_test_logger(__name__)
@@ -2919,12 +3021,13 @@ def test_update_span_includes_span_id_and_root_span_id_from_export(with_memory_l

with_memory_logger.pop()

braintrust.update_span(exported=exported, output="updated output")
braintrust.update_span(exported=exported, output="updated output", metadata=_ModelDumpMetadata(foo="bar"))

logs = with_memory_logger.pop()
updated_log = next(log for log in logs if log.get("output") == "updated output")
assert updated_log["span_id"] == span_id
assert updated_log["root_span_id"] == root_span_id
assert updated_log["metadata"] == {"foo": "bar"}


def test_get_exporter_returns_v3_by_default():
Expand Down