braintrust-sdk-python/py/src/braintrust/integrations/utils.py at main · braintrustdata/braintrust-sdk-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
"""Shared tracing utilities for Braintrust SDK integrations.

These helpers are common building blocks used across multiple provider
integrations. Keeping them here avoids duplication and makes behavioral fixes
propagate to all providers at once.

Names are prefixed with ``_`` so that consumer modules can import them
directly without aliasing (e.g. ``from braintrust.integrations.utils import
_try_to_dict``).
"""

import base64
import binascii
import mimetypes
import os
import re
import time
import warnings
from collections.abc import Callable, Mapping
from dataclasses import dataclass
from numbers import Real
from typing import Any

from braintrust.logger import Attachment, Span
from braintrust.util import is_numeric


# Matches base64 data URLs of the form ``data:<mime type>;base64,<payload>``.
# Group 1 captures the MIME type and group 2 captures the base64 payload.
_DATA_URL_RE = re.compile(r"^data:([^;]+);base64,(.+)$")

# Keep these overrides narrow and deterministic across platforms. Python's
# mimetypes registry varies by OS (notably on Windows), which can otherwise
# produce verbose vendor-subtype suffixes instead of common file extensions.
# Keys are MIME types; values are file extensions without the leading dot.
_KNOWN_ATTACHMENT_EXTENSIONS = {
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
}


def _try_to_dict(obj: Any) -> dict[str, Any] | Any:
    """Best-effort conversion of an SDK response object to a plain dict.

    Tries, in order:
      1. ``model_dump(mode="python")`` (preferred for Pydantic v2 objects)
      2. ``model_dump()``               (fallback for SDKs with custom signatures)
      3. ``to_dict()``                  (used by some provider SDK response objects)
      4. ``dict()``                     (Pydantic v1 / legacy)
      5. ``vars(obj)``                  (plain Python attribute bags)
      6. returns *obj* unchanged

    Only dict-like conversion results are accepted; non-dict results are
    ignored so later fallbacks still run.

    Pydantic serializer warnings (common with generic/discriminated-union
    models such as OpenAI's ``ParsedResponse[T]``) are suppressed.
    """
    if isinstance(obj, dict):
        return obj

    model_dump = getattr(obj, "model_dump", None)

    def _call_model_dump_python() -> Any:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Pydantic serializer warnings", category=UserWarning)
            return model_dump(mode="python")

    def _call_model_dump() -> Any:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Pydantic serializer warnings", category=UserWarning)
            return model_dump()

    to_dict = getattr(obj, "to_dict", None)
    dict_method = getattr(obj, "dict", None)

    converters: list[Callable[[], Any]] = []
    if callable(model_dump):
        converters.extend((_call_model_dump_python, _call_model_dump))
    if callable(to_dict):
        converters.append(to_dict)
    if callable(dict_method):
        converters.append(dict_method)
    converters.append(lambda: vars(obj))

    for converter in converters:
        try:
            result = converter()
        except Exception:
            continue
        if isinstance(result, dict):
            return result

    return obj


def _camel_to_snake(value: str) -> str:
    """Return *value* rewritten from camelCase / PascalCase to snake_case.

    Every uppercase character is lowercased and preceded by an underscore;
    any leading underscores are then stripped from the result.
    """
    converted = "".join(f"_{char.lower()}" if char.isupper() else char for char in value)
    return converted.lstrip("_")


def _is_supported_metric_value(value: Any) -> bool:
    """Tell whether *value* is a real number that is not a ``bool``."""
    if isinstance(value, bool):
        # ``bool`` is a numeric subtype, so it has to be ruled out explicitly.
        return False
    return isinstance(value, Real)


def _attachment_filename_for_mime_type(mime_type: str, *, prefix: str = "file") -> str:
    """Build ``"<prefix>.<extension>"`` for *mime_type*.

    The extension is chosen from the first source that provides one:

    1. the ``_KNOWN_ATTACHMENT_EXTENSIONS`` override table
       (e.g. the spreadsheet MIME type maps to ``xlsx``);
    2. ``mimetypes.guess_extension`` with the leading dot removed;
    3. the MIME subtype up to any ``+`` suffix (``image/svg+xml`` -> ``svg``),
       or ``bin`` when *mime_type* contains no ``/``.
    """
    known = _KNOWN_ATTACHMENT_EXTENSIONS.get(mime_type)
    if known is not None:
        return f"{prefix}.{known}"

    guessed = mimetypes.guess_extension(mime_type)
    if guessed:
        return f"{prefix}.{guessed.lstrip('.')}"

    # Last resort: derive the extension from the subtype itself.
    _, slash, subtype = mime_type.partition("/")
    suffix = subtype if slash else "bin"
    return f"{prefix}.{suffix.split('+', 1)[0]}"


@dataclass(frozen=True)
class _ResolvedAttachment:
    """Immutable wrapper around an :class:`Attachment` with derived accessors.

    All accessors read from ``attachment.reference``.
    """

    attachment: Attachment

    @property
    def mime_type(self) -> str:
        """Content type from the reference, or ``application/octet-stream`` when unset."""
        content_type = self.attachment.reference.get("content_type")
        return content_type if content_type else "application/octet-stream"

    @property
    def filename(self) -> str:
        """Filename from the reference, or ``file`` when unset."""
        name = self.attachment.reference.get("filename")
        return name if name else "file"

    @property
    def is_image(self) -> bool:
        """Whether :attr:`mime_type` is an ``image/*`` type."""
        return self.mime_type.startswith("image/")

    @property
    def multimodal_part_payload(self) -> dict[str, Any]:
        """Return the ``image_url`` payload for images and the ``file`` payload otherwise."""
        if not self.is_image:
            return {"file": {"file_data": self.attachment, "filename": self.filename}}
        return {"image_url": {"url": self.attachment}}


def _label_for_mime_type(mime_type: str, label: str | None) -> str:
    if label is not None:
        return label
    return "image" if mime_type.startswith("image/") else "file"


def _default_attachment_filename(
    mime_type: str,
    *,
    filename: str | None = None,
    label: str | None = None,
    prefix: str | None = None,
) -> str:
    return filename or _attachment_filename_for_mime_type(
        mime_type,
        prefix=prefix or _label_for_mime_type(mime_type, label),
    )


def _resolved_attachment_from_bytes(
    data: bytes | bytearray,
    mime_type: str,
    *,
    filename: str | None = None,
    label: str | None = None,
    prefix: str | None = None,
) -> _ResolvedAttachment:
    """Wrap raw bytes in an :class:`Attachment` and return it resolved.

    The filename comes from ``_default_attachment_filename``; a
    ``bytearray`` payload is copied into immutable ``bytes`` first.
    """
    name = _default_attachment_filename(mime_type, filename=filename, label=label, prefix=prefix)
    payload = data if isinstance(data, bytes) else bytes(data)
    return _ResolvedAttachment(
        attachment=Attachment(data=payload, filename=name, content_type=mime_type),
    )


def _resolved_attachment_from_base64(
    data: str,
    mime_type: str,
    *,
    filename: str | None = None,
    label: str | None = None,
    prefix: str | None = None,
) -> _ResolvedAttachment | None:
    """Decode a base64 string (or the payload of a ``data:`` URL) into an attachment.

    When *data* starts with ``data:``, only the text after the first comma is
    decoded. Decoding is strict (``validate=True``); ``None`` is returned
    when the payload is not valid base64.
    """
    encoded = data
    if data.startswith("data:"):
        _, _, encoded = data.partition(",")

    try:
        payload = base64.b64decode(encoded, validate=True)
    except (binascii.Error, ValueError):
        return None

    return _resolved_attachment_from_bytes(payload, mime_type, filename=filename, label=label, prefix=prefix)


def _materialize_attachment(
    value: Any,
    *,
    mime_type: str | None = None,
    filename: str | None = None,
    label: str | None = None,
    prefix: str | None = None,
) -> _ResolvedAttachment | None:
    """Resolve common attachment inputs into a concrete attachment object.

    Supports existing :class:`Attachment` objects, bytes-like data, raw base64
    strings, data URLs, filesystem paths, file-like objects, and common
    ``(filename, value, content_type)`` tuple inputs.

    Args:
        value: The input to resolve.
        mime_type: Explicit content type. Takes precedence over any type
            carried by, or guessed from, *value*.
        filename: Explicit filename. Takes precedence over any name carried
            by, or derived from, *value*.
        label: Stem for a generated filename when *prefix* is not given.
        prefix: Stem for a generated filename.

    Returns:
        The resolved attachment, or ``None`` when *value* is ``None``, has an
        unsupported type, or is a string that can be neither read as a file
        nor decoded as base64.

    Note:
        A plain string is first tried as a filesystem path. It is decoded as
        raw base64 only when opening it fails *and* a *mime_type* was
        supplied. Exceptions raised by a file-like object's ``read()``
        propagate to the caller.
    """
    if value is None:
        return None

    if isinstance(value, Attachment):
        ref_ct = value.reference.get("content_type")
        ref_fn = value.reference.get("filename")
        resolved_mime_type = mime_type or ref_ct or "application/octet-stream"
        resolved_filename = (
            filename
            or ref_fn
            or _default_attachment_filename(
                resolved_mime_type,
                label=label,
                prefix=prefix,
            )
        )
        # Rebuild the attachment only when the requested metadata differs from
        # what it already carries; otherwise reuse the original object.
        if ref_ct != resolved_mime_type or ref_fn != resolved_filename:
            attachment = Attachment(
                data=value.data,
                filename=resolved_filename,
                content_type=resolved_mime_type,
            )
            return _ResolvedAttachment(attachment=attachment)
        return _ResolvedAttachment(attachment=value)

    if isinstance(value, tuple):
        # ``(filename, value, content_type)``: unpack defensively, then recurse
        # on the payload. Explicit keyword arguments win over tuple members.
        tuple_filename = value[0] if value and isinstance(value[0], (str, os.PathLike)) else None
        tuple_value = value[1] if len(value) > 1 else None
        tuple_content_type = value[2] if len(value) > 2 and isinstance(value[2], str) else None
        return _materialize_attachment(
            tuple_value,
            mime_type=mime_type or tuple_content_type,
            filename=filename or (os.path.basename(os.fspath(tuple_filename)) if tuple_filename is not None else None),
            label=label,
            prefix=prefix,
        )

    if isinstance(value, (bytes, bytearray)):
        resolved_mime_type = (
            mime_type
            or (mimetypes.guess_type(filename)[0] if filename is not None else None)
            or "application/octet-stream"
        )
        return _resolved_attachment_from_bytes(
            value, resolved_mime_type, filename=filename, label=label, prefix=prefix
        )

    if isinstance(value, (str, os.PathLike)):
        path_or_data = os.fspath(value)
        data_url_match = _DATA_URL_RE.match(path_or_data) if isinstance(value, str) else None
        if data_url_match:
            data_url_mime_type, _ = data_url_match.groups()
            return _resolved_attachment_from_base64(
                path_or_data,
                mime_type or data_url_mime_type,
                filename=filename,
                label=label,
                prefix=prefix,
            )

        try:
            with open(path_or_data, "rb") as file_obj:
                data = file_obj.read()
        except (OSError, ValueError):
            # ``open()`` raises ValueError rather than OSError for names it
            # cannot pass to the OS at all (an embedded NUL byte, or text that
            # cannot be encoded). Such a string is simply "not a readable
            # path" and gets the same fallback as a missing file.
            if isinstance(value, str) and mime_type is not None:
                return _resolved_attachment_from_base64(
                    value,
                    mime_type,
                    filename=filename,
                    label=label,
                    prefix=prefix,
                )
            return None

        resolved_filename = filename or os.path.basename(path_or_data)
        resolved_mime_type = mime_type or mimetypes.guess_type(resolved_filename)[0] or "application/octet-stream"
        return _resolved_attachment_from_bytes(
            data,
            resolved_mime_type,
            filename=resolved_filename,
            label=label,
            prefix=prefix,
        )

    read = getattr(value, "read", None)
    if callable(read):
        file_name_attr = getattr(value, "name", None)
        resolved_filename = filename or (os.path.basename(file_name_attr) if isinstance(file_name_attr, str) else None)
        resolved_mime_type = (
            mime_type
            or (mimetypes.guess_type(resolved_filename)[0] if resolved_filename is not None else None)
            or "application/octet-stream"
        )

        # Remember the stream position (when the object supports it) so the
        # caller's file object can be rewound after it has been read here.
        position = None
        try:
            position = value.tell()
        except Exception:
            pass

        try:
            data = value.read()
        finally:
            if position is not None:
                try:
                    value.seek(position)
                except Exception:
                    pass

        # Text-mode streams yield ``str``; encode with the default codec.
        if isinstance(data, str):
            data = data.encode()
        if isinstance(data, (bytes, bytearray)):
            return _resolved_attachment_from_bytes(
                data,
                resolved_mime_type,
                filename=resolved_filename,
                label=label,
                prefix=prefix,
            )
        return None

    return None


def _materialize_chat_message_content_part(part: Any) -> Any:
    """Materialize binary payloads inside one OpenAI-style message content part.

    Handles the three part types that Braintrust integrations commonly see in
    chat-completions ``messages``:

    - ``{"type": "image_url", "image_url": {"url": ...}}``
    - ``{"type": "input_audio", "input_audio": {"data": ..., "format": ...}}``
    - ``{"type": "file", "file": {"file_data": ..., "filename": ...}}``

    Data URLs, raw base64 strings, and bytes are converted into
    :class:`braintrust.logger.Attachment` objects; plain remote URLs and
    already-materialized attachments pass through unchanged. Unrecognized part
    shapes are returned untouched.
    """
    if not isinstance(part, dict):
        return part

    part_type = part.get("type")
    if part_type == "image_url":
        image_url = part.get("image_url")
        url = image_url.get("url") if isinstance(image_url, dict) else image_url
        if isinstance(url, str) and url.startswith("data:"):
            resolved = _materialize_attachment(url)
            if resolved is not None:
                return {**part, "image_url": {"url": resolved.attachment}}
    elif part_type == "input_audio":
        audio = part.get("input_audio") if isinstance(part.get("input_audio"), dict) else {}
        data = audio.get("data")
        fmt = audio.get("format")
        if isinstance(data, str) and data:
            mime = f"audio/{fmt}" if fmt else None
            resolved = _materialize_attachment(data, mime_type=mime)
            if resolved is not None:
                return {**part, "input_audio": {**audio, "data": resolved.attachment}}
    elif part_type == "file":
        file_obj = part.get("file") if isinstance(part.get("file"), dict) else {}
        data = file_obj.get("file_data")
        if isinstance(data, str) and data:
            filename = file_obj.get("filename")
            resolved = _materialize_attachment(data, filename=filename if isinstance(filename, str) else None)
            if resolved is not None:
                return {**part, "file": {**file_obj, "file_data": resolved.attachment}}

    return part


def _normalize_chat_messages(messages: Any) -> Any:
    """Return *messages* with binary multimodal content parts materialized.

    Anything that is not a ``list`` is returned unchanged. For a list, a new
    list is built: each ``dict`` message whose ``content`` is a ``list`` is
    shallow-copied with every part passed through
    :func:`_materialize_chat_message_content_part`; every other element is
    carried over as the same object.
    """
    if not isinstance(messages, list):
        return messages

    def _normalize_one(message: Any) -> Any:
        if isinstance(message, dict):
            parts = message.get("content")
            if isinstance(parts, list):
                return {**message, "content": [_materialize_chat_message_content_part(item) for item in parts]}
        return message

    return [_normalize_one(message) for message in messages]


# Maps lowercase audio format names to MIME types. ``_infer_audio_mime_type``
# consults this table when a response carries no ``content-type`` header.
_AUDIO_FORMAT_TO_MIME_TYPE = {
    "mp3": "audio/mpeg",
    "wav": "audio/wav",
    "opus": "audio/opus",
    "aac": "audio/aac",
    "flac": "audio/flac",
    "pcm": "audio/pcm",
    "pcm16": "audio/pcm",
}


def _infer_audio_mime_type(response: Any, response_format: Any = None) -> str:
    """Work out the MIME type of an audio response.

    Sources are consulted in this order:

    1. the ``content-type`` header of ``response.response`` (or of
       ``response["response"]`` for mappings), with any ``;`` parameters
       dropped;
    2. *response_format*, lowercased and looked up in
       ``_AUDIO_FORMAT_TO_MIME_TYPE``; an unknown format is returned as-is
       when it already contains ``/`` and as ``audio/<format>`` otherwise;
    3. ``application/octet-stream``.
    """
    raw_response = getattr(response, "response", None)
    if raw_response is None and isinstance(response, Mapping):
        raw_response = response.get("response")

    headers = getattr(raw_response, "headers", None)
    header_value = headers.get("content-type") if headers is not None else None
    if isinstance(header_value, str) and header_value:
        return header_value.partition(";")[0].strip()

    if not (isinstance(response_format, str) and response_format):
        return "application/octet-stream"

    format_key = response_format.lower()
    fallback = format_key if "/" in format_key else f"audio/{format_key}"
    return _AUDIO_FORMAT_TO_MIME_TYPE.get(format_key, fallback)


def _extract_audio_output(
    response: Any,
    *,
    response_format: Any = None,
    prefix: str = "generated_audio",
) -> dict[str, Any]:
    """Build the logged output dict for an audio response.

    Audio bytes are read from ``response.content``, or from the ``content``
    of ``response["response"]`` when *response* is a mapping. Without bytes
    the result is just ``{"type": "audio"}``. Otherwise it also carries
    ``mime_type`` and ``audio_size_bytes``, plus the attachment's multimodal
    payload when the bytes could be materialized.
    """
    payload = getattr(response, "content", None)
    if not isinstance(payload, (bytes, bytearray)) and isinstance(response, Mapping):
        payload = getattr(response.get("response"), "content", None)

    if not isinstance(payload, (bytes, bytearray)):
        return {"type": "audio"}

    mime_type = _infer_audio_mime_type(response, response_format)
    resolved = _materialize_attachment(payload, mime_type=mime_type, prefix=prefix)

    output: dict[str, Any] = {
        "type": "audio",
        "mime_type": mime_type,
        "audio_size_bytes": len(payload),
    }
    if resolved is not None:
        # Prefer the type recorded on the attachment, then add its payload keys.
        output["mime_type"] = resolved.mime_type
        output.update(resolved.multimodal_part_payload)
    return output


def _is_not_given(value: object) -> bool:
    """Report whether *value* is a provider "parameter omitted" sentinel.

    The check compares the type's name against ``NotGiven`` and ``Omit``, so
    no provider SDK has to be imported. ``None`` is never a sentinel, and any
    error raised while inspecting the type counts as "not a sentinel".
    """
    if value is not None:
        try:
            return type(value).__name__ in {"NotGiven", "Omit"}
        except Exception:
            pass
    return False


def _serialize_response_format(response_format: Any) -> Any:
    """Turn a Pydantic ``BaseModel`` subclass into a JSON-schema dict for logging.

    A model class becomes ``{"type": "json_schema", "json_schema": {"name":
    ..., "schema": ...}}`` so span metadata shows a readable schema instead of
    a Python class reference. Every other value, and every value when
    Pydantic is not installed, is returned unchanged.
    """
    try:
        from pydantic import BaseModel
    except ImportError:
        return response_format

    is_model_class = isinstance(response_format, type) and issubclass(response_format, BaseModel)
    if not is_model_class:
        return response_format

    # NOTE(review): ``model_json_schema`` is the Pydantic v2 spelling; confirm
    # that Pydantic v1 model classes never reach this point.
    return {
        "type": "json_schema",
        "json_schema": {
            "name": response_format.__name__,
            "schema": response_format.model_json_schema(),
        },
    }


def _prettify_response_params(params: dict[str, Any], *, drop_not_given: bool = False) -> dict[str, Any]:
    """Return a shallow copy of traced request params with logging-friendly values.

    With *drop_not_given*, entries whose value is a provider omitted-parameter
    sentinel are removed. A ``response_format`` entry, when present, is
    replaced by its serialized form. *params* itself is never modified.
    """
    prettified = params.copy()
    if drop_not_given:
        prettified = {name: val for name, val in prettified.items() if not _is_not_given(val)}

    if "response_format" not in prettified:
        return prettified

    prettified["response_format"] = _serialize_response_format(prettified["response_format"])
    return prettified


def _parse_openai_usage_metrics(
    usage: Any,
    *,
    token_name_map: Mapping[str, str],
    token_prefix_map: Mapping[str, str],
) -> dict[str, Any]:
    """Parse usage payloads that follow OpenAI's ``*_tokens`` conventions.

    *usage* is first converted with ``_try_to_dict``; a falsy or
    non-convertible payload yields an empty dict. Numeric top-level entries
    are emitted under ``token_name_map.get(name, name)``. For each
    ``<x>_tokens_details`` dict, numeric members are emitted as
    ``<prefix>_<member>``, where the prefix is
    ``token_prefix_map.get(x, x)``.
    """
    metrics: dict[str, Any] = {}

    if not usage:
        return metrics

    usage = _try_to_dict(usage)
    if not isinstance(usage, dict):
        return metrics

    details_suffix = "_tokens_details"
    for name, value in usage.items():
        if not name.endswith(details_suffix):
            if is_numeric(value):
                metrics[token_name_map.get(name, name)] = value
            continue

        # Details entries are only expanded when they are plain dicts.
        if not isinstance(value, dict):
            continue
        raw_prefix = name[: -len(details_suffix)]
        prefix = token_prefix_map.get(raw_prefix, raw_prefix)
        metrics.update(
            {f"{prefix}_{member}": amount for member, amount in value.items() if is_numeric(amount)}
        )

    return metrics


def _timing_metrics(start_time: float, end_time: float, first_token_time: float | None = None) -> dict[str, float]:
    """Build a standard ``start / end / duration`` metrics dict.

    Optionally includes ``time_to_first_token`` when *first_token_time*
    is provided.
    """
    metrics: dict[str, float] = {
        "start": start_time,
        "end": end_time,
        "duration": end_time - start_time,
    }
    if first_token_time is not None:
        metrics["time_to_first_token"] = first_token_time - start_time
    return metrics


def _merge_timing_and_usage_metrics(
    start_time: float,
    usage: Any,
    usage_parser: Callable[[Any], dict[str, Any]],
    first_token_time: float | None = None,
) -> dict[str, Any]:
    """Merge timing metrics (ending now) with the metrics parsed from *usage*.

    The end time is taken from ``time.time()`` at call time. Keys produced by
    *usage_parser* override timing keys of the same name.
    """
    timing = _timing_metrics(start_time, time.time(), first_token_time)
    parsed = usage_parser(usage)
    return {**timing, **parsed}


def _log_and_end_span(
    span: Span,
    *,
    output: Any = None,
    metrics: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
) -> None:
    """Log whichever of *output*, *metrics* and *metadata* are present, then end *span*.

    *output* is logged unless it is ``None``; *metrics* and *metadata* are
    logged only when truthy. ``span.log`` is skipped when nothing qualifies;
    ``span.end`` is always called.
    """
    fields = (
        ("output", output, output is not None),
        ("metrics", metrics, bool(metrics)),
        ("metadata", metadata, bool(metadata)),
    )
    payload: dict[str, Any] = {key: val for key, val, wanted in fields if wanted}
    if payload:
        span.log(**payload)
    span.end()


def _log_error_and_end_span(span: Span, error: BaseException) -> None:
    """Log an error to *span* and immediately end it.

    Args:
        span: The span to record the error on; it is ended by this call.
        error: The exception passed to ``span.log`` as ``error``.
    """
    span.log(error=error)
    span.end()