Skip to content

Commit 5fe29f2

Browse files
committed
optimization: extract and upload base64 files to s3
Adds attachment processing to standard Braintrust tracing: scan `braintrust.input_json` and `braintrust.output_json` for base64 images and, for each image, extract the base64 string, upload it to S3, and replace the base64 image with a pointer to the S3 object.
1 parent 28dd598 commit 5fe29f2

11 files changed

Lines changed: 1536 additions & 28 deletions

File tree

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
package dev.braintrust.trace;
2+
3+
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.node.TextNode;
import dev.braintrust.json.BraintrustJsonMapper;
import java.time.Duration;
import java.util.Base64;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
15+
16+
/**
17+
* Scans JSON content for base64 data URI attachments and replaces them with attachment references
18+
* after uploading to S3.
19+
*
20+
* <p>Package-private; not exposed in the public API.
21+
*/
22+
@Slf4j
23+
class AttachmentProcessor {
24+
/**
25+
* Quick heuristic for whether a JSON payload contains a base64-encoded data URI.
26+
*
27+
* <p>Used as a fail-fast check so that payloads without a match skip JSON parsing entirely.
*
* <p>Capture group 1 is the media type; capture group 2 is the base64 payload, which must be
* at least 20 characters long to match.
28+
*/
29+
static final Pattern BASE64_DATA_URI_PATTERN =
30+
Pattern.compile("data:([\\w/\\-.+]+);base64,([A-Za-z0-9+/=]{20,})");
31+
32+
// Receives decoded attachment bytes via enqueue(); also queried for (and told to) shut down.
private final AttachmentUploader uploader;
33+
34+
// Stores the given uploader as-is; no validation happens here.
AttachmentProcessor(AttachmentUploader uploader) {
35+
this.uploader = uploader;
36+
}
37+
38+
/**
39+
* Scans a JSON string for base64 data URIs, uploads them, and returns the modified JSON with
40+
* attachment references.
41+
*
42+
* @param json the JSON string to scan
43+
* @return the modified JSON with base64 data replaced by attachment references, or the original
44+
* JSON if no base64 data was found
45+
*/
46+
String processAndUpload(String json) {
47+
if (uploader.isShutdown()
48+
|| json == null
49+
|| !BASE64_DATA_URI_PATTERN.matcher(json).find()) {
50+
return json;
51+
}
52+
53+
try {
54+
JsonNode root = BraintrustJsonMapper.get().readTree(json);
55+
AtomicBoolean modified = new AtomicBoolean(false);
56+
JsonNode result = replaceBase64Attachments(root, modified);
57+
return modified.get() ? BraintrustJsonMapper.get().writeValueAsString(result) : json;
58+
} catch (UploaderRejectionException e) {
59+
log.debug("uploader optimization failed, falling back to span uploads", e);
60+
uploader.shutdown(Duration.ofSeconds(0)); // don't block
61+
return json;
62+
} catch (Exception e) {
63+
throw new RuntimeException("Failed to process attachments in JSON", e);
64+
}
65+
}
66+
67+
// NOTE: not concerned with recursion blowing the stack because we're mutating AI vendor
68+
// messages which are not deep enough for this to be a concern
69+
private JsonNode replaceBase64Attachments(JsonNode node, AtomicBoolean modified) {
70+
if (node.isTextual()) {
71+
return replaceInText((TextNode) node, modified);
72+
} else if (node.isObject()) {
73+
ObjectNode objectNode = (ObjectNode) node;
74+
ObjectNode result = objectNode.deepCopy();
75+
var fieldNames = objectNode.fieldNames();
76+
while (fieldNames.hasNext()) {
77+
String fieldName = fieldNames.next();
78+
JsonNode child = objectNode.get(fieldName);
79+
result.set(fieldName, replaceBase64Attachments(child, modified));
80+
}
81+
return result;
82+
} else if (node.isArray()) {
83+
ArrayNode arrayNode = (ArrayNode) node;
84+
ArrayNode result = arrayNode.deepCopy();
85+
for (int i = 0; i < arrayNode.size(); i++) {
86+
result.set(i, replaceBase64Attachments(arrayNode.get(i), modified));
87+
}
88+
return result;
89+
}
90+
return node;
91+
}
92+
93+
@SneakyThrows
94+
private JsonNode replaceInText(TextNode textNode, AtomicBoolean modified) {
95+
String value = textNode.asText();
96+
Matcher matcher = BASE64_DATA_URI_PATTERN.matcher(value);
97+
if (!matcher.find()) {
98+
return textNode;
99+
}
100+
if (!isEntirelyDataUri(value)) {
101+
log.debug("found base64 string but text contained extra content {}", value);
102+
return textNode;
103+
}
104+
105+
matcher.reset();
106+
StringBuilder sb = new StringBuilder();
107+
while (matcher.find()) {
108+
String contentType = matcher.group(1);
109+
String base64Data = matcher.group(2);
110+
byte[] data = Base64.getDecoder().decode(base64Data);
111+
112+
String extension = contentTypeToExtension(contentType);
113+
String filename = "attachment" + extension;
114+
AttachmentReference ref = AttachmentReference.create(filename, contentType);
115+
116+
if (!uploader.enqueue(ref, data)) {
117+
throw new UploaderRejectionException("uploader rejected attachment upload");
118+
}
119+
120+
String replacement =
121+
"{\"type\":\"braintrust_attachment\",\"content_type\":\""
122+
+ contentType
123+
+ "\",\"filename\":\""
124+
+ filename
125+
+ "\",\"key\":\""
126+
+ ref.key()
127+
+ "\"}";
128+
129+
matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
130+
}
131+
matcher.appendTail(sb);
132+
133+
modified.set(true);
134+
135+
return BraintrustJsonMapper.get().readTree(sb.toString());
136+
}
137+
138+
static boolean isEntirelyDataUri(String value) {
139+
String trimmed = value.trim();
140+
return trimmed.startsWith("data:")
141+
&& !trimmed.contains("\"")
142+
&& !trimmed.contains("\\")
143+
&& !trimmed.contains(" ");
144+
}
145+
146+
private static String contentTypeToExtension(String contentType) {
147+
switch (contentType.toLowerCase()) {
148+
case "image/png":
149+
return ".png";
150+
case "image/jpeg":
151+
case "image/jpg":
152+
return ".jpg";
153+
case "image/gif":
154+
return ".gif";
155+
case "image/webp":
156+
return ".webp";
157+
case "image/svg+xml":
158+
return ".svg";
159+
case "application/pdf":
160+
return ".pdf";
161+
case "text/plain":
162+
return ".txt";
163+
case "application/json":
164+
return ".json";
165+
default:
166+
String[] parts = contentType.split("/");
167+
if (parts.length == 2) {
168+
return "." + parts[1].split("[;\\-]")[0];
169+
}
170+
return "";
171+
}
172+
}
173+
174+
private static class UploaderRejectionException extends RuntimeException {
175+
public UploaderRejectionException(String message) {
176+
super(message);
177+
}
178+
}
179+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package dev.braintrust.trace;
2+
3+
import java.util.Objects;
4+
import java.util.UUID;
5+
import javax.annotation.Nonnull;
6+
7+
/**
8+
* Represents an attachment reference stored on a span in place of uploaded attachment data.
9+
*
10+
* <p>Its shape intentionally matches the cross-SDK Braintrust attachment reference format.
11+
*/
12+
record AttachmentReference(
13+
@Nonnull String type,
14+
@Nonnull String filename,
15+
@Nonnull String contentType,
16+
@Nonnull String key) {
17+
18+
private static final String DEFAULT_TYPE = "braintrust_attachment";
19+
20+
/**
21+
* Creates an attachment reference with a generated UUID key.
22+
*
23+
* @param filename the display filename for the attachment
24+
* @param contentType the MIME type of the attachment content
25+
* @return a new AttachmentReference with a unique key
26+
*/
27+
static AttachmentReference create(@Nonnull String filename, @Nonnull String contentType) {
28+
Objects.requireNonNull(filename, "filename cannot be null");
29+
Objects.requireNonNull(contentType, "contentType cannot be null");
30+
return new AttachmentReference(
31+
DEFAULT_TYPE, filename, contentType, UUID.randomUUID().toString());
32+
}
33+
}

0 commit comments

Comments
 (0)