|
| 1 | +package dev.braintrust.trace; |
| 2 | + |
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.node.TextNode;
import dev.braintrust.json.BraintrustJsonMapper;
import java.time.Duration;
import java.util.Base64;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
| 15 | + |
| 16 | +/** |
| 17 | + * Scans JSON content for base64 data URI attachments and replaces them with attachment references |
| 18 | + * after uploading to S3. |
| 19 | + * |
| 20 | + * <p>Package-private; not exposed in the public API. |
| 21 | + */ |
| 22 | +@Slf4j |
| 23 | +class AttachmentProcessor { |
| 24 | + /** |
| 25 | + * quick heuristic to determine if the json payload contains a base64 encoded file |
| 26 | + * |
| 27 | + * <p>This is used for performance reasons as a fail-fast to avoid doing a json parse. |
| 28 | + */ |
| 29 | + static final Pattern BASE64_DATA_URI_PATTERN = |
| 30 | + Pattern.compile("data:([\\w/\\-.+]+);base64,([A-Za-z0-9+/=]{20,})"); |
| 31 | + |
| 32 | + private final AttachmentUploader uploader; |
| 33 | + |
| 34 | + AttachmentProcessor(AttachmentUploader uploader) { |
| 35 | + this.uploader = uploader; |
| 36 | + } |
| 37 | + |
| 38 | + /** |
| 39 | + * Scans a JSON string for base64 data URIs, uploads them, and returns the modified JSON with |
| 40 | + * attachment references. |
| 41 | + * |
| 42 | + * @param json the JSON string to scan |
| 43 | + * @return the modified JSON with base64 data replaced by attachment references, or the original |
| 44 | + * JSON if no base64 data was found |
| 45 | + */ |
| 46 | + String processAndUpload(String json) { |
| 47 | + if (uploader.isShutdown() |
| 48 | + || json == null |
| 49 | + || !BASE64_DATA_URI_PATTERN.matcher(json).find()) { |
| 50 | + return json; |
| 51 | + } |
| 52 | + |
| 53 | + try { |
| 54 | + JsonNode root = BraintrustJsonMapper.get().readTree(json); |
| 55 | + AtomicBoolean modified = new AtomicBoolean(false); |
| 56 | + JsonNode result = replaceBase64Attachments(root, modified); |
| 57 | + return modified.get() ? BraintrustJsonMapper.get().writeValueAsString(result) : json; |
| 58 | + } catch (UploaderRejectionException e) { |
| 59 | + log.debug("uploader optimization failed, falling back to span uploads", e); |
| 60 | + uploader.shutdown(Duration.ofSeconds(0)); // don't block |
| 61 | + return json; |
| 62 | + } catch (Exception e) { |
| 63 | + throw new RuntimeException("Failed to process attachments in JSON", e); |
| 64 | + } |
| 65 | + } |
| 66 | + |
| 67 | + // NOTE: not concerned with recursion blowing the stack because we're mutating AI vendor |
| 68 | + // messages which are not deep enough for this to be a concern |
| 69 | + private JsonNode replaceBase64Attachments(JsonNode node, AtomicBoolean modified) { |
| 70 | + if (node.isTextual()) { |
| 71 | + return replaceInText((TextNode) node, modified); |
| 72 | + } else if (node.isObject()) { |
| 73 | + ObjectNode objectNode = (ObjectNode) node; |
| 74 | + ObjectNode result = objectNode.deepCopy(); |
| 75 | + var fieldNames = objectNode.fieldNames(); |
| 76 | + while (fieldNames.hasNext()) { |
| 77 | + String fieldName = fieldNames.next(); |
| 78 | + JsonNode child = objectNode.get(fieldName); |
| 79 | + result.set(fieldName, replaceBase64Attachments(child, modified)); |
| 80 | + } |
| 81 | + return result; |
| 82 | + } else if (node.isArray()) { |
| 83 | + ArrayNode arrayNode = (ArrayNode) node; |
| 84 | + ArrayNode result = arrayNode.deepCopy(); |
| 85 | + for (int i = 0; i < arrayNode.size(); i++) { |
| 86 | + result.set(i, replaceBase64Attachments(arrayNode.get(i), modified)); |
| 87 | + } |
| 88 | + return result; |
| 89 | + } |
| 90 | + return node; |
| 91 | + } |
| 92 | + |
| 93 | + @SneakyThrows |
| 94 | + private JsonNode replaceInText(TextNode textNode, AtomicBoolean modified) { |
| 95 | + String value = textNode.asText(); |
| 96 | + Matcher matcher = BASE64_DATA_URI_PATTERN.matcher(value); |
| 97 | + if (!matcher.find()) { |
| 98 | + return textNode; |
| 99 | + } |
| 100 | + if (!isEntirelyDataUri(value)) { |
| 101 | + log.debug("found base64 string but text contained extra content {}", value); |
| 102 | + return textNode; |
| 103 | + } |
| 104 | + |
| 105 | + matcher.reset(); |
| 106 | + StringBuilder sb = new StringBuilder(); |
| 107 | + while (matcher.find()) { |
| 108 | + String contentType = matcher.group(1); |
| 109 | + String base64Data = matcher.group(2); |
| 110 | + byte[] data = Base64.getDecoder().decode(base64Data); |
| 111 | + |
| 112 | + String extension = contentTypeToExtension(contentType); |
| 113 | + String filename = "attachment" + extension; |
| 114 | + AttachmentReference ref = AttachmentReference.create(filename, contentType); |
| 115 | + |
| 116 | + if (!uploader.enqueue(ref, data)) { |
| 117 | + throw new UploaderRejectionException("uploader rejected attachment upload"); |
| 118 | + } |
| 119 | + |
| 120 | + String replacement = |
| 121 | + "{\"type\":\"braintrust_attachment\",\"content_type\":\"" |
| 122 | + + contentType |
| 123 | + + "\",\"filename\":\"" |
| 124 | + + filename |
| 125 | + + "\",\"key\":\"" |
| 126 | + + ref.key() |
| 127 | + + "\"}"; |
| 128 | + |
| 129 | + matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement)); |
| 130 | + } |
| 131 | + matcher.appendTail(sb); |
| 132 | + |
| 133 | + modified.set(true); |
| 134 | + |
| 135 | + return BraintrustJsonMapper.get().readTree(sb.toString()); |
| 136 | + } |
| 137 | + |
| 138 | + static boolean isEntirelyDataUri(String value) { |
| 139 | + String trimmed = value.trim(); |
| 140 | + return trimmed.startsWith("data:") |
| 141 | + && !trimmed.contains("\"") |
| 142 | + && !trimmed.contains("\\") |
| 143 | + && !trimmed.contains(" "); |
| 144 | + } |
| 145 | + |
| 146 | + private static String contentTypeToExtension(String contentType) { |
| 147 | + switch (contentType.toLowerCase()) { |
| 148 | + case "image/png": |
| 149 | + return ".png"; |
| 150 | + case "image/jpeg": |
| 151 | + case "image/jpg": |
| 152 | + return ".jpg"; |
| 153 | + case "image/gif": |
| 154 | + return ".gif"; |
| 155 | + case "image/webp": |
| 156 | + return ".webp"; |
| 157 | + case "image/svg+xml": |
| 158 | + return ".svg"; |
| 159 | + case "application/pdf": |
| 160 | + return ".pdf"; |
| 161 | + case "text/plain": |
| 162 | + return ".txt"; |
| 163 | + case "application/json": |
| 164 | + return ".json"; |
| 165 | + default: |
| 166 | + String[] parts = contentType.split("/"); |
| 167 | + if (parts.length == 2) { |
| 168 | + return "." + parts[1].split("[;\\-]")[0]; |
| 169 | + } |
| 170 | + return ""; |
| 171 | + } |
| 172 | + } |
| 173 | + |
| 174 | + private static class UploaderRejectionException extends RuntimeException { |
| 175 | + public UploaderRejectionException(String message) { |
| 176 | + super(message); |
| 177 | + } |
| 178 | + } |
| 179 | +} |
0 commit comments