From c23bbf9c5cbf7e2c613dab6f14b183e33a16f13c Mon Sep 17 00:00:00 2001 From: Puneet Rai Date: Sun, 8 Feb 2026 00:04:37 +0530 Subject: [PATCH] fix(tiff): decode UserComment EXIF tag (Bug #19) Previously, UserComment (tag 0x9286) was returned as raw hex bytes. Now properly decodes the 8-byte charset prefix and text content. Supported encodings: - ASCII: "ASCII\x00\x00\x00" prefix - Unicode: "UNICODE\x00" prefix (UTF-16 LE/BE) - JIS: "JIS\x00\x00\x00\x00\x00" prefix - Undefined: null prefix (treated as UTF-8) - No prefix fallback for malformed data Before: "415343494900000053686f7420..." After: "Shot during golden hour, 3-stop graduated ND filter used" Closes #19 Co-Authored-By: Claude Opus 4.5 --- api_integration_test.go | 2 +- internal/parser/tiff/ifd.go | 168 ++++++++++++++++++++++++++ internal/parser/tiff/ifd_test.go | 200 +++++++++++++++++++++++++++++++ internal/parser/tiff/types.go | 1 + 4 files changed, 370 insertions(+), 1 deletion(-) diff --git a/api_integration_test.go b/api_integration_test.go index 9180dc6..e70bab6 100644 --- a/api_integration_test.go +++ b/api_integration_test.go @@ -283,7 +283,7 @@ func TestIntegration_CR2(t *testing.T) { {Name: "Flash", Value: "Off, Did not fire"}, {Name: "FocalLength", Value: "85/1"}, // MakerNote is now parsed into Canon directory - {Name: "UserComment", Type: "[]byte"}, // Binary data - check presence only + {Name: "UserComment", Type: "string"}, // Decoded comment text (Bug #19 fix) {Name: "FlashpixVersion", Value: "0100"}, {Name: "ColorSpace", Value: "sRGB"}, {Name: "PixelXDimension", Value: uint16(4992)}, diff --git a/internal/parser/tiff/ifd.go b/internal/parser/tiff/ifd.go index e4466c3..f2ecc0f 100644 --- a/internal/parser/tiff/ifd.go +++ b/internal/parser/tiff/ifd.go @@ -141,6 +141,14 @@ func (p *Parser) parseTag(r *imxbin.Reader, entry *IFDEntry, dirName string) (*p value = decoded } + // Decode UserComment (EXIF tag 0x9286) + // Format: 8-byte charset prefix + variable-length text + if entry.Tag == TagUserComment { + if data, ok := value.([]byte); ok { + value = decodeUserComment(data) + } + } + return &parser.Tag{ ID: parser.TagID(fmt.Sprintf("%s:0x%04X", dirName, entry.Tag)), Name: tagName, @@ -664,3 +672,163 @@ func getTagNameForDir(tag uint16, dirName string) string { return getTIFFTagName(tag) // Default to TIFF tags } } + +// UserComment charset prefixes (8 bytes each) +var ( + userCommentASCII = []byte("ASCII\x00\x00\x00") + userCommentUnicode = []byte("UNICODE\x00") + userCommentJIS = []byte("JIS\x00\x00\x00\x00\x00") + userCommentUndefined = []byte("\x00\x00\x00\x00\x00\x00\x00\x00") +) + +// decodeUserComment decodes EXIF UserComment tag (0x9286). +// Format: 8-byte charset prefix + variable-length text. +// +// Charset prefixes: +// - "ASCII\x00\x00\x00" (8 bytes) - ASCII text +// - "UNICODE\x00" (8 bytes) - UTF-16 text (big or little endian) +// - "JIS\x00\x00\x00\x00\x00" (8 bytes) - JIS encoding +// - "\x00\x00\x00\x00\x00\x00\x00\x00" (8 bytes) - undefined (often UTF-8) +// +// Returns decoded string or original bytes if decoding fails. +func decodeUserComment(data []byte) interface{} { + const prefixLen = 8 + + // Need at least the charset prefix + if len(data) < prefixLen { + return data + } + + prefix := data[:prefixLen] + content := data[prefixLen:] + + // Empty content + if len(content) == 0 { + return "" + } + + switch { + case bytes.Equal(prefix, userCommentASCII): + // Trim trailing nulls for ASCII + return string(bytes.TrimRight(content, "\x00")) + + case bytes.Equal(prefix, userCommentUnicode): + // UTF-16 - don't trim nulls before decoding (nulls are part of UTF-16) + // decodeUTF16 handles null termination internally + return decodeUTF16(content) + + case bytes.Equal(prefix, userCommentJIS): + // JIS encoding - return as-is for now (rarely used) + // Full JIS decoding would require additional dependencies + return string(bytes.TrimRight(content, "\x00")) + + case bytes.Equal(prefix, userCommentUndefined): + // Undefined charset - often UTF-8 in practice + return string(bytes.TrimRight(content, "\x00")) + + default: + // Unknown prefix - check if it looks like text without prefix + // Some cameras omit the charset prefix entirely + if isValidUTF8(data) { + return string(bytes.TrimRight(data, "\x00")) + } + return data + } +} + +// decodeUTF16 decodes UTF-16 encoded data to a UTF-8 string. +// Detects byte order from BOM if present, otherwise assumes little-endian. +func decodeUTF16(data []byte) string { + if len(data) < 2 { + return "" + } + + // Check for BOM + var littleEndian bool + start := 0 + if data[0] == 0xFF && data[1] == 0xFE { + littleEndian = true + start = 2 + } else if data[0] == 0xFE && data[1] == 0xFF { + littleEndian = false + start = 2 + } else { + // No BOM - assume little-endian (most common on Windows) + littleEndian = true + } + + // Decode UTF-16 to UTF-8 + data = data[start:] + if len(data)%2 != 0 { + data = data[:len(data)-1] // Truncate odd byte + } + + var result []rune + for i := 0; i < len(data); i += 2 { + var r uint16 + if littleEndian { + r = uint16(data[i]) | uint16(data[i+1])<<8 + } else { + r = uint16(data[i])<<8 | uint16(data[i+1]) + } + + // Handle surrogate pairs for characters outside BMP + if r >= 0xD800 && r <= 0xDBFF && i+2 < len(data) { + var r2 uint16 + if littleEndian { + r2 = uint16(data[i+2]) | uint16(data[i+3])<<8 + } else { + r2 = uint16(data[i+2])<<8 | uint16(data[i+3]) + } + if r2 >= 0xDC00 && r2 <= 0xDFFF { + // Valid surrogate pair + codePoint := 0x10000 + (int(r-0xD800) << 10) + int(r2-0xDC00) + result = append(result, rune(codePoint)) + i += 2 + continue + } + } + + if r == 0 { + break // Null terminator + } + result = append(result, rune(r)) + } + + return string(result) +} + +// isValidUTF8 checks if data is valid UTF-8 text (printable or whitespace). +func isValidUTF8(data []byte) bool { + if len(data) == 0 { + return false + } + + // Trim trailing nulls first - they're common in fixed-size EXIF fields + trimmed := bytes.TrimRight(data, "\x00") + if len(trimmed) == 0 { + return false + } + + // Quick check: count embedded nulls (after trimming trailing ones) + nullCount := 0 + for _, b := range trimmed { + if b == 0 { + nullCount++ + } + } + // If more than 20% embedded nulls, probably binary + if nullCount > len(trimmed)/5 { + return false + } + + // Check if valid UTF-8 + s := string(trimmed) + for _, r := range s { + // Allow printable characters and common whitespace + if r == '\uFFFD' { // Unicode replacement character = invalid UTF-8 + return false + } + } + return true +} diff --git a/internal/parser/tiff/ifd_test.go b/internal/parser/tiff/ifd_test.go index 26dafb1..8fa764e 100644 --- a/internal/parser/tiff/ifd_test.go +++ b/internal/parser/tiff/ifd_test.go @@ -1421,3 +1421,203 @@ func TestGetTagNameForDir(t *testing.T) { } } } + +func TestDecodeUserComment(t *testing.T) { + tests := []struct { + name string + data []byte + want interface{} + wantStr bool // true if want is expected to be a string + }{ + { + name: "ASCII encoding", + data: append([]byte("ASCII\x00\x00\x00"), []byte("Hello World")...), + want: "Hello World", + wantStr: true, + }, + { + name: "ASCII with trailing nulls", + data: append([]byte("ASCII\x00\x00\x00"), []byte("Hello\x00\x00\x00")...), + want: "Hello", + wantStr: true, + }, + { + name: "undefined encoding (UTF-8)", + data: append([]byte("\x00\x00\x00\x00\x00\x00\x00\x00"), []byte("Test comment")...), + want: "Test comment", + wantStr: true, + }, + { + name: "empty content after prefix", + data: []byte("ASCII\x00\x00\x00"), + want: "", + wantStr: true, + }, + { + name: "empty content with trailing nulls", + data: append([]byte("ASCII\x00\x00\x00"), []byte("\x00\x00\x00")...), + want: "", + wantStr: true, + }, + { + name: "too short - less than prefix", + data: []byte("ASCII"), + want: []byte("ASCII"), + wantStr: false, + }, + { + name: "JIS encoding", + data: append([]byte("JIS\x00\x00\x00\x00\x00"), []byte("Test")...), + want: "Test", + wantStr: true, + }, + { + name: "Unicode UTF-16LE with BOM", + data: append([]byte("UNICODE\x00"), append([]byte{0xFF, 0xFE}, []byte{'H', 0, 'i', 0}...)...), + want: "Hi", + wantStr: true, + }, + { + name: "Unicode UTF-16BE with BOM", + data: append([]byte("UNICODE\x00"), append([]byte{0xFE, 0xFF}, []byte{0, 'H', 0, 'i'}...)...), + want: "Hi", + wantStr: true, + }, + { + name: "Unicode UTF-16LE without BOM", + data: append([]byte("UNICODE\x00"), []byte{'T', 0, 'e', 0, 's', 0, 't', 0}...), + want: "Test", + wantStr: true, + }, + { + name: "no prefix - valid UTF-8 text", + data: []byte("Plain text comment"), + want: "Plain text comment", + wantStr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := decodeUserComment(tt.data) + + if tt.wantStr { + gotStr, ok := got.(string) + if !ok { + t.Errorf("decodeUserComment() returned %T, want string", got) + return + } + wantStr := tt.want.(string) + if gotStr != wantStr { + t.Errorf("decodeUserComment() = %q, want %q", gotStr, wantStr) + } + } else { + gotBytes, ok := got.([]byte) + if !ok { + t.Errorf("decodeUserComment() returned %T, want []byte", got) + return + } + wantBytes := tt.want.([]byte) + if !bytes.Equal(gotBytes, wantBytes) { + t.Errorf("decodeUserComment() = %v, want %v", gotBytes, wantBytes) + } + } + }) + } +} + +func TestDecodeUTF16(t *testing.T) { + tests := []struct { + name string + data []byte + want string + }{ + { + name: "empty", + data: []byte{}, + want: "", + }, + { + name: "single byte", + data: []byte{0x41}, + want: "", + }, + { + name: "simple ASCII as UTF-16LE", + data: []byte{'A', 0, 'B', 0, 'C', 0}, + want: "ABC", + }, + { + name: "with LE BOM", + data: []byte{0xFF, 0xFE, 'H', 0, 'i', 0}, + want: "Hi", + }, + { + name: "with BE BOM", + data: []byte{0xFE, 0xFF, 0, 'H', 0, 'i'}, + want: "Hi", + }, + { + name: "null terminated", + data: []byte{'A', 0, 0, 0}, + want: "A", + }, + { + name: "odd length truncated", + data: []byte{'A', 0, 'B'}, + want: "A", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := decodeUTF16(tt.data) + if got != tt.want { + t.Errorf("decodeUTF16() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestIsValidUTF8(t *testing.T) { + tests := []struct { + name string + data []byte + want bool + }{ + { + name: "empty", + data: []byte{}, + want: false, + }, + { + name: "valid ASCII", + data: []byte("Hello World"), + want: true, + }, + { + name: "valid UTF-8", + data: []byte("Héllo Wörld"), + want: true, + }, + { + name: "too many nulls", + data: []byte{0, 0, 0, 0, 'A'}, + want: false, + }, + { + name: "trailing nulls ok", + data: []byte("Test\x00\x00"), + want: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isValidUTF8(tt.data) + if got != tt.want { + t.Errorf("isValidUTF8() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/internal/parser/tiff/types.go b/internal/parser/tiff/types.go index 501a8b8..c406478 100644 --- a/internal/parser/tiff/types.go +++ b/internal/parser/tiff/types.go @@ -39,6 +39,7 @@ const ( TagIPTC uint16 = 0x83BB TagXMP uint16 = 0x02BC // XMLPacket (decimal 700) TagMakerNote uint16 = 0x927C + TagUserComment uint16 = 0x9286 TagSubIFDs uint16 = 0x014A TagJPEGInterchange uint16 = 0x0201 TagJPEGInterLength uint16 = 0x0202