Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion api_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ func TestIntegration_CR2(t *testing.T) {
{Name: "Flash", Value: "Off, Did not fire"},
{Name: "FocalLength", Value: "85/1"},
// MakerNote is now parsed into Canon directory
{Name: "UserComment", Type: "[]byte"}, // Binary data - check presence only
{Name: "UserComment", Type: "string"}, // Decoded comment text (Bug #19 fix)
{Name: "FlashpixVersion", Value: "0100"},
{Name: "ColorSpace", Value: "sRGB"},
{Name: "PixelXDimension", Value: uint16(4992)},
Expand Down
168 changes: 168 additions & 0 deletions internal/parser/tiff/ifd.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,14 @@ func (p *Parser) parseTag(r *imxbin.Reader, entry *IFDEntry, dirName string) (*p
value = decoded
}

// Decode UserComment (EXIF tag 0x9286)
// Format: 8-byte charset prefix + variable-length text
if entry.Tag == TagUserComment {
if data, ok := value.([]byte); ok {
value = decodeUserComment(data)
}
}

return &parser.Tag{
ID: parser.TagID(fmt.Sprintf("%s:0x%04X", dirName, entry.Tag)),
Name: tagName,
Expand Down Expand Up @@ -664,3 +672,163 @@ func getTagNameForDir(tag uint16, dirName string) string {
return getTIFFTagName(tag) // Default to TIFF tags
}
}

// UserComment charset prefixes (8 bytes each)
var (
userCommentASCII = []byte("ASCII\x00\x00\x00")
userCommentUnicode = []byte("UNICODE\x00")
userCommentJIS = []byte("JIS\x00\x00\x00\x00\x00")
userCommentUndefined = []byte("\x00\x00\x00\x00\x00\x00\x00\x00")
)

// decodeUserComment decodes EXIF UserComment tag (0x9286).
// Format: 8-byte charset prefix + variable-length text.
//
// Charset prefixes:
// - "ASCII\x00\x00\x00" (8 bytes) - ASCII text
// - "UNICODE\x00" (8 bytes) - UTF-16 text (big or little endian)
// - "JIS\x00\x00\x00\x00\x00" (8 bytes) - JIS encoding
// - "\x00\x00\x00\x00\x00\x00\x00\x00" (8 bytes) - undefined (often UTF-8)
//
// Returns decoded string or original bytes if decoding fails.
func decodeUserComment(data []byte) interface{} {
const prefixLen = 8

// Need at least the charset prefix
if len(data) < prefixLen {
return data
}

prefix := data[:prefixLen]
content := data[prefixLen:]

// Empty content
if len(content) == 0 {
return ""
}

switch {
case bytes.Equal(prefix, userCommentASCII):
// Trim trailing nulls for ASCII
return string(bytes.TrimRight(content, "\x00"))

case bytes.Equal(prefix, userCommentUnicode):
// UTF-16 - don't trim nulls before decoding (nulls are part of UTF-16)
// decodeUTF16 handles null termination internally
return decodeUTF16(content)

case bytes.Equal(prefix, userCommentJIS):
// JIS encoding - return as-is for now (rarely used)
// Full JIS decoding would require additional dependencies
return string(bytes.TrimRight(content, "\x00"))

case bytes.Equal(prefix, userCommentUndefined):
// Undefined charset - often UTF-8 in practice
return string(bytes.TrimRight(content, "\x00"))

default:
// Unknown prefix - check if it looks like text without prefix
// Some cameras omit the charset prefix entirely
if isValidUTF8(data) {
return string(bytes.TrimRight(data, "\x00"))
}
return data
}
}

// decodeUTF16 decodes UTF-16 encoded data to a UTF-8 string.
// Detects byte order from BOM if present, otherwise assumes little-endian.
func decodeUTF16(data []byte) string {
if len(data) < 2 {
return ""
}

// Check for BOM
var littleEndian bool
start := 0
if data[0] == 0xFF && data[1] == 0xFE {
littleEndian = true
start = 2
} else if data[0] == 0xFE && data[1] == 0xFF {
littleEndian = false
start = 2
} else {
// No BOM - assume little-endian (most common on Windows)
littleEndian = true
}

// Decode UTF-16 to UTF-8
data = data[start:]
if len(data)%2 != 0 {
data = data[:len(data)-1] // Truncate odd byte
}

var result []rune
for i := 0; i < len(data); i += 2 {
var r uint16
if littleEndian {
r = uint16(data[i]) | uint16(data[i+1])<<8
} else {
r = uint16(data[i])<<8 | uint16(data[i+1])
}

// Handle surrogate pairs for characters outside BMP
if r >= 0xD800 && r <= 0xDBFF && i+2 < len(data) {
var r2 uint16
if littleEndian {
r2 = uint16(data[i+2]) | uint16(data[i+3])<<8
} else {
r2 = uint16(data[i+2])<<8 | uint16(data[i+3])
}
if r2 >= 0xDC00 && r2 <= 0xDFFF {
// Valid surrogate pair
codePoint := 0x10000 + (int(r-0xD800) << 10) + int(r2-0xDC00)
result = append(result, rune(codePoint))
i += 2
continue
}
}

if r == 0 {
break // Null terminator
}
result = append(result, rune(r))
}

return string(result)
}

// isValidUTF8 checks if data is valid UTF-8 text (printable or whitespace).
func isValidUTF8(data []byte) bool {
if len(data) == 0 {
return false
}

// Trim trailing nulls first - they're common in fixed-size EXIF fields
trimmed := bytes.TrimRight(data, "\x00")
if len(trimmed) == 0 {
return false
}

// Quick check: count embedded nulls (after trimming trailing ones)
nullCount := 0
for _, b := range trimmed {
if b == 0 {
nullCount++
}
}
// If more than 20% embedded nulls, probably binary
if nullCount > len(trimmed)/5 {
return false
}

// Check if valid UTF-8
s := string(trimmed)
for _, r := range s {
// Allow printable characters and common whitespace
if r == '\uFFFD' { // Unicode replacement character = invalid UTF-8
return false
}
}
return true
}
200 changes: 200 additions & 0 deletions internal/parser/tiff/ifd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1421,3 +1421,203 @@ func TestGetTagNameForDir(t *testing.T) {
}
}
}

func TestDecodeUserComment(t *testing.T) {
tests := []struct {
name string
data []byte
want interface{}
wantStr bool // true if want is expected to be a string
}{
{
name: "ASCII encoding",
data: append([]byte("ASCII\x00\x00\x00"), []byte("Hello World")...),
want: "Hello World",
wantStr: true,
},
{
name: "ASCII with trailing nulls",
data: append([]byte("ASCII\x00\x00\x00"), []byte("Hello\x00\x00\x00")...),
want: "Hello",
wantStr: true,
},
{
name: "undefined encoding (UTF-8)",
data: append([]byte("\x00\x00\x00\x00\x00\x00\x00\x00"), []byte("Test comment")...),
want: "Test comment",
wantStr: true,
},
{
name: "empty content after prefix",
data: []byte("ASCII\x00\x00\x00"),
want: "",
wantStr: true,
},
{
name: "empty content with trailing nulls",
data: append([]byte("ASCII\x00\x00\x00"), []byte("\x00\x00\x00")...),
want: "",
wantStr: true,
},
{
name: "too short - less than prefix",
data: []byte("ASCII"),
want: []byte("ASCII"),
wantStr: false,
},
{
name: "JIS encoding",
data: append([]byte("JIS\x00\x00\x00\x00\x00"), []byte("Test")...),
want: "Test",
wantStr: true,
},
{
name: "Unicode UTF-16LE with BOM",
data: append([]byte("UNICODE\x00"), append([]byte{0xFF, 0xFE}, []byte{'H', 0, 'i', 0}...)...),
want: "Hi",
wantStr: true,
},
{
name: "Unicode UTF-16BE with BOM",
data: append([]byte("UNICODE\x00"), append([]byte{0xFE, 0xFF}, []byte{0, 'H', 0, 'i'}...)...),
want: "Hi",
wantStr: true,
},
{
name: "Unicode UTF-16LE without BOM",
data: append([]byte("UNICODE\x00"), []byte{'T', 0, 'e', 0, 's', 0, 't', 0}...),
want: "Test",
wantStr: true,
},
{
name: "no prefix - valid UTF-8 text",
data: []byte("Plain text comment"),
want: "Plain text comment",
wantStr: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := decodeUserComment(tt.data)

if tt.wantStr {
gotStr, ok := got.(string)
if !ok {
t.Errorf("decodeUserComment() returned %T, want string", got)
return
}
wantStr := tt.want.(string)
if gotStr != wantStr {
t.Errorf("decodeUserComment() = %q, want %q", gotStr, wantStr)
}
} else {
gotBytes, ok := got.([]byte)
if !ok {
t.Errorf("decodeUserComment() returned %T, want []byte", got)
return
}
wantBytes := tt.want.([]byte)
if !bytes.Equal(gotBytes, wantBytes) {
t.Errorf("decodeUserComment() = %v, want %v", gotBytes, wantBytes)
}
}
})
}
}

func TestDecodeUTF16(t *testing.T) {
tests := []struct {
name string
data []byte
want string
}{
{
name: "empty",
data: []byte{},
want: "",
},
{
name: "single byte",
data: []byte{0x41},
want: "",
},
{
name: "simple ASCII as UTF-16LE",
data: []byte{'A', 0, 'B', 0, 'C', 0},
want: "ABC",
},
{
name: "with LE BOM",
data: []byte{0xFF, 0xFE, 'H', 0, 'i', 0},
want: "Hi",
},
{
name: "with BE BOM",
data: []byte{0xFE, 0xFF, 0, 'H', 0, 'i'},
want: "Hi",
},
{
name: "null terminated",
data: []byte{'A', 0, 0, 0},
want: "A",
},
{
name: "odd length truncated",
data: []byte{'A', 0, 'B'},
want: "A",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := decodeUTF16(tt.data)
if got != tt.want {
t.Errorf("decodeUTF16() = %q, want %q", got, tt.want)
}
})
}
}

func TestIsValidUTF8(t *testing.T) {
tests := []struct {
name string
data []byte
want bool
}{
{
name: "empty",
data: []byte{},
want: false,
},
{
name: "valid ASCII",
data: []byte("Hello World"),
want: true,
},
{
name: "valid UTF-8",
data: []byte("Héllo Wörld"),
want: true,
},
{
name: "too many nulls",
data: []byte{0, 0, 0, 0, 'A'},
want: false,
},
{
name: "trailing nulls ok",
data: []byte("Test\x00\x00"),
want: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := isValidUTF8(tt.data)
if got != tt.want {
t.Errorf("isValidUTF8() = %v, want %v", got, tt.want)
}
})
}
}
1 change: 1 addition & 0 deletions internal/parser/tiff/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ const (
TagIPTC uint16 = 0x83BB
TagXMP uint16 = 0x02BC // XMLPacket (decimal 700)
TagMakerNote uint16 = 0x927C
TagUserComment uint16 = 0x9286
TagSubIFDs uint16 = 0x014A
TagJPEGInterchange uint16 = 0x0201
TagJPEGInterLength uint16 = 0x0202
Expand Down
Loading