diff --git a/cmp/compare_test.go b/cmp/compare_test.go index 88b7d45..975e4be 100644 --- a/cmp/compare_test.go +++ b/cmp/compare_test.go @@ -1320,6 +1320,11 @@ using the AllowUnexported option.`, "\n"), x: "d5c14bdf6bac81c27afc5429500ed750\n25483503b557c606dad4f144d27ae10b\n90bdbcdbb6ea7156068e3dcfb7459244\n978f480a6e3cced51e297fbff9a506b7\n", y: "Xd5c14bdf6bac81c27afc5429500ed750\nX25483503b557c606dad4f144d27ae10b\nX90bdbcdbb6ea7156068e3dcfb7459244\nX978f480a6e3cced51e297fbff9a506b7\n", reason: "all lines are different, so diffing based on lines is pointless", + }, { + label: label + "/JapaneseUTF8", + x: "プライベート ブランド ジャケット", + y: "プライベート ブランド シャツ", + reason: "multi-byte UTF-8 characters should be readable in diff output (issue #314)", }, { label: label + "/StringifiedBytes", x: struct{ X []byte }{[]byte("hello, world!")}, diff --git a/cmp/report_slices.go b/cmp/report_slices.go index 23e444f..349fca2 100644 --- a/cmp/report_slices.go +++ b/cmp/report_slices.go @@ -99,7 +99,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { // Auto-detect the type of the data. var sx, sy string var ssx, ssy []string - var isString, isMostlyText, isPureLinedText, isBinary bool + var isString, isMostlyText, isPureLinedText, isBinary, isValidUTF8 bool switch { case t.Kind() == reflect.String: sx, sy = vx.String(), vy.String() @@ -133,6 +133,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { isMostlyText = float64(numValidRunes) > math.Floor(0.90*float64(numTotalRunes)) isPureLinedText = isPureText && numLines >= 4 && maxLineLen <= 1024 isBinary = !isMostlyText + isValidUTF8 = utf8.ValidString(sx) && utf8.ValidString(sy) // Avoid diffing by lines if it produces a significantly more complex // edit script than diffing by bytes. @@ -251,13 +252,32 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { // then perform differencing in approximately fixed-sized chunks. // The output is printed as quoted strings. case isMostlyText: - list = opts.formatDiffSlice( - reflect.ValueOf(sx), reflect.ValueOf(sy), 64, "byte", - func(v reflect.Value, d diffMode) textRecord { - s := formatString(v.String()) - return textRecord{Diff: d, Value: textLine(s)} - }, - ) + if isValidUTF8 { + // Rune-based diffing for valid UTF-8 strings + rx := []rune(sx) + ry := []rune(sy) + list = opts.formatDiffSlice( + reflect.ValueOf(rx), reflect.ValueOf(ry), 64, "rune", + func(v reflect.Value, d diffMode) textRecord { + // Convert []rune chunk back to string for display + runes := make([]rune, v.Len()) + for i := 0; i < v.Len(); i++ { + runes[i] = rune(v.Index(i).Int()) + } + s := formatString(string(runes)) + return textRecord{Diff: d, Value: textLine(s)} + }, + ) + } else { + // Byte-based diffing for invalid UTF-8 (original behavior) + list = opts.formatDiffSlice( + reflect.ValueOf(sx), reflect.ValueOf(sy), 64, "byte", + func(v reflect.Value, d diffMode) textRecord { + s := formatString(v.String()) + return textRecord{Diff: d, Value: textLine(s)} + }, + ) + } // If the text appears to be binary data, // then perform differencing in approximately fixed-sized chunks. diff --git a/cmp/testdata/diffs b/cmp/testdata/diffs index be77b95..7fda3ef 100644 --- a/cmp/testdata/diffs +++ b/cmp/testdata/diffs @@ -515,7 +515,7 @@ + "21 2nd Street", `","city":"New York","state":"NY","postalCode":"10021-3100"},"pho`, `neNumbers":[{"type":"home","number":"212 555-1234"},{"type":"off`, - ... // 101 identical bytes + ... // 101 identical runes }, ""), BytesB: nil, BytesC: nil, @@ -1019,7 +1019,7 @@ <<< TestDiff/Reporter/LargeStringInInterface struct{ X any }{ X: strings.Join({ - ... // 485 identical bytes + ... // 485 identical runes "s mus. Pellentesque mi lorem, consectetur id porttitor id, solli", "citudin sit amet enim. Duis eu dolor magna. Nunc ut augue turpis", - ".", @@ -1030,7 +1030,7 @@ <<< TestDiff/Reporter/LargeBytesInInterface struct{ X any }{ X: bytes.Join({ - ... // 485 identical bytes + ... // 485 identical runes "s mus. Pellentesque mi lorem, consectetur id porttitor id, solli", "citudin sit amet enim. Duis eu dolor magna. Nunc ut augue turpis", - ".", @@ -1098,6 +1098,13 @@ "978f480a6e3cced51e297fbff9a506b7\n", }, "") >>> TestDiff/Reporter/AllLinesDiffer +<<< TestDiff/Reporter/JapaneseUTF8 + strings.Join({ + "プライベート ブランド ", +- "ジャケット", ++ "シャツ", + }, "") +>>> TestDiff/Reporter/JapaneseUTF8 <<< TestDiff/Reporter/StringifiedBytes struct{ X []uint8 }{ - X: []uint8("hello, world!"),