Skip to content

Commit 085e8b9

Browse files
committed
Merge pull request sergi#12 from goojba/utf8bug
Use []rune in diff internals
2 parents bf55222 + af0ba86 commit 085e8b9

File tree

2 files changed

+40
-30
lines changed

2 files changed

+40
-30
lines changed

diffmatchpatch/dmp.go

Lines changed: 29 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -366,10 +366,13 @@ func (dmp *DiffMatchPatch) diffLineMode(text1, text2 string, deadline time.Time)
366366
// and return the recursively constructed diff.
367367
// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
368368
func (dmp *DiffMatchPatch) DiffBisect(text1, text2 string, deadline time.Time) []Diff {
369+
// Convert to runes to avoid utf8 slicing bugs.
370+
runes1 := []rune(text1)
371+
runes2 := []rune(text2)
369372
// Cache the text lengths to prevent multiple calls.
370-
text1_len, text2_len := len(text1), len(text2)
373+
runes1_len, runes2_len := len(runes1), len(runes2)
371374

372-
max_d := (text1_len + text2_len + 1) / 2
375+
max_d := (runes1_len + runes2_len + 1) / 2
373376
v_offset := max_d
374377
v_length := 2 * max_d
375378

@@ -382,7 +385,7 @@ func (dmp *DiffMatchPatch) DiffBisect(text1, text2 string, deadline time.Time) [
382385
v1[v_offset+1] = 0
383386
v2[v_offset+1] = 0
384387

385-
delta := text1_len - text2_len
388+
delta := runes1_len - runes2_len
386389
// If the total number of characters is odd, then the front path will collide
387390
// with the reverse path.
388391
front := (delta%2 != 0)
@@ -410,30 +413,28 @@ func (dmp *DiffMatchPatch) DiffBisect(text1, text2 string, deadline time.Time) [
410413
}
411414

412415
y1 := x1 - k1
413-
for x1 < text1_len && y1 < text2_len {
414-
r1, size := utf8.DecodeRuneInString(text1[x1:])
415-
r2, _ := utf8.DecodeRuneInString(text2[y1:])
416-
if r1 != r2 {
416+
for x1 < runes1_len && y1 < runes2_len {
417+
if runes1[x1] != runes2[y1] {
417418
break
418419
}
419-
x1 += size
420-
y1 += size
420+
x1++
421+
y1++
421422
}
422423
v1[k1_offset] = x1
423-
if x1 > text1_len {
424+
if x1 > runes1_len {
424425
// Ran off the right of the graph.
425426
k1end += 2
426-
} else if y1 > text2_len {
427+
} else if y1 > runes2_len {
427428
// Ran off the bottom of the graph.
428429
k1start += 2
429430
} else if front {
430431
k2_offset := v_offset + delta - k1
431432
if k2_offset >= 0 && k2_offset < v_length && v2[k2_offset] != -1 {
432433
// Mirror x2 onto top-left coordinate system.
433-
x2 := text1_len - v2[k2_offset]
434+
x2 := runes1_len - v2[k2_offset]
434435
if x1 >= x2 {
435436
// Overlap detected.
436-
return dmp.diffBisectSplit_(text1, text2, x1, y1, deadline)
437+
return dmp.diffBisectSplit_(runes1, runes2, x1, y1, deadline)
437438
}
438439
}
439440
}
@@ -448,20 +449,18 @@ func (dmp *DiffMatchPatch) DiffBisect(text1, text2 string, deadline time.Time) [
448449
x2 = v2[k2_offset-1] + 1
449450
}
450451
var y2 = x2 - k2
451-
for x2 < text1_len && y2 < text2_len {
452-
r1, size := utf8.DecodeLastRuneInString(text1[:text1_len-x2])
453-
r2, _ := utf8.DecodeLastRuneInString(text2[:text2_len-y2])
454-
if r1 != r2 {
452+
for x2 < runes1_len && y2 < runes2_len {
453+
if runes1[runes1_len-x2-1] != runes2[runes2_len-y2-1] {
455454
break
456455
}
457-
x2 += size
458-
y2 += size
456+
x2++
457+
y2++
459458
}
460459
v2[k2_offset] = x2
461-
if x2 > text1_len {
460+
if x2 > runes1_len {
462461
// Ran off the left of the graph.
463462
k2end += 2
464-
} else if y2 > text2_len {
463+
} else if y2 > runes2_len {
465464
// Ran off the top of the graph.
466465
k2start += 2
467466
} else if !front {
@@ -470,10 +469,10 @@ func (dmp *DiffMatchPatch) DiffBisect(text1, text2 string, deadline time.Time) [
470469
x1 := v1[k1_offset]
471470
y1 := v_offset + x1 - k1_offset
472471
// Mirror x2 onto top-left coordinate system.
473-
x2 = text1_len - x2
472+
x2 = runes1_len - x2
474473
if x1 >= x2 {
475474
// Overlap detected.
476-
return dmp.diffBisectSplit_(text1, text2, x1, y1, deadline)
475+
return dmp.diffBisectSplit_(runes1, runes2, x1, y1, deadline)
477476
}
478477
}
479478
}
@@ -487,16 +486,16 @@ func (dmp *DiffMatchPatch) DiffBisect(text1, text2 string, deadline time.Time) [
487486
}
488487
}
489488

490-
func (dmp *DiffMatchPatch) diffBisectSplit_(text1, text2 string, x, y int,
489+
func (dmp *DiffMatchPatch) diffBisectSplit_(runes1, runes2 []rune, x, y int,
491490
deadline time.Time) []Diff {
492-
text1a := text1[:x]
493-
text2a := text2[:y]
494-
text1b := text1[x:]
495-
text2b := text2[y:]
491+
runes1a := runes1[:x]
492+
runes2a := runes2[:y]
493+
runes1b := runes1[x:]
494+
runes2b := runes2[y:]
496495

497496
// Compute both diffs serially.
498-
diffs := dmp.diffMain(text1a, text2a, false, deadline)
499-
diffsb := dmp.diffMain(text1b, text2b, false, deadline)
497+
diffs := dmp.diffMain(string(runes1a), string(runes2a), false, deadline)
498+
diffsb := dmp.diffMain(string(runes1b), string(runes2b), false, deadline)
500499

501500
return append(diffs, diffsb...)
502501
}

diffmatchpatch/dmp_test.go

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -193,6 +193,17 @@ func Test_diffHalfmatchTest(t *testing.T) {
193193
assert.True(t, dmp.DiffHalfMatch("qHilloHelloHew", "xHelloHeHulloy") == nil, "")
194194
}
195195

196+
func Test_diffBisectSplit(t *testing.T) {
197+
// As originally written, this can produce invalid utf8 strings.
198+
dmp := New()
199+
diffs := dmp.diffBisectSplit_([]rune("STUV\x05WX\x05YZ\x05["),
200+
[]rune("WĺĻļ\x05YZ\x05ĽľĿŀZ"), 7, 6, time.Now().Add(time.Hour))
201+
for _, d := range diffs {
202+
assert.True(t, utf8.ValidString(d.Text))
203+
}
204+
}
205+
206+
196207
func Test_diffLinesToChars(t *testing.T) {
197208
dmp := New()
198209
// Convert lines down to characters.

0 commit comments

Comments
 (0)