Skip to content

Commit

Permalink
Prevent splitting surrogate pairs when diffing
Browse files Browse the repository at this point in the history
See google#69

In this patch I'm trying to follow the approach taken by @judofyr
but starting from the top of the script and going through, auditing
every place that perform string operations that split, index, or
otherwise operate on a character level so that we can make sure that
we don't split surrogate pairs.

This contrasts with [attempt one] where I created a custom iterator
for strings. Surprisingly I found this more "ad-hoc" approach easier
to manage since it doesn't create a split universe of string/Unicode.

As of this commit I haven't audited the cleanup functions but my own
tests are passing so I'm given to believe that they might be safe.
I have my own doubts that this is sound work and that the middle-snake
algorithm might find the wrong snake when presented with variable-width
characters.
  • Loading branch information
dmsnell committed Apr 2, 2020
1 parent 62f2e68 commit 550130a
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 2 deletions.
86 changes: 85 additions & 1 deletion javascript/diff_match_patch_uncompressed.js
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,35 @@ diff_match_patch.Diff.prototype.toString = function() {
return this[0] + ',' + this[1];
};

diff_match_patch.prototype.isHighSurrogate = function(c) {
var v = c.charCodeAt(0);
return v >= 0xD800 && v <= 0xDBFF;
}

diff_match_patch.prototype.isLowSurrogate = function(c) {
var v = c.charCodeAt(0);
return v >= 0xDC00 && v <= 0xDFFF;
}

diff_match_patch.prototype.scalarValues = function(str) {
var length = str.length;
var scalars = [];

for (var i = 0; i < length; i++) {
var scalar = str[i];

// proper surrogate pairs will come through as the whole scalar value
// but if the pairs are broken they will be passed-through unaltered
if (i < length - 1 && this.isHighSurrogate(scalar) && this.isLowSurrogate(str[i+1])) {
scalar += str[i+1];
i++;
}

scalars.push(scalar);
}

return scalars;
}

/**
* Find the differences between two texts. Simplifies the problem by stripping
Expand Down Expand Up @@ -134,12 +163,18 @@ diff_match_patch.prototype.diff_main = function(text1, text2, opt_checklines,

// Trim off common prefix (speedup).
var commonlength = this.diff_commonPrefix(text1, text2);
if (commonlength > 0 && this.isHighSurrogate(text1[commonlength - 1])) {
commonlength--;
}
var commonprefix = text1.substring(0, commonlength);
text1 = text1.substring(commonlength);
text2 = text2.substring(commonlength);

// Trim off common suffix (speedup).
commonlength = this.diff_commonSuffix(text1, text2);
if (commonlength > 0 && this.isLowSurrogate(text1[text1.length - commonlength])) {
commonlength--;
}
var commonsuffix = text1.substring(text1.length - commonlength);
text1 = text1.substring(0, text1.length - commonlength);
text2 = text2.substring(0, text2.length - commonlength);
Expand Down Expand Up @@ -187,13 +222,23 @@ diff_match_patch.prototype.diff_compute_ = function(text1, text2, checklines,

var longtext = text1.length > text2.length ? text1 : text2;
var shorttext = text1.length > text2.length ? text2 : text1;
var shortlength = shorttext.length;
var i = longtext.indexOf(shorttext);
if (i != -1) {
// skip leading unpaired surrogate
if (this.isLowSurrogate(longtext[i])) {
shortlength--;
i++;
}
// skip trailing unpaired surrogate
if (this.isHighSurrogate(longtext[i + shortlength])) {
shortlength--;
}
// Shorter text is inside the longer text (speedup).
diffs = [new diff_match_patch.Diff(DIFF_INSERT, longtext.substring(0, i)),
new diff_match_patch.Diff(DIFF_EQUAL, shorttext),
new diff_match_patch.Diff(DIFF_INSERT,
longtext.substring(i + shorttext.length))];
longtext.substring(i + shortlength))];
// Swap insertions for deletions if diff is reversed.
if (text1.length > text2.length) {
diffs[0][0] = diffs[2][0] = DIFF_DELETE;
Expand Down Expand Up @@ -439,6 +484,15 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
*/
diff_match_patch.prototype.diff_bisectSplit_ = function(text1, text2, x, y,
deadline) {
// backup if we split a surrogate
if (
x > 0 && x < text1.length && this.isLowSurrogate(text1[x]) &&
y > 0 && y < text2.length && this.isLowSurrogate(text2[y])
) {
x--;
y--;
}

var text1a = text1.substring(0, x);
var text2a = text2.substring(0, y);
var text1b = text1.substring(x);
Expand Down Expand Up @@ -569,6 +623,12 @@ diff_match_patch.prototype.diff_commonPrefix = function(text1, text2) {
}
pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin);
}

// shorten the prefix if it splits a surrogate
if (pointermid > 0 && this.isHighSurrogate(text1[pointermid-1])) {
pointermid--;
}

return pointermid;
};

Expand Down Expand Up @@ -601,6 +661,12 @@ diff_match_patch.prototype.diff_commonSuffix = function(text1, text2) {
}
pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin);
}

// shorten the suffix if it splits a surrogate
if (pointermid < length - 1 && this.isLowSurrogate(text1[pointermid])) {
pointermid++;
}

return pointermid;
};

Expand Down Expand Up @@ -749,6 +815,24 @@ diff_match_patch.prototype.diff_halfMatch_ = function(text1, text2) {
text1_b = hm[3];
}
var mid_common = hm[4];

// move forward to prevent splitting a surrogate pair
if (mid_common.length > 0 && this.isLowSurrogate(mid_common[0])) {
text1_a = text1_a + mid_common[0];
text2_a = text2_a + mid_common[0];
mid_common = mid_common.substring(1);
}

// back up to prevent splitting a surrogate pair
if (
text1_b.length > 0 && this.isLowSurrogate(text1_b[0]) &&
text2_b.length > 0 && this.isLowSurrogate(text2_b[0])
) {
text1_b = mid_common[mid_common.length - 1] + text1_b;
text2_b = mid_common[mid_common.length - 1] + text2_b;
mid_common = mid_common.substring(0, -1);
}

return [text1_a, text1_b, text2_a, text2_b, mid_common];
};

Expand Down
79 changes: 78 additions & 1 deletion javascript/tests/diff_match_patch_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,83 @@ function testDiffDelta() {
// Convert delta string into a diff.
assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));

(function(){
const originalText = `U+1F17x 🅰️ 🅱️ 🅾️ 🅿️ safhawifhkw
U+1F18x 🆎
0 1 2 3 4 5 6 7 8 9 A B C D E F
U+1F19x 🆑 🆒 🆓 🆔 🆕 🆖 🆗 🆘 🆙 🆚
U+1F20x 🈁 🈂️ sfss.,_||saavvvbbds
U+1F21x 🈚
U+1F22x 🈯
U+1F23x 🈲 🈳 🈴 🈵 🈶 🈷️ 🈸 🈹 🈺
U+1F25x 🉐 🉑
U+1F30x 🌀 🌁 🌂 🌃 🌄 🌅 🌆 🌇 🌈 🌉 🌊 🌋 🌌 🌍 🌎 🌏
U+1F31x 🌐 🌑 🌒 🌓 🌔 🌕 🌖 🌗 🌘 🌙 🌚 🌛 🌜 🌝 🌞 `;

// applies some random edits to string and returns new, edited string
function applyRandomTextEdit(text) {
let textArr = [...text];
let r = Math.random();
if(r < 1/3) { // swap
let swapCount = Math.floor(Math.random()*5);
for(let i = 0; i < swapCount; i++) {
let swapPos1 = Math.floor(Math.random()*textArr.length);
let swapPos2 = Math.floor(Math.random()*textArr.length);
let char1 = textArr[swapPos1];
let char2 = textArr[swapPos2];
textArr[swapPos1] = char2;
textArr[swapPos2] = char1;
}
} else if(r < 2/3) { // remove
let removeCount = Math.floor(Math.random()*5);
for(let i = 0; i < removeCount; i++) {
let removePos = Math.floor(Math.random()*textArr.length);
textArr[removePos] = "";
}
} else { // add
let addCount = Math.floor(Math.random()*5);
for(let i = 0; i < addCount; i++) {
let addPos = Math.floor(Math.random()*textArr.length);
let addFromPos = Math.floor(Math.random()*textArr.length);
textArr[addPos] = textArr[addPos] + textArr[addFromPos];
}
}
return textArr.join("");
}

for(let i = 0; i < 1000; i++) {
newText = applyRandomTextEdit(originalText);
dmp.diff_toDelta(dmp.diff_main(originalText, newText));
}
})();

// Unicode - splitting surrogates
assertEquivalent(
'+%F0%9F%85%B1\t=4',
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71'))
);

assertEquivalent(
'-2\t=4',
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
);

assertEquivalent(
'=2\t-2\t=2',
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
);

// Different versions of the library may have created deltas with
// half of a surrogate pair encoded as if it were valid UTF-8
try {
assertEquivalent(
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')),
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1'))
);
} catch ( e ) {
assertEquals('Decode UTF8-encoded surrogate half', 'crashed');
}

// Verify pool of unchanged characters.
diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
var text2 = dmp.diff_text2(diffs);
Expand Down Expand Up @@ -963,4 +1040,4 @@ function testPatchApply() {
patches = dmp.patch_make('y', 'y123');
results = dmp.patch_apply(patches, 'x');
assertEquivalent(['x123', [true]], results);
}
}

0 comments on commit 550130a

Please sign in to comment.