diff --git a/javascript/diff_match_patch_uncompressed.js b/javascript/diff_match_patch_uncompressed.js index 88a702c..7a80e06 100644 --- a/javascript/diff_match_patch_uncompressed.js +++ b/javascript/diff_match_patch_uncompressed.js @@ -88,6 +88,15 @@ diff_match_patch.Diff.prototype.toString = function() { return this[0] + ',' + this[1]; }; +diff_match_patch.prototype.isHighSurrogate = function(c) { + var v = c.charCodeAt(0); + return v >= 0xD800 && v <= 0xDBFF; +} + +diff_match_patch.prototype.isLowSurrogate = function(c) { + var v = c.charCodeAt(0); + return v >= 0xDC00 && v <= 0xDFFF; +} /** * Find the differences between two texts. Simplifies the problem by stripping @@ -134,12 +143,18 @@ diff_match_patch.prototype.diff_main = function(text1, text2, opt_checklines, // Trim off common prefix (speedup). var commonlength = this.diff_commonPrefix(text1, text2); + if (commonlength > 0 && this.isHighSurrogate(text1[commonlength - 1])) { + commonlength--; + } var commonprefix = text1.substring(0, commonlength); text1 = text1.substring(commonlength); text2 = text2.substring(commonlength); // Trim off common suffix (speedup). commonlength = this.diff_commonSuffix(text1, text2); + if (commonlength > 0 && this.isLowSurrogate(text1[text1.length - commonlength])) { + commonlength--; + } var commonsuffix = text1.substring(text1.length - commonlength); text1 = text1.substring(0, text1.length - commonlength); text2 = text2.substring(0, text2.length - commonlength); @@ -187,13 +202,23 @@ diff_match_patch.prototype.diff_compute_ = function(text1, text2, checklines, var longtext = text1.length > text2.length ? text1 : text2; var shorttext = text1.length > text2.length ? text2 : text1; + var shortlength = shorttext.length; var i = longtext.indexOf(shorttext); if (i != -1) { + // skip leading unpaired surrogate + if (this.isLowSurrogate(longtext[i])) { + shortlength--; + i++; + } + // skip trailing unpaired surrogate + if (this.isHighSurrogate(longtext[i + shortlength])) { + shortlength--; + } // Shorter text is inside the longer text (speedup). diffs = [new diff_match_patch.Diff(DIFF_INSERT, longtext.substring(0, i)), new diff_match_patch.Diff(DIFF_EQUAL, shorttext), new diff_match_patch.Diff(DIFF_INSERT, - longtext.substring(i + shorttext.length))]; + longtext.substring(i + shortlength))]; // Swap insertions for deletions if diff is reversed. if (text1.length > text2.length) { diffs[0][0] = diffs[2][0] = DIFF_DELETE; @@ -439,6 +464,15 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) { */ diff_match_patch.prototype.diff_bisectSplit_ = function(text1, text2, x, y, deadline) { + // backup if we split a surrogate + if ( + x > 0 && x < text1.length && this.isLowSurrogate(text1[x]) && + y > 0 && y < text2.length && this.isLowSurrogate(text2[y]) + ) { + x--; + y--; + } + var text1a = text1.substring(0, x); var text2a = text2.substring(0, y); var text1b = text1.substring(x); @@ -569,6 +603,12 @@ diff_match_patch.prototype.diff_commonPrefix = function(text1, text2) { } pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin); } + + // shorten the prefix if it splits a surrogate + if (pointermid > 0 && this.isHighSurrogate(text1[pointermid-1])) { + pointermid--; + } + return pointermid; }; @@ -601,6 +641,12 @@ diff_match_patch.prototype.diff_commonSuffix = function(text1, text2) { } pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin); } + + // shorten the suffix if it splits a surrogate + if (pointermid < length - 1 && this.isLowSurrogate(text1[pointermid])) { + pointermid++; + } + return pointermid; }; @@ -749,6 +795,24 @@ diff_match_patch.prototype.diff_halfMatch_ = function(text1, text2) { text1_b = hm[3]; } var mid_common = hm[4]; + + // move forward to prevent splitting a surrogate pair + if (mid_common.length > 0 && this.isLowSurrogate(mid_common[0])) { + text1_a = text1_a + mid_common[0]; + text2_a = text2_a + mid_common[0]; + mid_common = mid_common.substring(1); + } + + // back up to prevent splitting a surrogate pair + if ( + text1_b.length > 0 && this.isLowSurrogate(text1_b[0]) && + text2_b.length > 0 && this.isLowSurrogate(text2_b[0]) + ) { + text1_b = mid_common[mid_common.length - 1] + text1_b; + text2_b = mid_common[mid_common.length - 1] + text2_b; + mid_common = mid_common.substring(0, -1); + } + return [text1_a, text1_b, text2_a, text2_b, mid_common]; }; diff --git a/javascript/tests/diff_match_patch_test.js b/javascript/tests/diff_match_patch_test.js index 109e56a..0ce4e52 100644 --- a/javascript/tests/diff_match_patch_test.js +++ b/javascript/tests/diff_match_patch_test.js @@ -492,6 +492,83 @@ function testDiffDelta() { // Convert delta string into a diff. assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta)); + (function(){ + const originalText = `U+1F17x πŸ…°οΈ πŸ…±οΈ πŸ…ΎοΈ πŸ…ΏοΈ safhawifhkw + U+1F18x πŸ†Ž + 0 1 2 3 4 5 6 7 8 9 A B C D E F + U+1F19x πŸ†‘ πŸ†’ πŸ†“ πŸ†” πŸ†• πŸ†– πŸ†— πŸ†˜ πŸ†™ πŸ†š + U+1F20x 🈁 πŸˆ‚οΈ sfss.,_||saavvvbbds + U+1F21x 🈚 + U+1F22x 🈯 + U+1F23x 🈲 🈳 🈴 🈡 🈢 🈷️ 🈸 🈹 🈺 + U+1F25x πŸ‰ πŸ‰‘ + U+1F30x πŸŒ€ 🌁 πŸŒ‚ πŸŒƒ πŸŒ„ πŸŒ… πŸŒ† πŸŒ‡ 🌈 πŸŒ‰ 🌊 πŸŒ‹ 🌌 🌍 🌎 🌏 + U+1F31x 🌐 πŸŒ‘ πŸŒ’ πŸŒ“ πŸŒ” πŸŒ• πŸŒ– πŸŒ— 🌘 πŸŒ™ 🌚 πŸŒ› 🌜 🌝 🌞 `; + + // applies some random edits to string and returns new, edited string + function applyRandomTextEdit(text) { + let textArr = [...text]; + let r = Math.random(); + if(r < 1/3) { // swap + let swapCount = Math.floor(Math.random()*5); + for(let i = 0; i < swapCount; i++) { + let swapPos1 = Math.floor(Math.random()*textArr.length); + let swapPos2 = Math.floor(Math.random()*textArr.length); + let char1 = textArr[swapPos1]; + let char2 = textArr[swapPos2]; + textArr[swapPos1] = char2; + textArr[swapPos2] = char1; + } + } else if(r < 2/3) { // remove + let removeCount = Math.floor(Math.random()*5); + for(let i = 0; i < removeCount; i++) { + let removePos = Math.floor(Math.random()*textArr.length); + textArr[removePos] = ""; + } + } else { // add + let addCount = Math.floor(Math.random()*5); + for(let i = 0; i < addCount; i++) { + let addPos = Math.floor(Math.random()*textArr.length); + let addFromPos = Math.floor(Math.random()*textArr.length); + textArr[addPos] = textArr[addPos] + textArr[addFromPos]; + } + } + return textArr.join(""); + } + + for(let i = 0; i < 1000; i++) { + newText = applyRandomTextEdit(originalText); + dmp.diff_toDelta(dmp.diff_main(originalText, newText)); + } + })(); + + // Unicode - splitting surrogates + assertEquivalent( + '+%F0%9F%85%B1\t=4', + dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71')) + ); + + assertEquivalent( + '-2\t=4', + dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71')) + ); + + assertEquivalent( + '=2\t-2\t=2', + dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71')) + ); + + // Different versions of the library may have created deltas with + // half of a surrogate pair encoded as if it were valid UTF-8 + try { + assertEquivalent( + dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')), + dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1')) + ); + } catch ( e ) { + assertEquals('Decode UTF8-encoded surrogate half', 'crashed'); + } + // Verify pool of unchanged characters. diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']]; var text2 = dmp.diff_text2(diffs); @@ -963,4 +1040,4 @@ function testPatchApply() { patches = dmp.patch_make('y', 'y123'); results = dmp.patch_apply(patches, 'x'); assertEquivalent(['x123', [true]], results); -} +} \ No newline at end of file