From 0e3c57c64a09acc029d44ee72c559f278c4befa8 Mon Sep 17 00:00:00 2001
From: Wilfred Hughes <me@wilfred.me.uk>
Date: Sun, 15 Jan 2023 11:38:02 -0800
Subject: [PATCH] Skip unique items before computing Myer's diff on text

This substantially improves performance on text files where there are
few lines in common.

For example, 10,000 line files with no lines in common is more than 10x
faster (8.5 seconds to 0.49 seconds on my machine), and
sample_files/huge_cpp_before.cpp is nearly 2% faster.

Fixes the case mentioned by @quackenbush in #236.

This is inspired by the heuristics discussions at
https://github.com/mitsuhiko/similar/issues/15
---
 CHANGELOG.md                  |   5 ++
 sample_files/compare.expected |   2 +-
 src/diff/myers_diff.rs        | 136 ++++++++++++++++++++++++++++++++--
 src/line_parser.rs            |   4 +-
 src/parse/syntax.rs           |   2 +-
 5 files changed, 140 insertions(+), 9 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a52487ed3..140f66d571 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,11 @@
 
 Improved CSS parsing and HTML sublanguage parsing.
 
+### Diffing
+
+Improved textual diffing performance, particularly when the two files
+have few lines in common.
+
 ### Display
 
 Fixed an issue with unwanted underlines with textual diffing when
diff --git a/sample_files/compare.expected b/sample_files/compare.expected
index 60a244ecfd..720d8e62b1 100644
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@@ -71,7 +71,7 @@ sample_files/html_simple_before.html sample_files/html_simple_after.html
 ce3bfa12bc21d0eb5528766e18387e86  -
 
 sample_files/huge_cpp_before.cpp sample_files/huge_cpp_after.cpp
-8910dbf7dae13b1a7229b0497602b414  -
+a85613f8c3cad686d592a276cad8d883  -
 
 sample_files/identical_before.scala sample_files/identical_after.scala
 9c7319f61833e46a0a8cb6c01cc997c9  -
diff --git a/src/diff/myers_diff.rs b/src/diff/myers_diff.rs
index 5ada0f2b24..9c07607f3a 100644
--- a/src/diff/myers_diff.rs
+++ b/src/diff/myers_diff.rs
@@ -1,6 +1,6 @@
 //! A fast diff for linear content, using Myer's diff algorithm.
 
-use rustc_hash::FxHashMap;
+use rustc_hash::{FxHashMap, FxHashSet};
 use std::hash::Hash;
 
 #[derive(Debug, PartialEq)]
@@ -10,6 +10,8 @@ pub enum DiffResult<T> {
     Right(T),
 }
 
+/// Compute a linear diff between `lhs` and `rhs`. This is the
+/// traditional Myer's diff algorithm.
 pub fn slice<'a, T: PartialEq + Clone>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
     wu_diff::diff(lhs, rhs)
         .into_iter()
@@ -25,12 +27,18 @@ pub fn slice<'a, T: PartialEq + Clone>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffRe
         .collect::<Vec<_>>()
 }
 
-/// Compute a unique numeric value for each item, use that for
-/// diffing, then return diff results in terms of the original type.
+/// Compute a linear diff between `lhs` and `rhs`, but use hashed
+/// values internally.
 ///
-/// This is the decorate-sort-undecorate pattern, or Schwartzian
-/// transform, for diffing.
+/// This is faster when equality checks on `T` are expensive, such as
+/// large strings.
 pub fn slice_by_hash<'a, T: Eq + Hash>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
+    // Compute a unique numeric value for each item, use that for
+    // diffing, then return diff results in terms of the original
+    // type.
+    //
+    // This is the decorate-sort-undecorate pattern, or Schwartzian
+    // transform, for diffing.
     let mut value_ids: FxHashMap<&T, u32> = FxHashMap::default();
     let mut id_values: FxHashMap<u32, &T> = FxHashMap::default();
 
@@ -75,6 +83,101 @@ pub fn slice_by_hash<'a, T: Eq + Hash>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffRe
         .collect::<Vec<_>>()
 }
 
+/// Compute the linear diff between `lhs` and `rhs`. If there are
+/// items that only occur on a single side, mark them as novel without
+/// processing them with Myer's diff.
+///
+/// This is substantially faster than `slice`, when `lhs` and `rhs`
+/// have few items in common.
+///
+/// (This heuristic is used in traditional diff tools too, such as GNU
+/// diff.)
+pub fn slice_unique_by_hash<'a, T: Eq + Clone + Hash>(
+    lhs: &'a [T],
+    rhs: &'a [T],
+) -> Vec<DiffResult<&'a T>> {
+    let mut lhs_set = FxHashSet::default();
+    for item in lhs {
+        lhs_set.insert(item);
+    }
+    let mut rhs_set = FxHashSet::default();
+    for item in rhs {
+        rhs_set.insert(item);
+    }
+
+    let lhs_without_unique: Vec<&'a T> = lhs.iter().filter(|n| rhs_set.contains(n)).collect();
+    let rhs_without_unique: Vec<&'a T> = rhs.iter().filter(|n| lhs_set.contains(n)).collect();
+
+    let mut res: Vec<DiffResult<&'a T>> = Vec::with_capacity(lhs.len());
+    let mut lhs_i = 0;
+    let mut rhs_i = 0;
+
+    for item in slice_by_hash(&lhs_without_unique, &rhs_without_unique) {
+        match item {
+            DiffResult::Left(lhs_item) => {
+                while lhs_i < lhs.len() {
+                    if &lhs[lhs_i] != *lhs_item {
+                        res.push(DiffResult::Left(&lhs[lhs_i]));
+                        lhs_i += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                res.push(DiffResult::Left(*lhs_item));
+                lhs_i += 1;
+            }
+            DiffResult::Both(lhs_item, rhs_item) => {
+                while lhs_i < lhs.len() {
+                    if &lhs[lhs_i] != *lhs_item {
+                        res.push(DiffResult::Left(&lhs[lhs_i]));
+                        lhs_i += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                while rhs_i < rhs.len() {
+                    if &rhs[rhs_i] != *rhs_item {
+                        res.push(DiffResult::Right(&rhs[rhs_i]));
+                        rhs_i += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                res.push(DiffResult::Both(*lhs_item, *rhs_item));
+                lhs_i += 1;
+                rhs_i += 1;
+            }
+            DiffResult::Right(rhs_item) => {
+                while rhs_i < rhs.len() {
+                    if &rhs[rhs_i] != *rhs_item {
+                        res.push(DiffResult::Right(&rhs[rhs_i]));
+                        rhs_i += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                res.push(DiffResult::Right(*rhs_item));
+                rhs_i += 1;
+            }
+        }
+    }
+
+    while lhs_i < lhs.len() {
+        res.push(DiffResult::Left(&lhs[lhs_i]));
+        lhs_i += 1;
+    }
+    while rhs_i < rhs.len() {
+        res.push(DiffResult::Right(&rhs[rhs_i]));
+        rhs_i += 1;
+    }
+
+    res
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -124,4 +227,27 @@ mod tests {
             ]
         );
     }
+
+    #[test]
+    fn test_slice_unique_same_items() {
+        let diff_items = slice_unique_by_hash(&["a", "b"], &["a", "b"]);
+        assert_eq!(
+            diff_items,
+            vec![DiffResult::Both(&"a", &"a"), DiffResult::Both(&"b", &"b")]
+        );
+    }
+
+    #[test]
+    fn test_slice_unique_different_items() {
+        let diff_items = slice_unique_by_hash(&["a", "b"], &["c", "d"]);
+        assert_eq!(
+            diff_items,
+            vec![
+                DiffResult::Left(&"a"),
+                DiffResult::Left(&"b"),
+                DiffResult::Right(&"c"),
+                DiffResult::Right(&"d"),
+            ]
+        );
+    }
 }
diff --git a/src/line_parser.rs b/src/line_parser.rs
index 9eb6785fe5..dbfcaf68fc 100644
--- a/src/line_parser.rs
+++ b/src/line_parser.rs
@@ -78,7 +78,7 @@ fn changed_parts<'a>(
     let opposite_src_lines = split_lines_keep_newline(opposite_src);
 
     let mut res: Vec<(TextChangeKind, Vec<&'a str>, Vec<&'a str>)> = vec![];
-    for diff_res in myers_diff::slice_by_hash(&src_lines, &opposite_src_lines) {
+    for diff_res in myers_diff::slice_unique_by_hash(&src_lines, &opposite_src_lines) {
         match diff_res {
             myers_diff::DiffResult::Left(line) => {
                 res.push((TextChangeKind::Novel, vec![line], vec![]));
@@ -141,7 +141,7 @@ pub fn change_positions(lhs_src: &str, rhs_src: &str) -> Vec<MatchedPos> {
                 let lhs_part = lhs_lines.join("");
                 let rhs_part = rhs_lines.join("");
 
-                for diff_res in myers_diff::slice(&split_words(&lhs_part), &split_words(&rhs_part))
+                for diff_res in myers_diff::slice_unique_by_hash(&split_words(&lhs_part), &split_words(&rhs_part))
                 {
                     match diff_res {
                         myers_diff::DiffResult::Left(lhs_word) => {
diff --git a/src/parse/syntax.rs b/src/parse/syntax.rs
index 721f7d3e2d..2d2c40230d 100644
--- a/src/parse/syntax.rs
+++ b/src/parse/syntax.rs
@@ -674,7 +674,7 @@ fn split_comment_words(
     let mut opposite_offset = 0;
 
     let mut res = vec![];
-    for diff_res in myers_diff::slice(&content_parts, &other_parts) {
+    for diff_res in myers_diff::slice_by_hash(&content_parts, &other_parts) {
         match diff_res {
             myers_diff::DiffResult::Left(word) => {
                 // This word is novel to this side.