From 5916411e9e07985ab211f1e773d059451f5a34f1 Mon Sep 17 00:00:00 2001 From: You Guang Date: Wed, 14 Aug 2024 16:32:30 +0800 Subject: [PATCH] tried something --- lib/WordDiff.ml | 138 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 134 insertions(+), 4 deletions(-) diff --git a/lib/WordDiff.ml b/lib/WordDiff.ml index 600a162..25e93cf 100644 --- a/lib/WordDiff.ml +++ b/lib/WordDiff.ml @@ -81,7 +81,137 @@ let compute (block : string Block.t) : line_content Block.t = match block with | Block.Common line -> Block.Common [ Unchanged line ] | Block.Changed { mine; their; order } -> - let mine_str = String.concat " " mine in - let their_str = String.concat " " their in - let mine_words, their_words = diff_words mine_str their_str in - Block.Changed { mine = [ mine_words ]; their = [ their_words ]; order } + (* Helper function to convert a string to an array of words *) + let string_to_word_array s = + s |> String.split_on_char ' ' |> Array.of_list + in + + (* Use Levenshtein distance to find the best pairing of lines *) + let pair_lines lines1 lines2 = + let distances = + Array.make_matrix (Array.length lines1) (Array.length lines2) 0 + in + for i = 0 to Array.length lines1 - 1 do + for j = 0 to Array.length lines2 - 1 do + distances.(i).(j) <- + edit_distance ( = ) + (string_to_word_array lines1.(i)) + (string_to_word_array lines2.(j)) + done + done; + + (* Use a greedy approach to pair lines based on minimum distance *) + let paired = ref [] in + let used1 = Array.make (Array.length lines1) false in + let used2 = Array.make (Array.length lines2) false in + + for _ = 1 to min (Array.length lines1) (Array.length lines2) do + let min_dist = ref max_int in + let min_i = ref (-1) in + let min_j = ref (-1) in + + for i = 0 to Array.length lines1 - 1 do + for j = 0 to Array.length lines2 - 1 do + if + (not used1.(i)) + && (not used2.(j)) + && distances.(i).(j) < !min_dist + then ( + min_dist := distances.(i).(j); + min_i := i; + min_j := j) + done + done; + + if !min_i <> -1 && !min_j <> -1 then ( + paired := (!min_i, !min_j) :: !paired; + used1.(!min_i) <- true; + used2.(!min_j) <- true) + done; + + (* Add unpaired lines *) + let final_pairs = ref !paired in + for i = 0 to Array.length lines1 - 1 do + if not used1.(i) then final_pairs := (i, -1) :: !final_pairs + done; + for j = 0 to Array.length lines2 - 1 do + if not used2.(j) then final_pairs := (-1, j) :: !final_pairs + done; + + List.sort compare !final_pairs + in + + let mine_array = Array.of_list mine in + let their_array = Array.of_list their in + let pairs = pair_lines mine_array their_array in + + let result_mine = ref [] in + let result_their = ref [] in + + (* Helper function to trim leading/trailing whitespace and collapse multiple spaces *) + let normalize_whitespace s = + let s = String.trim s in + let buf = Buffer.create (String.length s) in + let space_seen = ref false in + String.iter + (fun c -> + match c with + | ' ' | '\t' | '\n' | '\r' -> + if not !space_seen then ( + Buffer.add_char buf ' '; + space_seen := true) + | _ -> + Buffer.add_char buf c; + space_seen := false) + s; + Buffer.contents buf + in + + List.iter + (fun (i, j) -> + match (i, j) with + | -1, j -> + let their_content = normalize_whitespace their_array.(j) in + if their_content <> "" then + result_their := + (diff_words "" their_content |> snd) :: !result_their + | i, -1 -> + let mine_content = normalize_whitespace mine_array.(i) in + if mine_content <> "" then + result_mine := + (diff_words mine_content "" |> fst) :: !result_mine + | i, j -> + let mine_content = normalize_whitespace mine_array.(i) in + let their_content = normalize_whitespace their_array.(j) in + if mine_content <> "" || their_content <> "" then ( + let mine_diff, their_diff = + diff_words mine_content their_content + in + result_mine := mine_diff :: !result_mine; + result_their := their_diff :: !result_their)) + pairs; + + (* Remove any empty lines at the beginning and end of the results *) + let trim_empty_lines lines = + let is_empty line = + List.for_all (function Unchanged "" -> true | _ -> false) line + in + let rec trim_start = function + | [] -> [] + | hd :: tl when is_empty hd -> trim_start tl + | lines -> lines + in + let rec trim_end = function + | [] -> [] + | hd :: tl -> + let trimmed_tail = trim_end tl in + if trimmed_tail = [] && is_empty hd then [] + else hd :: trimmed_tail + in + lines |> trim_start |> trim_end + in + + let trimmed_mine = trim_empty_lines (List.rev !result_mine) in + let trimmed_their = trim_empty_lines (List.rev !result_their) in + + Block.Changed { mine = trimmed_mine; their = trimmed_their; order }