From 0aa7af8cd74e368406a43eef47eba8be2b67a003 Mon Sep 17 00:00:00 2001 From: Kai-Uwe Humpert Date: Thu, 3 Jun 2021 14:51:11 +0200 Subject: [PATCH 1/7] Streamline some code - `Hash` knows how to initialize values of missing keys. - `NodeMap` returns the map and is not (anymore) called with a block. --- lib/hyp_diff.rb | 7 +++---- lib/hyp_diff/text_from_node.rb | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/hyp_diff.rb b/lib/hyp_diff.rb index 047f4f3..0ea2554 100644 --- a/lib/hyp_diff.rb +++ b/lib/hyp_diff.rb @@ -42,14 +42,14 @@ def compare(before, after, options = {}) # @api private class NodeMap - def self.for(change_node_tuples, &block) + def self.for(change_node_tuples) new.build(change_node_tuples).map end attr_reader :map def initialize - @map = {} + @map = Hash.new {|h, k| h[k] = [] } @stashed = [] end @@ -79,8 +79,7 @@ def build(change_node_tuples) end def append_to_node(node, change) - list = (@map[node] ||= []) - list << change + @map[node] << change end end diff --git a/lib/hyp_diff/text_from_node.rb b/lib/hyp_diff/text_from_node.rb index 9ad5d43..7800710 100644 --- a/lib/hyp_diff/text_from_node.rb +++ b/lib/hyp_diff/text_from_node.rb @@ -7,7 +7,7 @@ def initialize(raw_text, node) end def ==(other) - text == other.text + eql?(other) end def eql?(other) From a6b2be6246b0be25697188042ee1cecc28b6083b Mon Sep 17 00:00:00 2001 From: Kai-Uwe Humpert Date: Thu, 3 Jun 2021 17:11:36 +0200 Subject: [PATCH 2/7] [test] aggregate_failures reduces red surprises --- spec/hyp_diff_spec.rb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spec/hyp_diff_spec.rb b/spec/hyp_diff_spec.rb index 83f0b36..0b06794 100644 --- a/spec/hyp_diff_spec.rb +++ b/spec/hyp_diff_spec.rb @@ -125,7 +125,7 @@ def expect_diff(old, new, expected) expect_diff("hello world", "hello world", "hello world") end - it "treats consecutive whitespace as a single whitespace across tags" do + it "treats consecutive whitespace as a single whitespace across tags", :aggregate_failures do expect_diff( "hello world", "hello world", @@ -138,7 +138,7 @@ def expect_diff(old, new, expected) ) end - it "considers trailing and leading whitespace for insertions and deletions" do + it "considers trailing and leading whitespace for insertions and deletions", :aggregate_failures do expect_diff("hello", "hello world", "hello world") expect_diff("hello world", "hello", "hello world") expect_diff("world", "hello world", "hello world") @@ -149,14 +149,14 @@ def expect_diff(old, new, expected) expect_diff("hello world", "hello ", "hello world") end - it "considers trailing and leading whitespace changes" do + it "considers trailing and leading whitespace changes", :aggregate_failures do expect_diff("hello ", "hello", "hello ") expect_diff("hello", "hello ", "hello ") expect_diff(" hello", "hello", " hello") expect_diff("hello", " hello", " hello") end - it "considers changes of text and whitespace" do + it "considers changes of text and whitespace", :aggregate_failures do expect_diff("hello world ", "hello friend", "hello world friend") expect_diff(" bye world", "hello world", " byehello world") expect_diff("hello friend", "hello world ", "hello friendworld ") @@ -168,7 +168,7 @@ def expect_diff(old, new, expected) expect_diff("hello world", "hello, world", "hello, world") end - it "diffs changes of punctuation to words" do + it "diffs changes of punctuation to words", :aggregate_failures do expect_diff( "hello, world", "hello beautiful world", @@ -181,7 +181,7 @@ def expect_diff(old, new, expected) ) end - it "diffs changes of punctuation to leading and trailing spaces" do + it "diffs changes of punctuation to leading and trailing spaces", :aggregate_failures do expect_diff("hello.", "hello ", "hello. ") expect_diff("hello ", "hello.", "hello .") expect_diff(" hello", ".hello", " .hello") From 5ba85f1fbb082a2b2a0970fed0c5896acd0fcd80 Mon Sep 17 00:00:00 2001 From: Kai-Uwe Humpert Date: Thu, 3 Jun 2021 14:28:42 +0200 Subject: [PATCH 3/7] [test] Ensure stable diff output for pseudo adjacent whitespaces --- spec/hyp_diff_spec.rb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spec/hyp_diff_spec.rb b/spec/hyp_diff_spec.rb index 0b06794..3c02611 100644 --- a/spec/hyp_diff_spec.rb +++ b/spec/hyp_diff_spec.rb @@ -192,4 +192,10 @@ def expect_diff(old, new, expected) expect_diff("hello world", "hello world.", "hello world.") end + it "converts newlines to spaces" do + content = + "

Office philosophy

\n

No set working places

\n

bla bla

" + expect_diff(content, content, + "

Office philosophy

No set working places

bla bla

") + end end From 221205ec50b4cad0223ed703713dc783e6ff7245 Mon Sep 17 00:00:00 2001 From: Kai-Uwe Humpert Date: Thu, 3 Jun 2021 02:09:39 +0200 Subject: [PATCH 4/7] [test] Should perform fast for medium sized (plain) text --- spec/large_text_spec.rb | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 spec/large_text_spec.rb diff --git a/spec/large_text_spec.rb b/spec/large_text_spec.rb new file mode 100644 index 0000000..ae4dad5 --- /dev/null +++ b/spec/large_text_spec.rb @@ -0,0 +1,24 @@ +# encoding: utf-8 +require "hyp_diff" +require "securerandom" + +describe HypDiff do + context "for large text" do + it "performs reasonably fast" do + words = [] + 300.times do + 8.times do |i| + words << SecureRandom.hex([i + 2, 6].min) + end + words << "replace" + words << "me." + end + text = words.join(" ") + modified_text = text.gsub("replace me", "better text") + start = Time.now + + HypDiff.compare(text, modified_text) + expect(Time.now - start).to be < 1 + end + end +end From b1e003b2dc3504cc9e61292472be0d3d2b3f3407 Mon Sep 17 00:00:00 2001 From: Kai-Uwe Humpert Date: Thu, 3 Jun 2021 01:55:30 +0200 Subject: [PATCH 5/7] Increase diff speed by hiding single whitespaces from Diff::LCS --- lib/hyp_diff.rb | 98 +++++++++++++++++++++++++++++++--- lib/hyp_diff/text_from_node.rb | 6 +++ spec/hyp_diff_spec.rb | 2 +- 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/lib/hyp_diff.rb b/lib/hyp_diff.rb index 0ea2554..b18c48c 100644 --- a/lib/hyp_diff.rb +++ b/lib/hyp_diff.rb @@ -102,15 +102,63 @@ def render(changes) changes.each do |change| case change.action when "!" then - deletions << change.old_element.text - insertions << change.new_element.text + old_fulltext = change.old_element.fulltext + new_fulltext = change.new_element.fulltext + if old_fulltext.include?(new_fulltext) + if old_fulltext.start_with?(new_fulltext) + apply_insertions_and_deletions + new_text << new_fulltext + deletions << old_fulltext[new_fulltext.length..-1] + next + end + if old_fulltext.end_with?(new_fulltext) + deletions << old_fulltext[0, old_fulltext.length - new_fulltext.length] + apply_insertions_and_deletions + new_text << new_fulltext + next + end + end + if new_fulltext.include?(old_fulltext) + if new_fulltext.start_with?(old_fulltext) + apply_insertions_and_deletions + new_text << old_fulltext + insertions << new_fulltext[old_fulltext.length..-1] + next + end + if new_fulltext.end_with?(old_fulltext) + insertions << new_fulltext[0, new_fulltext.length - old_fulltext.length] + apply_insertions_and_deletions + new_text << old_fulltext + next + end + end + if insertions.empty? && deletions.empty? && change.old_element.before_whitespace && change.new_element.before_whitespace + new_text << " " + deletions << change.old_element.text + insertions << change.new_element.text + next + end + deletions << change.old_element.fulltext + insertions << change.new_element.fulltext when "=" then + if change.old_element.before_whitespace && !change.new_element.before_whitespace + deletions << " " + apply_insertions_and_deletions + new_text << change.new_element.text + next + end + if change.new_element.before_whitespace && !change.old_element.before_whitespace + insertions << " " + apply_insertions_and_deletions + new_text << change.new_element.text + next + end apply_insertions_and_deletions - new_text << escape_html(change.new_element.text) + new_text << escape_html(change.new_element.fulltext) when "+" then - insertions << change.new_element.text + insertions << change.new_element.fulltext when "-" then - deletions << change.old_element.text + deletions << change.old_element.fulltext else raise "unexpected change.action #{change.action}" end @@ -130,6 +178,15 @@ def rendered_text attr_reader :insertions, :deletions, :new_text def apply_insertions_and_deletions + unless deletions.empty? || insertions.empty? + while !deletions.empty? && !insertions.empty? + break unless deletions.first == insertions.first + + deletions.shift + new_text << insertions.shift + end + end + if deletions.length > 0 new_text << deletion_tag(deletions.join) end @@ -162,7 +219,7 @@ def parse(text) end def extract_text(node) - filter_whitespace(text_fragments(node)) + merge_whitespace(filter_whitespace(text_fragments(node))) end def text_fragments(node) @@ -186,5 +243,34 @@ def filter_whitespace(node_list) result end + def merge_whitespace(node_list) + result = [] + + last_whitespace_node = nil + node_list.each do |node| + if node.whitespace? + last_whitespace_node = node + next + end + + unless last_whitespace_node + result << node + next + end + + if last_whitespace_node.node.equal?(node.node) + node.before_whitespace = last_whitespace_node + else + result << last_whitespace_node + end + last_whitespace_node = nil + result << node + end + + result << last_whitespace_node if last_whitespace_node + + result + end + end; end diff --git a/lib/hyp_diff/text_from_node.rb b/lib/hyp_diff/text_from_node.rb index 7800710..28053d6 100644 --- a/lib/hyp_diff/text_from_node.rb +++ b/lib/hyp_diff/text_from_node.rb @@ -1,6 +1,8 @@ module HypDiff class TextFromNode + attr_accessor :before_whitespace + def initialize(raw_text, node) @text = raw_text.strip == "" ? " " : raw_text @node = node @@ -22,6 +24,10 @@ def whitespace? @text == " " end + def fulltext + before_whitespace ? " #{text}" : text + end + def text @text end diff --git a/spec/hyp_diff_spec.rb b/spec/hyp_diff_spec.rb index 3c02611..9bfda0b 100644 --- a/spec/hyp_diff_spec.rb +++ b/spec/hyp_diff_spec.rb @@ -48,7 +48,7 @@ def expect_diff(old, new, expected) end it "merges consecutive deletions into a single tag" do - expect_diff("hello beautiful world", "hello world", "hello beautiful world") + expect_diff("hello beautiful world", "hello world", "hello beautiful world") end it "merge consecutive additions and edits into single tags" do From 026ec1b8356e9794e0941939a9bfad67c1d08fb5 Mon Sep 17 00:00:00 2001 From: Kai-Uwe Humpert Date: Thu, 3 Jun 2021 17:26:10 +0200 Subject: [PATCH 6/7] [spec] Expect and edge case's outcome differently --- spec/hyp_diff_spec.rb | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/spec/hyp_diff_spec.rb b/spec/hyp_diff_spec.rb index 9bfda0b..c3ccde9 100644 --- a/spec/hyp_diff_spec.rb +++ b/spec/hyp_diff_spec.rb @@ -125,17 +125,22 @@ def expect_diff(old, new, expected) expect_diff("hello world", "hello world", "hello world") end - it "treats consecutive whitespace as a single whitespace across tags", :aggregate_failures do + it "treats consecutive whitespace as a single whitespace across tags" do expect_diff( "hello world", "hello world", "hello world" ) - expect_diff( - "hello world", - "hello world", - "hello world" - ) + end + + it "treats consecutive whitespace as a single whitespace across tags (best effort for special cases)" do + expect( + HypDiff.compare( + "hello world", + "hello world", + ) + ).to eq("hello world") + .or eq("hello world") end it "considers trailing and leading whitespace for insertions and deletions", :aggregate_failures do From 2af49b443aa470f76c22f6cbaa98f75bb0c0728f Mon Sep 17 00:00:00 2001 From: Kai-Uwe Humpert Date: Thu, 3 Jun 2021 19:03:34 +0200 Subject: [PATCH 7/7] Remove artifact The loop's condition is sufficient. The artifact is from a spike which tried to identify a "common tail" of insertions and deletions. --- lib/hyp_diff.rb | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/hyp_diff.rb b/lib/hyp_diff.rb index b18c48c..fbff4e0 100644 --- a/lib/hyp_diff.rb +++ b/lib/hyp_diff.rb @@ -178,13 +178,11 @@ def rendered_text attr_reader :insertions, :deletions, :new_text def apply_insertions_and_deletions - unless deletions.empty? || insertions.empty? - while !deletions.empty? && !insertions.empty? - break unless deletions.first == insertions.first + while !deletions.empty? && !insertions.empty? + break unless deletions.first == insertions.first - deletions.shift - new_text << insertions.shift - end + deletions.shift + new_text << insertions.shift end if deletions.length > 0