Updates after informal code review

* There are more tests, each explicitly testing one aspect of the fingerprinting. Most notable is one that confirms the original phrase is not altered while the fingerprint is being calculated.
* The calculate_fingerprint method now takes an explicit argument to receive the phrase.
* Variable names internal to this method have been clarified.
* Bang methods are no longer used.
MITLibraries · Jul 26, 2024 · b578611 · b578611
1 parent 76a62d6
commit b578611
Show file tree

Hide file tree

Showing 2 changed files with 75 additions and 20 deletions.
diff --git a/app/models/detector/suggested_resource.rb b/app/models/detector/suggested_resource.rb
@@ -23,35 +23,35 @@ class SuggestedResource < ApplicationRecord
     before_save :update_fingerprint
 
     def update_fingerprint
-      self.fingerprint = calculate_fingerprint
+      self.fingerprint = calculate_fingerprint(phrase)
     end
 
     # This implements the OpenRefine fingerprinting algorithm. See
     # https://openrefine.org/docs/technical-reference/clustering-in-depth#fingerprint
-    def calculate_fingerprint
-      temp = phrase
-      temp.strip!
-      temp.downcase!
+    def calculate_fingerprint(old_phrase)
+      modified_phrase = old_phrase
+      modified_phrase = modified_phrase.strip
+      modified_phrase = modified_phrase.downcase
 
       # This removes all punctuation and symbol characters from the string.
-      temp.gsub!(/\p{P}|\p{S}/, '')
+      modified_phrase = modified_phrase.gsub(/\p{P}|\p{S}/, '')
 
       # Normalize to ASCII (e.g. gödel and godel are liable to be intended to
       # find the same thing)
-      temp = temp.to_ascii
+      modified_phrase = modified_phrase.to_ascii
 
       # Coercion to ASCII can introduce new symbols, so we remove those now.
-      temp.gsub!(/\p{P}|\p{S}/, '')
+      modified_phrase = modified_phrase.gsub(/\p{P}|\p{S}/, '')
 
       # Tokenize
-      array = temp.split
+      tokens = modified_phrase.split
 
       # Remove duplicates and sort
-      array.uniq!
-      array.sort!
+      tokens = tokens.uniq
+      tokens = tokens.sort
 
       # Rejoin tokens
-      array.join(' ')
+      tokens.join(' ')
     end
   end
 end
diff --git a/test/models/detector/suggested_resource_test.rb b/test/models/detector/suggested_resource_test.rb
@@ -39,16 +39,49 @@ class SuggestedResourceTest < ActiveSupport::TestCase
       assert resource.fingerprint == 'a brand is new phrase this'
     end
 
-    test 'fingerprints standardize characters used' do
-      resource = {
-        title: 'A wide range of characters',
-        url: 'https://example.org',
-        phrase: 'This phrase uses: WeIrD caPital letters, * (punctuation), and symbols™ like ¥€$'
-      }
+    test 'generating fingerprints does not alter the phrase' do
+      resource = detector_suggested_resources('jstor')
+      benchmark = 'This is an updated phrase! '
 
-      new_resource = Detector::SuggestedResource.create(resource)
+      refute resource.phrase == benchmark
+      resource.phrase = benchmark
+      resource.save
+      resource.reload
+
+      assert resource.phrase == benchmark
+    end
+
+    test 'fingerprints strip extra spaces' do
+      resource = detector_suggested_resources('jstor')
+      refute resource.fingerprint == 'i need space'
+
+      resource.phrase = '  i  need  space  '
+      resource.save
+      resource.reload
+
+      assert resource.fingerprint == 'i need space'
+    end
+
+    test 'fingerprints are coerced to lowercase' do
+      resource = detector_suggested_resources('jstor')
+      refute resource.fingerprint == 'ftw intercapping'
+
+      resource.phrase = 'InterCapping FTW'
+      resource.save
+      resource.reload
 
-      assert new_resource.fingerprint == 'and capital letters like phrase punctuation symbols this uses weird'
+      assert resource.fingerprint == 'ftw intercapping'
+    end
+
+    test 'fingerprints remove punctuation and symbols' do
+      resource = detector_suggested_resources('jstor')
+      refute resource.fingerprint == 'bullets perfect phrase punctuation quoted symbols'
+
+      resource.phrase = 'symbols™ + punctuation: * bullets! - "quoted phrase" (perfect) ¥€$'
+      resource.save
+      resource.reload
+
+      assert resource.fingerprint == 'bullets perfect phrase punctuation quoted symbols'
     end
 
     test 'fingerprints coerce characters to ASCII' do
@@ -64,5 +97,27 @@ class SuggestedResourceTest < ActiveSupport::TestCase
       assert new_resource.fingerprint == 'a b ch d dj dz dzh e f g gh gj i ia ie io iu j k kh kj l lj m n nj o p r s '\
       'sh shch t ts tsh u v y yi z zh'
     end
+
+    test 'fingerprints remove repeated words' do
+      resource = detector_suggested_resources('jstor')
+      refute resource.fingerprint == 'double'
+
+      resource.phrase = 'double double'
+      resource.save
+      resource.reload
+
+      assert resource.fingerprint == 'double'
+    end
+
+    test 'fingerprints sort words alphabetically' do
+      resource = detector_suggested_resources('jstor')
+      refute resource.fingerprint == 'delta gamma'
+
+      resource.phrase = 'gamma delta'
+      resource.save
+      resource.reload
+
+      assert resource.fingerprint == 'delta gamma'
+    end
   end
 end