Skip to content

Commit

Permalink
Updates after informal code review
Browse files Browse the repository at this point in the history
* There are more tests, each explicitly testing one aspect of the
  fingerprinting. Most notable is a test confirming that the original
  phrase is not altered while the fingerprint is being calculated.

* The calculate_fingerprint method now has an explicit argument to
  receive the phrase

* Variable names internal to this method have been clarified

* Bang (in-place mutating) methods are no longer used, so the phrase
  string is not modified while the fingerprint is being calculated
  • Loading branch information
matt-bernhardt committed Jul 26, 2024
1 parent 76a62d6 commit b578611
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 20 deletions.
24 changes: 12 additions & 12 deletions app/models/detector/suggested_resource.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,35 +23,35 @@ class SuggestedResource < ApplicationRecord
before_save :update_fingerprint

def update_fingerprint
self.fingerprint = calculate_fingerprint
self.fingerprint = calculate_fingerprint(phrase)
end

# This implements the OpenRefine fingerprinting algorithm. See
# https://openrefine.org/docs/technical-reference/clustering-in-depth#fingerprint
def calculate_fingerprint
temp = phrase
temp.strip!
temp.downcase!
def calculate_fingerprint(old_phrase)
modified_phrase = old_phrase
modified_phrase = modified_phrase.strip
modified_phrase = modified_phrase.downcase

# This removes all punctuation and symbol characters from the string.
temp.gsub!(/\p{P}|\p{S}/, '')
modified_phrase = modified_phrase.gsub(/\p{P}|\p{S}/, '')

# Normalize to ASCII (e.g. gödel and godel are liable to be intended to
# find the same thing)
temp = temp.to_ascii
modified_phrase = modified_phrase.to_ascii

# Coercion to ASCII can introduce new symbols, so we remove those now.
temp.gsub!(/\p{P}|\p{S}/, '')
modified_phrase = modified_phrase.gsub(/\p{P}|\p{S}/, '')

# Tokenize
array = temp.split
tokens = modified_phrase.split

# Remove duplicates and sort
array.uniq!
array.sort!
tokens = tokens.uniq
tokens = tokens.sort

# Rejoin tokens
array.join(' ')
tokens.join(' ')
end
end
end
71 changes: 63 additions & 8 deletions test/models/detector/suggested_resource_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,49 @@ class SuggestedResourceTest < ActiveSupport::TestCase
assert resource.fingerprint == 'a brand is new phrase this'
end

test 'fingerprints standardize characters used' do
resource = {
title: 'A wide range of characters',
url: 'https://example.org',
phrase: 'This phrase uses: WeIrD caPital letters, * (punctuation), and symbols™ like ¥€$'
}
test 'generating fingerprints does not alter the phrase' do
resource = detector_suggested_resources('jstor')
benchmark = 'This is an updated phrase! '

new_resource = Detector::SuggestedResource.create(resource)
refute resource.phrase == benchmark
resource.phrase = benchmark
resource.save
resource.reload

assert resource.phrase == benchmark
end

test 'fingerprints strip extra spaces' do
resource = detector_suggested_resources('jstor')
refute resource.fingerprint == 'i need space'

resource.phrase = ' i need space '
resource.save
resource.reload

assert resource.fingerprint == 'i need space'
end

test 'fingerprints are coerced to lowercase' do
resource = detector_suggested_resources('jstor')
refute resource.fingerprint == 'ftw intercapping'

resource.phrase = 'InterCapping FTW'
resource.save
resource.reload

assert new_resource.fingerprint == 'and capital letters like phrase punctuation symbols this uses weird'
assert resource.fingerprint == 'ftw intercapping'
end

test 'fingerprints remove punctuation and symbols' do
resource = detector_suggested_resources('jstor')
refute resource.fingerprint == 'bullets perfect phrase punctuation quoted symbols'

resource.phrase = 'symbols™ + punctuation: * bullets! - "quoted phrase" (perfect) ¥€$'
resource.save
resource.reload

assert resource.fingerprint == 'bullets perfect phrase punctuation quoted symbols'
end

test 'fingerprints coerce characters to ASCII' do
Expand All @@ -64,5 +97,27 @@ class SuggestedResourceTest < ActiveSupport::TestCase
assert new_resource.fingerprint == 'a b ch d dj dz dzh e f g gh gj i ia ie io iu j k kh kj l lj m n nj o p r s '\
'sh shch t ts tsh u v y yi z zh'
end

test 'fingerprints remove repeated words' do
resource = detector_suggested_resources('jstor')
refute resource.fingerprint == 'double'

resource.phrase = 'double double'
resource.save
resource.reload

assert resource.fingerprint == 'double'
end

test 'fingerprints sort words alphabetically' do
resource = detector_suggested_resources('jstor')
refute resource.fingerprint == 'delta gamma'

resource.phrase = 'gamma delta'
resource.save
resource.reload

assert resource.fingerprint == 'delta gamma'
end
end
end

0 comments on commit b578611

Please sign in to comment.