Skip to content

Commit

Permalink
Merge pull request #66 from MITLibraries/tco22-hints-model
Browse files Browse the repository at this point in the history
Add Detector::SuggestedResource model
  • Loading branch information
matt-bernhardt authored Jul 29, 2024
2 parents cc5c517 + b578611 commit fa2aef7
Show file tree
Hide file tree
Showing 7 changed files with 241 additions and 1 deletion.
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ gem 'rack-cors'
# Use Redis adapter to run Action Cable in production
# gem "redis", ">= 4.0.1"

gem 'stringex'

# Use Kredis to get higher-level data types in Redis [https://github.com/rails/kredis]
# gem "kredis"

Expand Down
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ GEM
sqlite3 (1.7.3-x86_64-linux)
stimulus-rails (1.3.3)
railties (>= 6.0.0)
stringex (2.8.6)
stringio (3.1.1)
strscan (3.1.0)
thor (1.3.1)
Expand Down Expand Up @@ -368,6 +369,7 @@ DEPENDENCIES
sprockets-rails
sqlite3
stimulus-rails
stringex
turbo-rails
tzinfo-data
vcr
Expand Down
57 changes: 57 additions & 0 deletions app/models/detector/suggested_resource.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# frozen_string_literal: true

# == Schema Information
#
# Table name: detector_suggested_resources
#
# id :integer not null, primary key
# title :string
# url :string
# phrase :string
# fingerprint :string
# created_at :datetime not null
# updated_at :datetime not null
#

require 'stringex/core_ext'

module Detector
  # Detector::SuggestedResource stores custom hints that we want to send to the
  # user in response to specific strings. For example, a search for "web of
  # science" should be met with our custom login link to Web of Science via MIT.
  #
  # The +fingerprint+ attribute is derived from +phrase+ and is refreshed by a
  # +before_save+ callback every time the record is saved.
  class SuggestedResource < ApplicationRecord
    # Matches any single Unicode punctuation (\p{P}) or symbol (\p{S}) character.
    PUNCTUATION_AND_SYMBOLS = /\p{P}|\p{S}/.freeze

    before_save :update_fingerprint

    # Recomputes +fingerprint+ from the current value of +phrase+.
    #
    # @return [String, nil] the fingerprint that was assigned
    def update_fingerprint
      self.fingerprint = calculate_fingerprint(phrase)
    end

    # This implements the OpenRefine fingerprinting algorithm. See
    # https://openrefine.org/docs/technical-reference/clustering-in-depth#fingerprint
    #
    # @param old_phrase [String, nil] the text to fingerprint; it is not mutated
    # @return [String, nil] the unique tokens of the normalized phrase, sorted
    #   and joined by single spaces, or nil when no phrase was given
    def calculate_fingerprint(old_phrase)
      # The phrase column is nullable, so a record may be saved without one.
      # There is nothing to fingerprint in that case.
      return if old_phrase.nil?

      # Trim surrounding whitespace, lowercase, then remove all punctuation and
      # symbol characters from the string.
      normalized = old_phrase.strip.downcase.gsub(PUNCTUATION_AND_SYMBOLS, '')

      # Normalize to ASCII (e.g. gödel and godel are liable to be intended to
      # find the same thing). Coercion to ASCII can introduce new symbols, so
      # we remove those again afterwards.
      # NOTE(review): transliteration may also emit uppercase letters for some
      # scripts; confirm whether a second downcase is wanted here.
      normalized = normalized.to_ascii.gsub(PUNCTUATION_AND_SYMBOLS, '')

      # Tokenize on whitespace, drop duplicate tokens, sort, and rejoin.
      normalized.split.uniq.sort.join(' ')
    end
  end
end
14 changes: 14 additions & 0 deletions db/migrate/20240716143850_create_detector_suggested_resources.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
class CreateDetectorSuggestedResources < ActiveRecord::Migration[7.1]
  # Creates the detector_suggested_resources table with four string columns
  # plus timestamps, then adds a unique index on each of the two lookup
  # columns (phrase and fingerprint).
  def change
    create_table :detector_suggested_resources do |table|
      table.string :title
      table.string :url
      table.string :phrase
      table.string :fingerprint

      table.timestamps
    end

    # Both lookup columns must be unique; index them in the same order as before.
    %i[phrase fingerprint].each do |column|
      add_index :detector_suggested_resources, column, unique: true
    end
  end
end
13 changes: 12 additions & 1 deletion db/schema.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 31 additions & 0 deletions test/fixtures/detector/suggested_resources.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# == Schema Information
#
# Table name: detector_suggested_resources
#
# id :integer not null, primary key
# title :string
# url :string
# phrase :string
# fingerprint :string
# created_at :datetime not null
# updated_at :datetime not null
#


# Each fingerprint below is its phrase reduced to lowercase tokens, with
# duplicates removed and the tokens sorted alphabetically
# (e.g. "web of science" becomes "of science web").
# NOTE(review): fixtures are presumably inserted without running the model's
# before_save callback, so these values must be kept in sync by hand - confirm.
jstor:
title: JSTOR
url: https://libguides.mit.edu/jstor
phrase: jstor
fingerprint: jstor

# Shares its url with web_of_knowledge; only the phrase and fingerprint differ.
web_of_science:
title: Web of Science
url: https://libguides.mit.edu/webofsci
phrase: web of science
fingerprint: of science web

web_of_knowledge:
title: Web of Knowledge
url: https://libguides.mit.edu/webofsci
phrase: web of knowledge
fingerprint: knowledge of web
123 changes: 123 additions & 0 deletions test/models/detector/suggested_resource_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# frozen_string_literal: true

# == Schema Information
#
# Table name: detector_suggested_resources
#
# id :integer not null, primary key
# title :string
# url :string
# phrase :string
# fingerprint :string
# created_at :datetime not null
# updated_at :datetime not null
#
require 'test_helper'

module Detector
  # Covers the fingerprint behavior of Detector::SuggestedResource: the value is
  # generated on create, regenerated on every save, and follows each step of the
  # normalization (trim, lowercase, strip punctuation/symbols, coerce to ASCII,
  # de-duplicate, sort).
  #
  # Comparisons use assert_equal / refute_equal rather than `assert a == b` so
  # that a failure reports the expected and actual values.
  class SuggestedResourceTest < ActiveSupport::TestCase
    test 'fingerprints are generated automatically' do
      resource = {
        title: 'Our latest resource',
        url: 'https://example.org',
        phrase: 'Our latest resource'
      }

      new_resource = Detector::SuggestedResource.create(resource)

      assert_equal 'latest our resource', new_resource.fingerprint
    end

    test 'fingerprints are recalculated on save' do
      resource = detector_suggested_resources('jstor')
      # Precondition: the fixture must not already carry the fingerprint we
      # expect to see after the phrase changes.
      refute_equal 'a brand is new phrase this', resource.fingerprint

      resource.phrase = 'This is a brand new phrase'
      resource.save
      resource.reload

      assert_equal 'a brand is new phrase this', resource.fingerprint
    end

    test 'generating fingerprints does not alter the phrase' do
      resource = detector_suggested_resources('jstor')
      # Mixed case, punctuation and a trailing space: everything the
      # fingerprint would normalize away must survive in the phrase itself.
      benchmark = 'This is an updated phrase! '

      refute_equal benchmark, resource.phrase
      resource.phrase = benchmark
      resource.save
      resource.reload

      assert_equal benchmark, resource.phrase
    end

    test 'fingerprints strip extra spaces' do
      resource = detector_suggested_resources('jstor')
      refute_equal 'i need space', resource.fingerprint

      resource.phrase = ' i need space '
      resource.save
      resource.reload

      assert_equal 'i need space', resource.fingerprint
    end

    test 'fingerprints are coerced to lowercase' do
      resource = detector_suggested_resources('jstor')
      refute_equal 'ftw intercapping', resource.fingerprint

      resource.phrase = 'InterCapping FTW'
      resource.save
      resource.reload

      assert_equal 'ftw intercapping', resource.fingerprint
    end

    test 'fingerprints remove punctuation and symbols' do
      resource = detector_suggested_resources('jstor')
      refute_equal 'bullets perfect phrase punctuation quoted symbols', resource.fingerprint

      resource.phrase = 'symbols™ + punctuation: * bullets! - "quoted phrase" (perfect) ¥€$'
      resource.save
      resource.reload

      assert_equal 'bullets perfect phrase punctuation quoted symbols', resource.fingerprint
    end

    test 'fingerprints coerce characters to ASCII' do
      resource = {
        title: 'A wide range of characters',
        url: 'https://example.org',
        phrase: 'а а̀ а̂ а̄ ӓ б в г ґ д ђ ѓ е ѐ е̄ е̂ ё є ж з з́ ѕ и і ї ꙇ ѝ и̂ ӣ й ј к л љ м н њ о о̀ о̂ ō ӧ п р с с́'\
                ' т ћ ќ у у̀ у̂ ӯ ў ӱ ф х ц ч џ ш щ ꙏ ъ ъ̀ ы ь ѣ э ю ю̀ я'
      }

      new_resource = Detector::SuggestedResource.create(resource)

      expected = 'a b ch d dj dz dzh e f g gh gj i ia ie io iu j k kh kj l lj m n nj o p r s '\
                 'sh shch t ts tsh u v y yi z zh'
      assert_equal expected, new_resource.fingerprint
    end

    test 'fingerprints remove repeated words' do
      resource = detector_suggested_resources('jstor')
      refute_equal 'double', resource.fingerprint

      resource.phrase = 'double double'
      resource.save
      resource.reload

      assert_equal 'double', resource.fingerprint
    end

    test 'fingerprints sort words alphabetically' do
      resource = detector_suggested_resources('jstor')
      refute_equal 'delta gamma', resource.fingerprint

      resource.phrase = 'gamma delta'
      resource.save
      resource.reload

      assert_equal 'delta gamma', resource.fingerprint
    end
  end
end

0 comments on commit fa2aef7

Please sign in to comment.