-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #66 from MITLibraries/tco22-hints-model
Add Detector::SuggestedResource model
- Loading branch information
Showing 7 changed files with 241 additions and 1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# frozen_string_literal: true | ||
|
||
# == Schema Information | ||
# | ||
# Table name: detector_suggested_resources | ||
# | ||
# id :integer not null, primary key | ||
# title :string | ||
# url :string | ||
# phrase :string | ||
# fingerprint :string | ||
# created_at :datetime not null | ||
# updated_at :datetime not null | ||
# | ||
|
||
require 'stringex/core_ext' | ||
|
||
module Detector
  # Detector::SuggestedResource stores custom hints that we want to send to the
  # user in response to specific strings. For example, a search for "web of
  # science" should be met with our custom login link to Web of Science via MIT.
  #
  # Each record carries a +fingerprint+: a normalized form of its +phrase+ that
  # is recomputed by a +before_save+ callback every time the record is saved.
  class SuggestedResource < ApplicationRecord
    # Matches any single Unicode punctuation (\p{P}) or symbol (\p{S}) character.
    PUNCTUATION_AND_SYMBOLS = /[\p{P}\p{S}]/

    before_save :update_fingerprint

    # Callback that refreshes +fingerprint+ from the current value of +phrase+.
    # The phrase itself is never modified.
    def update_fingerprint
      self.fingerprint = calculate_fingerprint(phrase)
    end

    # This implements the OpenRefine fingerprinting algorithm. See
    # https://openrefine.org/docs/technical-reference/clustering-in-depth#fingerprint
    #
    # @param old_phrase [String, nil] the phrase to normalize; it is not mutated.
    # @return [String, nil] the lowercased, ASCII-coerced, de-duplicated and
    #   alphabetically sorted tokens of the phrase joined by single spaces, or
    #   nil when no phrase was given.
    def calculate_fingerprint(old_phrase)
      # The phrase column is nullable, so a record may be saved without one.
      # There is nothing to fingerprint in that case; return nil rather than
      # raising NoMethodError from inside the before_save callback.
      return if old_phrase.nil?

      normalized = old_phrase.strip.downcase

      # Remove all punctuation and symbol characters from the string.
      normalized = normalized.gsub(PUNCTUATION_AND_SYMBOLS, '')

      # Normalize to ASCII (e.g. "gödel" and "godel" are most likely intended
      # to find the same thing).
      normalized = normalized.to_ascii

      # Coercion to ASCII can introduce new symbols, so we remove those now.
      normalized = normalized.gsub(PUNCTUATION_AND_SYMBOLS, '')

      # Tokenize on whitespace, drop duplicate tokens, sort, and rejoin.
      normalized.split.uniq.sort.join(' ')
    end
  end
end
14 changes: 14 additions & 0 deletions
14
db/migrate/20240716143850_create_detector_suggested_resources.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Creates the detector_suggested_resources table: four string columns (title,
# url, phrase, fingerprint) plus timestamps, with unique indexes on phrase and
# on fingerprint.
class CreateDetectorSuggestedResources < ActiveRecord::Migration[7.1]
  def change
    create_table :detector_suggested_resources do |t|
      t.string :title, :url, :phrase, :fingerprint
      t.timestamps

      # Neither the phrase nor its fingerprint may appear on more than one row.
      t.index :phrase, unique: true
      t.index :fingerprint, unique: true
    end
  end
end
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# == Schema Information | ||
# | ||
# Table name: detector_suggested_resources | ||
# | ||
# id :integer not null, primary key | ||
# title :string | ||
# url :string | ||
# phrase :string | ||
# fingerprint :string | ||
# created_at :datetime not null | ||
# updated_at :datetime not null | ||
# | ||
|
||
|
||
# Fixture rows for Detector::SuggestedResource. Each fingerprint below is
# written out by hand as the lowercased, de-duplicated, alphabetically sorted
# words of the phrase, which is the form the model's fingerprinting produces.

jstor:
  title: "JSTOR"
  url: "https://libguides.mit.edu/jstor"
  phrase: "jstor"
  fingerprint: "jstor"

web_of_science:
  title: "Web of Science"
  url: "https://libguides.mit.edu/webofsci"
  phrase: "web of science"
  fingerprint: "of science web"

web_of_knowledge:
  title: "Web of Knowledge"
  url: "https://libguides.mit.edu/webofsci"
  phrase: "web of knowledge"
  fingerprint: "knowledge of web"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
# frozen_string_literal: true | ||
|
||
# == Schema Information | ||
# | ||
# Table name: detector_suggested_resources | ||
# | ||
# id :integer not null, primary key | ||
# title :string | ||
# url :string | ||
# phrase :string | ||
# fingerprint :string | ||
# created_at :datetime not null | ||
# updated_at :datetime not null | ||
# | ||
require 'test_helper' | ||
|
||
module Detector
  # Tests for the fingerprint that Detector::SuggestedResource derives from its
  # phrase whenever a record is saved. Most tests start from the 'jstor' fixture,
  # change its phrase, save, and compare the reloaded fingerprint against the
  # expected normalized value.
  class SuggestedResourceTest < ActiveSupport::TestCase
    test 'fingerprints are generated automatically' do
      resource = {
        title: 'Our latest resource',
        url: 'https://example.org',
        phrase: 'Our latest resource'
      }

      # create! (rather than create) so a failed insert raises here instead of
      # surfacing later as a confusing fingerprint mismatch.
      new_resource = Detector::SuggestedResource.create!(resource)

      assert_equal 'latest our resource', new_resource.fingerprint
    end

    test 'fingerprints are recalculated on save' do
      assert_fingerprint_after_update('This is a brand new phrase', 'a brand is new phrase this')
    end

    test 'generating fingerprints does not alter the phrase' do
      resource = detector_suggested_resources('jstor')
      benchmark = 'This is an updated phrase! '

      refute_equal benchmark, resource.phrase
      resource.phrase = benchmark
      resource.save!
      resource.reload

      assert_equal benchmark, resource.phrase
    end

    test 'fingerprints strip extra spaces' do
      assert_fingerprint_after_update(' i need space ', 'i need space')
    end

    test 'fingerprints are coerced to lowercase' do
      assert_fingerprint_after_update('InterCapping FTW', 'ftw intercapping')
    end

    test 'fingerprints remove punctuation and symbols' do
      assert_fingerprint_after_update(
        'symbols™ + punctuation: * bullets! - "quoted phrase" (perfect) ¥€$',
        'bullets perfect phrase punctuation quoted symbols'
      )
    end

    test 'fingerprints coerce characters to ASCII' do
      resource = {
        title: 'A wide range of characters',
        url: 'https://example.org',
        phrase: 'а а̀ а̂ а̄ ӓ б в г ґ д ђ ѓ е ѐ е̄ е̂ ё є ж з з́ ѕ и і ї ꙇ ѝ и̂ ӣ й ј к л љ м н њ о о̀ о̂ ō ӧ п р с с́'\
                ' т ћ ќ у у̀ у̂ ӯ ў ӱ ф х ц ч џ ш щ ꙏ ъ ъ̀ ы ь ѣ э ю ю̀ я'
      }

      new_resource = Detector::SuggestedResource.create!(resource)

      assert_equal 'a b ch d dj dz dzh e f g gh gj i ia ie io iu j k kh kj l lj m n nj o p r s '\
                   'sh shch t ts tsh u v y yi z zh',
                   new_resource.fingerprint
    end

    test 'fingerprints remove repeated words' do
      assert_fingerprint_after_update('double double', 'double')
    end

    test 'fingerprints sort words alphabetically' do
      assert_fingerprint_after_update('gamma delta', 'delta gamma')
    end

    private

    # Sets the 'jstor' fixture's phrase to +phrase+, saves and reloads it, and
    # asserts that the stored fingerprint equals +expected+.
    #
    # The precondition guards against a vacuous pass: it confirms the fixture
    # does not already carry the expected fingerprint before the update.
    def assert_fingerprint_after_update(phrase, expected)
      resource = detector_suggested_resources('jstor')
      refute_equal expected, resource.fingerprint

      resource.phrase = phrase
      # save! so a failed save raises instead of silently returning false.
      resource.save!
      resource.reload

      assert_equal expected, resource.fingerprint
    end
  end
end