diff --git a/Gemfile b/Gemfile
index c07aed4..ffe03d9 100644
--- a/Gemfile
+++ b/Gemfile
@@ -46,6 +46,8 @@ gem 'rack-cors'
 # Use Redis adapter to run Action Cable in production
 # gem "redis", ">= 4.0.1"
 
+gem 'stringex'
+
 # Use Kredis to get higher-level data types in Redis [https://github.com/rails/kredis]
 # gem "kredis"
 
diff --git a/Gemfile.lock b/Gemfile.lock
index 3ffa3b7..0748244 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -306,6 +306,7 @@ GEM
     sqlite3 (1.7.3-x86_64-linux)
     stimulus-rails (1.3.3)
       railties (>= 6.0.0)
+    stringex (2.8.6)
     stringio (3.1.1)
     strscan (3.1.0)
     thor (1.3.1)
@@ -368,6 +369,7 @@ DEPENDENCIES
   sprockets-rails
   sqlite3
   stimulus-rails
+  stringex
   turbo-rails
   tzinfo-data
   vcr
diff --git a/app/models/detector/suggested_resource.rb b/app/models/detector/suggested_resource.rb
new file mode 100644
index 0000000..526074e
--- /dev/null
+++ b/app/models/detector/suggested_resource.rb
@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+
+# == Schema Information
+#
+# Table name: detector_suggested_resources
+#
+#  id          :integer          not null, primary key
+#  title       :string
+#  url         :string
+#  phrase      :string
+#  fingerprint :string
+#  created_at  :datetime         not null
+#  updated_at  :datetime         not null
+#
+
+require 'stringex/core_ext'
+
+module Detector
+  # Detector::SuggestedResource stores custom hints that we want to send to the
+  # user in response to specific strings. For example, a search for "web of
+  # science" should be met with our custom login link to Web of Science via MIT.
+  #
+  # Each record carries a fingerprint derived from its phrase; the fingerprint
+  # is recalculated by a before_save callback.
+  class SuggestedResource < ApplicationRecord
+    # Matches a single Unicode punctuation (\p{P}) or symbol (\p{S}) character.
+    PUNCTUATION_AND_SYMBOLS = /[\p{P}\p{S}]/
+
+    before_save :update_fingerprint
+
+    # Callback: stores the fingerprint of the current phrase on the record.
+    def update_fingerprint
+      self.fingerprint = calculate_fingerprint(phrase)
+    end
+
+    # This implements the OpenRefine fingerprinting algorithm. See
+    # https://openrefine.org/docs/technical-reference/clustering-in-depth#fingerprint
+    #
+    # @param old_phrase [String, nil] the phrase to fingerprint; it is not mutated
+    # @return [String, nil] the distinct tokens of the normalized phrase, sorted
+    #   and joined by single spaces, or nil when old_phrase is nil
+    def calculate_fingerprint(old_phrase)
+      # The phrase column is nullable and nothing here validates its presence,
+      # so tolerate a missing phrase instead of raising NoMethodError from
+      # inside the save callback.
+      return if old_phrase.nil?
+
+      cleaned = remove_punctuation_and_symbols(old_phrase.strip.downcase)
+
+      # Normalize to ASCII (e.g. gödel and godel are liable to be intended to
+      # find the same thing). Coercion to ASCII can introduce new symbols, so
+      # punctuation and symbols are removed a second time afterwards.
+      cleaned = remove_punctuation_and_symbols(cleaned.to_ascii)
+
+      # Tokenize on whitespace, drop duplicates, sort, and rejoin.
+      cleaned.split.uniq.sort.join(' ')
+    end
+
+    private
+
+    # Returns a copy of text with every punctuation and symbol character removed.
+    def remove_punctuation_and_symbols(text)
+      text.gsub(PUNCTUATION_AND_SYMBOLS, '')
+    end
+  end
+end
diff --git a/db/migrate/20240716143850_create_detector_suggested_resources.rb b/db/migrate/20240716143850_create_detector_suggested_resources.rb
new file mode 100644
index 0000000..b13e90b
--- /dev/null
+++ b/db/migrate/20240716143850_create_detector_suggested_resources.rb
@@ -0,0 +1,14 @@
+class CreateDetectorSuggestedResources < ActiveRecord::Migration[7.1]
+  def change
+    create_table :detector_suggested_resources do |t|
+      t.string :title
+      t.string :url
+      t.string :phrase
+      t.string :fingerprint
+
+      t.timestamps
+    end
+    add_index :detector_suggested_resources, :phrase, unique: true
+    add_index :detector_suggested_resources, :fingerprint, unique: true
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 3acaf32..52c9d2d 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -10,7 +10,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema[7.1].define(version: 2024_07_01_205444) do
+ActiveRecord::Schema[7.1].define(version: 2024_07_16_143850) do
   create_table "detector_journals", force: :cascade do |t|
     t.string "name"
     t.json "additional_info"
@@ -19,6 +19,17 @@
     t.index ["name"], name: "index_detector_journals_on_name"
   end
 
+  create_table "detector_suggested_resources", force: :cascade do |t|
+    t.string "title"
+    t.string "url"
+    t.string "phrase"
+    t.string "fingerprint"
+    t.datetime "created_at", null: false
+    t.datetime "updated_at", null: false
+    t.index ["fingerprint"], name: "index_detector_suggested_resources_on_fingerprint", unique: true
+    t.index ["phrase"], name: "index_detector_suggested_resources_on_phrase", unique: true
+  end
+
   create_table "metrics_algorithms", force: :cascade do |t|
     t.date "month"
     t.integer "doi"
diff --git a/test/fixtures/detector/suggested_resources.yml b/test/fixtures/detector/suggested_resources.yml
new file mode 100644
index 0000000..82c43a7
--- /dev/null
+++ b/test/fixtures/detector/suggested_resources.yml
@@ -0,0 +1,31 @@
+# == Schema Information
+#
+# Table name: detector_suggested_resources
+#
+#  id          :integer          not null, primary key
+#  title       :string
+#  url         :string
+#  phrase      :string
+#  fingerprint :string
+#  created_at  :datetime         not null
+#  updated_at  :datetime         not null
+#
+
+
+jstor:
+  title: JSTOR
+  url: https://libguides.mit.edu/jstor
+  phrase: jstor
+  fingerprint: jstor
+
+web_of_science:
+  title: Web of Science
+  url: https://libguides.mit.edu/webofsci
+  phrase: web of science
+  fingerprint: of science web
+
+web_of_knowledge:
+  title: Web of Knowledge
+  url: https://libguides.mit.edu/webofsci
+  phrase: web of knowledge
+  fingerprint: knowledge of web
diff --git a/test/models/detector/suggested_resource_test.rb b/test/models/detector/suggested_resource_test.rb
new file mode 100644
index 0000000..620b35b
--- /dev/null
+++ b/test/models/detector/suggested_resource_test.rb
@@ -0,0 +1,125 @@
+# frozen_string_literal: true
+
+# == Schema Information
+#
+# Table name: detector_suggested_resources
+#
+#  id          :integer          not null, primary key
+#  title       :string
+#  url         :string
+#  phrase      :string
+#  fingerprint :string
+#  created_at  :datetime         not null
+#  updated_at  :datetime         not null
+#
+require 'test_helper'
+
+module Detector
+  # Exercises the fingerprint that Detector::SuggestedResource derives from its
+  # phrase whenever a record is saved.
+  class SuggestedResourceTest < ActiveSupport::TestCase
+    test 'fingerprints are generated automatically' do
+      resource = {
+        title: 'Our latest resource',
+        url: 'https://example.org',
+        phrase: 'Our latest resource'
+      }
+
+      new_resource = Detector::SuggestedResource.create(resource)
+
+      assert_equal 'latest our resource', new_resource.fingerprint
+    end
+
+    test 'fingerprints are recalculated on save' do
+      resource = detector_suggested_resources('jstor')
+      refute_equal 'a brand is new phrase this', resource.fingerprint
+
+      resource.phrase = 'This is a brand new phrase'
+      resource.save
+      resource.reload
+
+      assert_equal 'a brand is new phrase this', resource.fingerprint
+    end
+
+    test 'generating fingerprints does not alter the phrase' do
+      resource = detector_suggested_resources('jstor')
+      benchmark = 'This is an updated phrase! '
+
+      refute_equal benchmark, resource.phrase
+      resource.phrase = benchmark
+      resource.save
+      resource.reload
+
+      assert_equal benchmark, resource.phrase
+    end
+
+    test 'fingerprints strip extra spaces' do
+      resource = detector_suggested_resources('jstor')
+      refute_equal 'i need space', resource.fingerprint
+
+      resource.phrase = ' i need space '
+      resource.save
+      resource.reload
+
+      assert_equal 'i need space', resource.fingerprint
+    end
+
+    test 'fingerprints are coerced to lowercase' do
+      resource = detector_suggested_resources('jstor')
+      refute_equal 'ftw intercapping', resource.fingerprint
+
+      resource.phrase = 'InterCapping FTW'
+      resource.save
+      resource.reload
+
+      assert_equal 'ftw intercapping', resource.fingerprint
+    end
+
+    test 'fingerprints remove punctuation and symbols' do
+      resource = detector_suggested_resources('jstor')
+      refute_equal 'bullets perfect phrase punctuation quoted symbols', resource.fingerprint
+
+      resource.phrase = 'symbols™ + punctuation: * bullets! - "quoted phrase" (perfect) ¥€$'
+      resource.save
+      resource.reload
+
+      assert_equal 'bullets perfect phrase punctuation quoted symbols', resource.fingerprint
+    end
+
+    test 'fingerprints coerce characters to ASCII' do
+      resource = {
+        title: 'A wide range of characters',
+        url: 'https://example.org',
+        phrase: 'а а̀ а̂ а̄ ӓ б в г ґ д ђ ѓ е ѐ е̄ е̂ ё є ж з з́ ѕ и і ї ꙇ ѝ и̂ ӣ й ј к л љ м н њ о о̀ о̂ ō ӧ п р с с́'\
+                ' т ћ ќ у у̀ у̂ ӯ ў ӱ ф х ц ч џ ш щ ꙏ ъ ъ̀ ы ь ѣ э ю ю̀ я'
+      }
+
+      new_resource = Detector::SuggestedResource.create(resource)
+
+      assert_equal 'a b ch d dj dz dzh e f g gh gj i ia ie io iu j k kh kj l lj m n nj o p r s '\
+                   'sh shch t ts tsh u v y yi z zh', new_resource.fingerprint
+    end
+
+    test 'fingerprints remove repeated words' do
+      resource = detector_suggested_resources('jstor')
+      refute_equal 'double', resource.fingerprint
+
+      resource.phrase = 'double double'
+      resource.save
+      resource.reload
+
+      assert_equal 'double', resource.fingerprint
+    end
+
+    test 'fingerprints sort words alphabetically' do
+      resource = detector_suggested_resources('jstor')
+      refute_equal 'delta gamma', resource.fingerprint
+
+      resource.phrase = 'gamma delta'
+      resource.save
+      resource.reload
+
+      assert_equal 'delta gamma', resource.fingerprint
+    end
+  end
+end