diff --git a/app/models/detector.rb b/app/models/detector.rb
new file mode 100644
index 0000000..5694035
--- /dev/null
+++ b/app/models/detector.rb
@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+
+# Detectors are classes that implement various algorithms that allow us to identify patterns
+# within search terms.
+module Detector
+  # Table name prefix for models namespaced under Detector (e.g. Detector::Journal => `detector_journals`).
+  def self.table_name_prefix
+    'detector_'
+  end
+end
diff --git a/app/models/detector/journal.rb b/app/models/detector/journal.rb
new file mode 100644
index 0000000..61962ae
--- /dev/null
+++ b/app/models/detector/journal.rb
@@ -0,0 +1,67 @@
+# frozen_string_literal: true
+
+# == Schema Information
+#
+# Table name: detector_journals
+#
+#  id              :integer          not null, primary key
+#  name            :string
+#  additional_info :json
+#  created_at      :datetime         not null
+#  updated_at      :datetime         not null
+#
+module Detector
+  # Detector::Journal stores information about academic journals loaded from external sources to allow us to check our
+  # incoming Terms against this information.
+  class Journal < ApplicationRecord
+    before_save :downcase_fields!
+
+    # Identify journals in which the incoming phrase matches a Journal.name exactly
+    #
+    # @note We always store the Journal.name downcased, so we should also always downcase the phrase
+    #   when matching
+    #
+    # @note In reality, multiple Journals can exist with the same name. Therefore, we don't enforce
+    #   unique names and don't expect a single Journal to be returned.
+    #
+    # @param phrase [String, nil] A string representation of a search term (not an actual Term object!)
+    #
+    # @return [ActiveRecord::Relation<Detector::Journal>] Journals whose name equals the downcased phrase;
+    #   an empty relation when the phrase is nil or blank.
+    def self.full_term_match(phrase)
+      return none if phrase.blank?
+
+      where(name: phrase.downcase)
+    end
+
+    # Identify journals in which the incoming phrase contains one or more Journal names
+    #
+    # @note This likely won't scale well and may not be suitable for live detection as it loads every Journal
+    #   record that has a name.
+    #
+    # @param phrase [String, nil] A string representation of a search term (not an actual Term object!)
+    #
+    # @return [Array<Detector::Journal>] Journals whose name appears within the downcased phrase;
+    #   an empty Array when the phrase is nil or blank.
+    def self.partial_term_match(phrase)
+      return [] if phrase.blank?
+
+      # Downcase once up front rather than once per Journal record.
+      normalized_phrase = phrase.downcase
+
+      # `name` is a nullable column: a nil name would raise in String#include? and an empty name is a
+      # substring of every phrase, so neither is loaded as a candidate.
+      where.not(name: [nil, '']).to_a.select { |journal| normalized_phrase.include?(journal.name) }
+    end
+
+    private
+
+    # Downcasing all names before saving allows for more efficient matching by ensuring our index is lowercase.
+    # If we find we need the non-lowercase Journal name in the future, we could store that as `additional_info` json
+    #
+    # `name` is a nullable column, so a Journal without a name is saved as-is instead of raising NoMethodError.
+    def downcase_fields!
+      self.name = name&.downcase
+    end
+  end
+end
diff --git a/db/migrate/20240701205444_create_detector_journals.rb b/db/migrate/20240701205444_create_detector_journals.rb
new file mode 100644
index 0000000..607d4d1
--- /dev/null
+++ b/db/migrate/20240701205444_create_detector_journals.rb
@@ -0,0 +1,12 @@
+# Creates the detector_journals table backing Detector::Journal, with a non-unique index on name.
+class CreateDetectorJournals < ActiveRecord::Migration[7.1]
+  def change
+    create_table :detector_journals do |t|
+      t.string :name
+      t.json :additional_info
+
+      t.timestamps
+    end
+    add_index :detector_journals, :name
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 384ff92..3acaf32 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -10,7 +10,15 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema[7.1].define(version: 2024_06_21_132136) do
+ActiveRecord::Schema[7.1].define(version: 2024_07_01_205444) do
+  create_table "detector_journals", force: :cascade do |t|
+    t.string "name"
+    t.json "additional_info"
+    t.datetime "created_at", null: false
+    t.datetime "updated_at", null: false
+    t.index ["name"], name: "index_detector_journals_on_name"
+  end
+
   create_table "metrics_algorithms", force: :cascade do |t|
     t.date "month"
     t.integer "doi"
diff --git a/test/fixtures/detector/journals.yml b/test/fixtures/detector/journals.yml
new file mode 100644
index 0000000..edec9e2
--- /dev/null
+++ b/test/fixtures/detector/journals.yml
@@ -0,0 +1,29 @@
+# == Schema Information
+#
+# Table name: detector_journals
+#
+#  id              :integer          not null, primary key
+#  name            :string
+#  additional_info :json
+#  created_at      :datetime         not null
+#  updated_at      :datetime         not null
+#
+
+# Note: fixtures bypass ActiveRecord callbacks so while our model auto downcases titles,
+# these fixtures will be stored mixed case unless they are all manually downcased here.
+# Put another way, please make sure to always use downcase/lowercase for the 'name' in these fixtures
+# to properly match the real behavior of the application.
+nature: {
+  name: nature,
+  additional_info: {issns: ['0028-0836', '1476-4687']}
+}
+
+the_new_england_journal_of_medicine: {
+  name: the new england journal of medicine,
+  additional_info: {issns: ['0028-4793', '1533-4406']}
+}
+
+nature_medicine: {
+  name: nature medicine,
+  additional_info: {issns: ['1078-8956', '1546-170X']}
+}
diff --git a/test/models/detector/journal_test.rb b/test/models/detector/journal_test.rb
new file mode 100644
index 0000000..cd68655
--- /dev/null
+++ b/test/models/detector/journal_test.rb
@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+
+# == Schema Information
+#
+# Table name: detector_journals
+#
+#  id              :integer          not null, primary key
+#  name            :string
+#  additional_info :json
+#  created_at      :datetime         not null
+#  updated_at      :datetime         not null
+#
+require 'test_helper'
+
+module Detector
+  class JournalTest < ActiveSupport::TestCase
+    test 'exact term match on journal name' do
+      expected = detector_journals('the_new_england_journal_of_medicine')
+      actual = Detector::Journal.full_term_match('the new england journal of medicine')
+
+      assert_equal 1, actual.count
+      assert_equal(expected, actual.first)
+    end
+
+    test 'mixed case exact term match on journal name' do
+      expected = detector_journals('the_new_england_journal_of_medicine')
+      actual = Detector::Journal.full_term_match('The New England Journal of Medicine')
+
+      assert_equal 1, actual.count
+      assert_equal(expected, actual.first)
+    end
+
+    test 'exact match within longer term returns no matches' do
+      actual = Detector::Journal.full_term_match('The New England Journal of Medicine, 1999')
+      assert_equal 0, actual.count
+    end
+
+    test 'phrase match within longer term returns matches' do
+      actual = Detector::Journal.partial_term_match('words and stuff The New England Journal of Medicine, 1999')
+      assert_equal 1, actual.count
+    end
+
+    test 'multiple matches can happen with phrase matching within longer terms' do
+      actual = Detector::Journal.partial_term_match('words and stuff Nature medicine, 1999')
+      assert_equal 2, actual.count
+    end
+
+    test 'nil and blank phrases return no matches' do
+      assert_empty Detector::Journal.full_term_match(nil)
+      assert_empty Detector::Journal.full_term_match('')
+      assert_empty Detector::Journal.partial_term_match(nil)
+      assert_empty Detector::Journal.partial_term_match('')
+    end
+
+    test 'journal names are downcased on save' do
+      journal = Detector::Journal.create!(name: 'The Lancet')
+
+      assert_equal 'the lancet', journal.name
+    end
+
+    test 'journals without a usable name can be saved and are ignored by partial matching' do
+      Detector::Journal.create!(name: nil)
+      Detector::Journal.create!(name: '')
+
+      actual = Detector::Journal.partial_term_match('words and stuff Nature medicine, 1999')
+      assert_equal 2, actual.count
+    end
+  end
+end