Skip to content

Commit

Permalink
Merge pull request #127 from MITLibraries/bulk-detections
Browse files Browse the repository at this point in the history
Adds BulkChecker module for Detectors
  • Loading branch information
JPrevost authored Nov 13, 2024
2 parents 55ddb39 + f0c1568 commit 315a26a
Show file tree
Hide file tree
Showing 9 changed files with 107 additions and 0 deletions.
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ group :production do
end

group :development, :test do
gem 'awesome_print'

# See https://guides.rubyonrails.org/debugging_rails_applications.html#debugging-with-the-debug-gem
gem 'debug', platforms: %i[mri windows]

Expand Down
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ GEM
rake (>= 10.4, < 14.0)
ast (2.4.2)
attr_required (1.0.2)
awesome_print (1.9.2)
barnes (0.0.9)
multi_json (~> 1)
statsd-ruby (~> 1.1)
Expand Down Expand Up @@ -475,6 +476,7 @@ PLATFORMS
DEPENDENCIES
administrate (~> 0.20.1)
annotate
awesome_print
barnes
bootsnap
cancancan
Expand Down
37 changes: 37 additions & 0 deletions app/models/detector/bulk_checker.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# frozen_string_literal: true

class Detector
  # BulkChecker is expected to be added to Detectors via `extend Detector::BulkChecker` so that its methods are
  # available on the detector's singleton class (i.e. callable as `SomeDetector.check_all_matches`).
  # See also: `PatternChecker` for shared instance methods
  module BulkChecker
    # Runs the extending detector against every Term and collects the phrases for which it reports detections.
    # This method is intended to be used for inspecting detections during development.
    #
    # Assumptions include
    # - the Class extending this module implements a `detections` method (either via `attr_reader` or as a method)
    #   that is only populated for Terms in which it has made a detection
    # - the initialize method accepts a `phrase` as a string
    #
    # When running in the development environment the matches, the total number of Terms and the number of
    # matching Terms are written via `Rails.logger.ap` (presumably supplied by the awesome_print gem).
    #
    # @param output [Boolean] optional. Defaults to false as that is the more likely scenario useful in development
    #   as the logger output is often what is desired.
    # @return [Array<Array>, nil] an array of `[phrase, detections]` pairs when `output` is true; otherwise nil.
    def check_all_matches(output: false)
      matches = []

      # find_each is given a block so Terms are yielded one at a time rather than collected up front.
      Term.find_each do |term|
        detections = new(term.phrase).detections
        matches.push [term.phrase, detections] unless detections.blank?
      end

      if Rails.env.development?
        Rails.logger.ap matches

        Rails.logger.ap "Total Terms: #{Term.count}"
        # Every match is pushed exactly once, so the array size is the match count.
        Rails.logger.ap "Total Matches: #{matches.size}"
      end

      matches if output
    end
  end
end
11 changes: 11 additions & 0 deletions app/models/detector/citation.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ class Detector
class Citation
attr_reader :score, :subpatterns, :summary

# shared singleton methods
extend Detector::BulkChecker

# Citation patterns are regular expressions which attempt to identify structures that are part of many citations.
# This object is used as part of the pattern_checker method. Some of these patterns may get promoted to the Detector
# model if they prove useful beyond a Citation context.
Expand All @@ -29,6 +32,8 @@ class Citation

# The required score value is the threshold needed for a phrase to be officially recorded with a Detection via its
# associated Term.
# Hint: set this to 0 in development environments if you want to temporarily see all output
# of `.check_all_matches` rather than just the matches that met this threshold.
REQUIRED_SCORE = 6

# Summary thresholds are used by the calculate_score method. This class counts the number of occurrences of specific
Expand Down Expand Up @@ -69,6 +74,12 @@ def initialize(phrase)
@score = calculate_score
end

# Bundles this instance's summary, subpatterns and score into a single array for callers such as
# `check_all_matches`.
# NOTE(review): `detection?` is defined outside this view; presumably it compares the score to REQUIRED_SCORE.
# @return [Array, nil] `[summary, subpatterns, score]` when `detection?` is truthy; otherwise nil.
def detections
  [@summary, @subpatterns, @score] if detection?
end

# The record method first runs all of the parsers by running the initialize method. If the resulting score is higher
# than the REQUIRED_SCORE value, then a Detection is registered.
# @param term [Term]
Expand Down
3 changes: 3 additions & 0 deletions app/models/detector/lcsh.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ class Lcsh
# shared instance methods
include Detector::PatternChecker

# shared singleton methods
extend Detector::BulkChecker

# For now the initialize method just needs to run the pattern checker. A space for future development would be to
# write additional methods to look up the detected LCSH for more information, and to confirm that the phrase is
# actually an LCSH.
Expand Down
1 change: 1 addition & 0 deletions app/models/detector/pattern_checker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
class Detector
# PatternChecker is intended to be added to Detectors via `include Detector::PatternChecker` to make
# these methods available to instances of the class
# See also: `BulkChecker` for shared singleton methods
module PatternChecker
# pattern_checker iterates over all patterns defined in the calling object's `pattern` method.
#
Expand Down
3 changes: 3 additions & 0 deletions app/models/detector/standard_identifiers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ def self.table_name_prefix
# shared instance methods
include Detector::PatternChecker

# shared singleton methods
extend Detector::BulkChecker

# Initialization process will run pattern checkers and strip invalid ISSN detections.
# @param phrase String. Often a `Term.phrase`.
# @return Nothing intentional. Data is written to Hash `@detections` during processing.
Expand Down
33 changes: 33 additions & 0 deletions test/models/detector/bulk_checker_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# frozen_string_literal: true

require 'test_helper'

class Detector
  # Exercises `check_all_matches` (from Detector::BulkChecker) on each detector that extends the module.
  # Named after the module under test so results are not reported under Detector::CitationTest, which is
  # declared separately in citation_test.rb.
  # NOTE(review): the expected counts presumably reflect the Term fixtures — confirm whenever fixtures change.
  class BulkCheckerTest < ActiveSupport::TestCase
    test 'citation_bulk_checker' do
      bulk = Detector::Citation.check_all_matches(output: true)

      assert_equal(1, bulk.count)
    end

    test 'journal_bulk_checker' do
      skip 'Detector::Journal does not yet support bulk_checker'
    end

    test 'lcsh_bulk_checker' do
      bulk = Detector::Lcsh.check_all_matches(output: true)

      assert_equal(1, bulk.count)
    end

    test 'standard_identifier_bulk_checker' do
      bulk = Detector::StandardIdentifiers.check_all_matches(output: true)

      assert_equal(5, bulk.count)
    end

    test 'suggested_resources_bulk_checker' do
      skip 'Detector::SuggestedResources does not yet support bulk_checker'
    end
  end
end
15 changes: 15 additions & 0 deletions test/models/detector/citation_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -245,5 +245,20 @@ class CitationTest < ActiveSupport::TestCase
assert_equal detection_count + 1, Detection.count
end
end

# A phrase that scores 0 should expose no detections at all.
test 'detections returns nil when score is lower than configured' do
  detector = Detector::Citation.new('nothing here')

  assert_equal 0, detector.score
  assert_nil detector.detections
end

# The detections array is expected to hold summary, subpatterns and score, in that order.
test 'detections returns expected array when score is higher than configured' do
  citation = Detector::Citation.new(terms('citation').phrase)
  detected = citation.detections

  assert_equal citation.summary, detected[0]
  assert_equal citation.subpatterns, detected[1]
  assert_equal citation.score, detected[2]
end
end
end

0 comments on commit 315a26a

Please sign in to comment.