diff --git a/.yardopts b/.yardopts index 65a9e3d..a04dc0d 100644 --- a/.yardopts +++ b/.yardopts @@ -1 +1 @@ ---no-private --protected app/**/*.rb - docs/**/*.md +--private --protected app/**/*.rb - docs/**/*.md diff --git a/app/models/detector/standard_identifiers.rb b/app/models/detector/standard_identifiers.rb index 624e0f2..662b3a6 100644 --- a/app/models/detector/standard_identifiers.rb +++ b/app/models/detector/standard_identifiers.rb @@ -17,12 +17,15 @@ def self.table_name_prefix extend Detector::BulkChecker # Initialization process will run pattern checkers and strip invalid ISSN detections. - # @param phrase String. Often a `Term.phrase`. - # @return Nothing intentional. Data is written to Hash `@detections` during processing. + # + # @param phrase String. Often a `Term.phrase`. + # @return nil. Data is written to Hash `@detections` during processing. Things technically get + # returned here but it is a side effect and should not be relied on. def initialize(phrase) @detections = {} pattern_checker(phrase) strip_invalid_issns + strip_invalid_isbns end # The record method will consult the set of regex-based detectors that are defined in @@ -53,13 +56,76 @@ def self.record(term) def patterns { barcode: /^39080[0-9]{9}$/, - isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/, + isbn: /\b(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/, issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/, pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/, doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b} } end + # strip_invalid_isbns coordinates the logic to remove ISBNs that are not valid from our list of detected ISBNs + # + # ISBNs cannot be validated via regex. Regex gives us a list of candidates that look like ISBNs. We remove invalid + # ISBNs by following validation specifications defined in the standard. + def strip_invalid_isbns + return unless @detections[:isbn] + + @detections.delete(:isbn) unless valid_isbn?(@detections[:isbn]) + end + + # valid_isbn? checks for 10 or 13 digit ISBNs and defers to appropriate methods for each + # + # @param candidate String. A string representation of a regex detected ISBN. + # @return boolean + def valid_isbn?(candidate) + digits = candidate.delete('-').chars + + # check 10 digit + if digits.length == 10 + valid_isbn_10?(digits) + # check 13 digit + elsif digits.length == 13 + valid_isbn_13?(digits) + # This shouldn't happen, log an error. + else + Rails.logger.error("Non-10 or 13 digit sequence detected as ISBN: #{candidate}") + Sentry.capture_message('Non-10 or 13 digit sequence detected as ISBN') + false + end + end + + # valid_isbn_10? follows the ISBN 10 specification for validation + # https://en.wikipedia.org/wiki/ISBN#ISBN-10_check_digits + # + # @param digits Array. An array of strings representing each character from a detected ISBN candidate. + # @return boolean + def valid_isbn_10?(digits) + sum = 0 + digits.each_with_index do |digit, index| + digit = '10' if digit.casecmp('x').zero? + sum += digit.to_i * (10 - index) + end + (sum % 11).zero? + end + + # valid_isbn_13? follows the ISBN 13 specification for validation + # https://en.wikipedia.org/wiki/ISBN#ISBN-13_check_digit_calculation + # + # @param digits Array. An array of strings representing each character from a detected ISBN candidate. + # @return boolean + def valid_isbn_13?(digits) + sum = 0 + digits.map(&:to_i).each_with_index do |digit, index| + sum += digit * (index.even? ? 1 : 3) + end + + (sum % 10).zero? + end + + # strip_invalid_issns coordinates the logic to remove ISSNs that are not valid from our list of detected ISSNs + # + # ISSNs cannot be validated via regex. Regex gives us a list of candidates that look like ISSNs. We remove invalid + # ISSNs by following validation specifications defined in the standard. def strip_invalid_issns return unless @detections[:issn] diff --git a/test/models/detector/standard_identifiers_test.rb b/test/models/detector/standard_identifiers_test.rb index 20e36bc..22412a7 100644 --- a/test/models/detector/standard_identifiers_test.rb +++ b/test/models/detector/standard_identifiers_test.rb @@ -23,10 +23,24 @@ class StandardIdentifiersTest < ActiveSupport::TestCase end end + test 'ISBN-10 examples with incorrect check digits are not detected' do + # from wikipedia + samples = ['99921-58-10-1', '9971-5-0210-1', '960-425-059-1', '80-902734-1-1', '85-359-0277-1', + '1-84356-028-1', '0-684-84328-1', '0-8044-2957-1', '0-85131-041-1', '93-86954-21-1', '0-943396-04-1', + '0-9752298-0-1'] + + samples.each do |isbn| + actual = Detector::StandardIdentifiers.new(isbn).detections + + assert_nil(actual[:isbn]) + end + end + test 'ISBN-13 examples' do - samples = ['978-99921-58-10-7', '979-9971-5-0210-0', '978-960-425-059-0', '979-80-902734-1-6', '978-85-359-0277-5', - '979-1-84356-028-3', '978-0-684-84328-5', '979-0-8044-2957-X', '978-0-85131-041-9', '979-93-86954-21-4', - '978-0-943396-04-2', '979-0-9752298-0-X'] + samples = ['978-99921-58-10-4', '978-9971-5-0210-2', '978-960-425-059-2', '978-80-902734-1-2', + '978-85-359-0277-8', '978-1-84356-028-9', '978-0-684-84328-5', '978-0-8044-2957-3', + '978-0-85131-041-1', '978-93-86954-21-3', '978-0-943396-04-0', '978-0-9752298-0-4', '9798531132178', + '9798577456832', '979-8-886-45174-0', '9781319145446'] samples.each do |isbn| actual = Detector::StandardIdentifiers.new(isbn).detections @@ -35,8 +49,21 @@ class StandardIdentifiersTest < ActiveSupport::TestCase end end + test 'ISBN-13 examples with incorrect check digits are not detected' do + samples = ['978-99921-58-10-1', '978-9971-5-0210-1', '978-960-425-059-1', '978-80-902734-1-1', + '978-85-359-0277-1', '978-1-84356-028-1', '978-0-684-84328-1', '978-0-8044-2957-1', + '978-0-85131-041-2', '978-93-86954-21-1', '978-0-943396-04-1', '978-0-9752298-0-1', '9798531132171', + '9798577456831', '979-8-886-45174-1', '9781319145441'] + + samples.each do |isbn| + actual = Detector::StandardIdentifiers.new(isbn).detections + + assert_nil(actual[:isbn]) + end + end + test 'not ISBNs' do - samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here'] + samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here', '99921-58-10-1', '979-8-886-45174-1'] samples.each do |notisbn| actual = Detector::StandardIdentifiers.new(notisbn).detections