From 3fb34d6ab103f0c824c51b2932391b581708df1d Mon Sep 17 00:00:00 2001
From: Jeremy Prevost
Date: Fri, 19 Jul 2024 09:37:32 -0400
Subject: [PATCH 1/2] Removes filtering of SSN as it also catches ISSN

We output ISSN data regularly with this application, and because filter
parameters are partially matched, the `ssn` filter also redacts ISSN
values from our logs. As we never intend to use SSN data in this
application, it feels safe enough to remove the filter rather than
renaming ISSN to something less meaningful to avoid the filtering.
---
 config/initializers/filter_parameter_logging.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/initializers/filter_parameter_logging.rb b/config/initializers/filter_parameter_logging.rb
index c2d89e2..3ae771a 100644
--- a/config/initializers/filter_parameter_logging.rb
+++ b/config/initializers/filter_parameter_logging.rb
@@ -3,6 +3,6 @@
 # Configure parameters to be partially matched (e.g. passw matches password) and filtered from the log file.
 # Use this to limit dissemination of sensitive information.
 # See the ActiveSupport::ParameterFilter documentation for supported notations and behaviors.
-Rails.application.config.filter_parameters += [
-  :passw, :secret, :token, :_key, :crypt, :salt, :certificate, :otp, :ssn
+Rails.application.config.filter_parameters += %i[
+  passw secret token _key crypt salt certificate otp
 ]

From 9838367614a5b8e55bcce5ea30734cd9d2fd57c8 Mon Sep 17 00:00:00 2001
From: Jeremy Prevost
Date: Fri, 19 Jul 2024 09:37:39 -0400
Subject: [PATCH 2/2] Adds OpenAlex harvester and Journal loader

Why are these changes being introduced:

* Loading journals from an external source is how we'll internally
  detect journal name matches in this experimental detector.

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/TCO-41
* https://mitlibraries.atlassian.net/browse/TCO-48

How does this address that need:

* Adds a data harvester for the OpenAlex Sources API endpoint
* Adds a data loader that ingests the result of the OpenAlex harvest
  and can load from a local or remote file

Document any side effects to this change:

* Technically, more than Journals are being loaded by the current
  OpenAlex query configuration: we are pulling both Journals and Book
  Series. Journals still felt like a fine internal name for this, but
  we could also consider Serials to be more accurate.
* Remote file loading only handles URI hosted files. If we move the
  OpenAlex Harvester functionality into a data pipeline that exports
  directly into S3 in the future, we may want to add S3 support to the
  loader rather than exposing the S3 files via https (although the data
  is open, so there is no harm in exposing it directly via https and we
  could choose either; a rough sketch of S3 support follows the patch).
---
 lib/tasks/journals.rake | 116 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 lib/tasks/journals.rake

diff --git a/lib/tasks/journals.rake b/lib/tasks/journals.rake
new file mode 100644
index 0000000..75cd577
--- /dev/null
+++ b/lib/tasks/journals.rake
@@ -0,0 +1,116 @@
+# frozen_string_literal: true
+
+require 'open-uri'
+
+# Loaders can bulk load data
+namespace :journals do
+  # openalex_harvester uses the OpenAlex Sources API endpoint to harvest to a local JSON file
+  #
+  # @note Currently, we are limiting to just core Sources but not limiting to journals. We may want to consider
+  #   filtering to journals and not core to compare which is better at some point. This would be done by changing
+  #   the `base_url`. As of July 2024, there are 27722 `journal` and 2084 `book series` records in core, which
+  #   felt like a good size for initial experimentation.
+  #
+  # @note see https://docs.openalex.org/api-entities/sources for full API documentation
+  #
+  # @example
+  #   bin/rails journals:openalex_harvester['YOUR_EMAIL@example.com']
+  #
+  # @param email [String] email address OpenAlex can contact with any issues or concerns with our harvester.
+  #   In development, use your own. If used in production, use a team Moira list.
+  desc 'Harvest from OpenAlex'
+  task :openalex_harvester, %i[email] => :environment do |_task, args|
+    raise ArgumentError, 'Email is required' if args.email.blank?
+
+    base_url = 'https://api.openalex.org/sources?filter=is_core:true'
+    next_cursor = '*'
+    email = args.email
+    sleep_timer = 1 # value in seconds
+    per_page = 200 # max 200 per openalex api docs
+    filename = "tmp/openalex_core_#{DateTime.now.strftime('%Y_%m_%d')}.json"
+
+    f = File.open(filename, 'w')
+
+    # Set up the initial JSON structure in the file. This feels a bit clunky but works.
+    f.write('{')
+    f.write('"core":[')
+
+    records_processed = 0 # purely for informational output
+
+    until next_cursor.nil?
+      # The OpenAlex API expects the hyphenated `per-page` parameter name.
+      next_url = "#{base_url}&cursor=#{next_cursor}&per-page=#{per_page}&mailto=#{email}"
+
+      Rails.logger.info("Next url request #{next_url}")
+      resp = HTTP.headers(accept: 'application/json').get(next_url)
+
+      json = resp.parse
+
+      next_cursor = json['meta']['next_cursor']
+      total_records = json['meta']['count']
+
+      json['results'].each do |item|
+        records_processed += 1
+        record = {
+          title: item['display_name'],
+          issns: item['issn'],
+          publisher: item['host_organization_name'],
+          alternate_titles: item['alternate_titles'],
+          abbreviated_title: item['abbreviated_title'],
+          type: item['type']
+        }
+        f.write(JSON.pretty_generate(record))
+        f.write(',') unless records_processed == total_records # skips the final comma
+      end
+
+      pp "Processed #{records_processed} of #{total_records}"
+
+      sleep(sleep_timer)
+    end
+
+    # close the JSON structure in the file
+    f.write(']')
+    f.write('}')
+    f.close
+  end
+
+  # openalex_loader can bulk load Journal information
+  #
+  # A file to load can be generated by running the `openalex_harvester` task
+  #
+  # @example
+  #   bin/rails journals:openalex_loader['local_path_to_file.json']
+  #
+  # @example
+  #   bin/rails journals:openalex_loader['https://SERVER/remote_path_to_file.json']
+  #
+  # @param file [String] local file path or URI of a JSON file to load
+  desc 'Load from OpenAlex harvest'
+  task :openalex_loader, %i[file] => :environment do |_task, args|
+    raise ArgumentError, 'File is required' if args.file.blank?
+
+    # Does the file argument look like a local path or a URI?
+    if URI(args.file).scheme
+      Rails.logger.info("Loading data from remote file #{args.file}")
+      data = URI.open(args.file, 'rb', &:read)
+    else
+      Rails.logger.info("Loading data from local file #{args.file}")
+      data = File.read(args.file)
+    end
+
+    # Delete all journals. We do this to simplify the loader and avoid consideration of updates/deletes.
+    Detector::Journal.delete_all
+
+    # Not ideal: we should consider streaming the file rather than loading it fully into memory.
+    json = JSON.parse(data)
+
+    json['core'].each do |journal|
+      Detector::Journal.create(name: journal['title'],
+                               additional_info: { issns: journal['issns'],
+                                                  publisher: journal['publisher'],
+                                                  alternate_titles: journal['alternate_titles'],
+                                                  type: journal['type'],
+                                                  abbreviated_title: journal['abbreviated_title'] })
+    end
+  end
+end
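
A rough sketch of the S3 loader support mentioned in the side effects
above; this is not part of the patch, and all names here are hypothetical.
It assumes the aws-sdk-s3 gem is in the Gemfile and AWS credentials are
available in the environment, and it treats the bucket as the URI host and
the object key as the URI path, which is our own convention:

    # Sketch only: assumes aws-sdk-s3 is in the Gemfile and ambient AWS
    # credentials. `fetch_journal_data` is a hypothetical helper that could
    # replace the if/else branch in the openalex_loader task, adding an
    # s3:// case alongside http(s) and local paths.
    require 'aws-sdk-s3'
    require 'open-uri'

    def fetch_journal_data(location)
      uri = URI(location)
      case uri.scheme
      when 's3'
        # e.g. s3://my-bucket/openalex_core_2024_07_19.json
        Aws::S3::Client.new
                       .get_object(bucket: uri.host, key: uri.path.delete_prefix('/'))
                       .body.read
      when 'http', 'https'
        URI.open(location, 'rb', &:read)
      else
        File.read(location)
      end
    end

With a helper like this, the loader body would reduce to something like
`data = fetch_journal_data(args.file)` regardless of where the harvest
output lives.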