diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..c6d9ae0 --- /dev/null +++ b/.env.example @@ -0,0 +1,11 @@ +# only let through requests that Google gives a score of > this score (between 0 and 1), +# where 1.0 is very likely a good interaction, 0.0 is very likely a bot; +# If the initial request fails to pass, our code (currently) falls back to a v2 reCAPTCHA challenge. +# See https://developers.google.com/recaptcha/docs/v3 +RECAPTCHA_MINIMUM_SCORE=0.5 +# v3 +RECAPTCHA_SITE_KEY_V3='your_recaptcha_v3_site_key' +RECAPTCHA_SECRET_KEY_V3='your_recaptcha_v3_secret_key' +# v2 -- used for fallback if v3 verification fails +RECAPTCHA_SITE_KEY='your_recaptcha_v2_site_key' +RECAPTCHA_SECRET_KEY='your_recaptcha_v2_secret_key' diff --git a/.gitignore b/.gitignore index ee2a4f9..88c098d 100644 --- a/.gitignore +++ b/.gitignore @@ -71,6 +71,9 @@ config/skylight.yml config/settings.local.yml config/settings/*.local.yml .secrets.sh +.env +# other things to ignore +vendor/* import/* *.swp diff --git a/Gemfile b/Gemfile index 4ba1b78..86eb9ce 100644 --- a/Gemfile +++ b/Gemfile @@ -136,3 +136,6 @@ group :development, :test do end gem 'willow_sword', github: 'notch8/willow_sword' + +gem 'dotenv-rails' +gem 'recaptcha' diff --git a/Gemfile.lock b/Gemfile.lock index 739ee3e..d498927 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -229,6 +229,10 @@ GEM devise diff-lcs (1.5.1) docopt (0.5.0) + dotenv (2.8.1) + dotenv-rails (2.8.1) + dotenv (= 2.8.1) + railties (>= 3.2) down (4.8.1) addressable (~> 2.5) draper (4.0.2) @@ -781,6 +785,7 @@ GEM rdf-xsd (3.2.1) rdf (~> 3.2) rexml (~> 3.2) + recaptcha (5.16.0) redic (1.5.3) hiredis redis (4.8.1) @@ -1048,6 +1053,7 @@ DEPENDENCIES database_cleaner devise devise-guests (~> 0.6) + dotenv-rails down (~> 4.4) edtf ezid-client @@ -1073,6 +1079,7 @@ DEPENDENCIES rack (~> 2.2.6) rails (~> 5.1.6) rails-controller-testing + recaptcha redis (~> 4.0) resque resque-pool diff --git a/app/controllers/archive_controller.rb 
b/app/controllers/archive_controller.rb index a051403..05c9d7d 100644 --- a/app/controllers/archive_controller.rb +++ b/app/controllers/archive_controller.rb @@ -5,7 +5,7 @@ class ArchiveController < ApplicationController def user_is_authorized? set_variables - true # satisfy open access requirement + authenticated_user? && recaptcha_success? end def status @@ -18,10 +18,9 @@ def status def download_request if user_is_authorized? - result = @archive_file.get! + result = @archive_file.get!(request_metadata) if result[:file_path].present? send_file(result[:file_path], filename: result[:filename]) - @archive_file.downloaded! else unless result[:message] Rails.logger.error("Message missing from #{@archive_file} result: #{result}") @@ -34,13 +33,14 @@ def download_request end end else - redirect_back fallback_location: root_url, alert: 'Action unavailable' + @archive_file.log_denied_attempt!(request_metadata, update_only: true) + redirect_back fallback_location: root_url, alert: @failure_description end end private def variable_params - params.permit(:collection, :object, :format, :request) + params.permit(:collection, :object, :format, :request, 'g-recaptcha-response'.to_sym, 'g-recaptcha-response-data'.to_sym => [:sda_request]) end def set_variables @@ -48,4 +48,24 @@ def set_variables @object = "#{variable_params[:object]}.#{variable_params[:format]}" @archive_file = ArchiveFile.new(collection: @collection, object: @object) end + + def authenticated_user? + return true unless Settings.archive_api.require_user_authentication + @failure_description = 'Action available only to signed-in users.' + user_signed_in? + end + + def recaptcha_success? + return true unless Settings.recaptcha.use? + v3_success = verify_recaptcha(action: 'sda_request', minimum_score: Settings.recaptcha.minimum_score.to_f, secret_key: Settings.recaptcha.v3.secret_key) + v2_success = verify_recaptcha unless v3_success + @failure_description = 'Action requires successful recaptcha completion.' 
+ v3_success || v2_success + end + + def request_metadata + user_metadata = { time: Time.now, user: current_user&.email } + user_metadata.merge!(recaptcha: recaptcha_reply || {}) if Settings.recaptcha.use? + user_metadata + end end diff --git a/app/models/archive_file.rb b/app/models/archive_file.rb index 46508c8..867ec86 100644 --- a/app/models/archive_file.rb +++ b/app/models/archive_file.rb @@ -28,92 +28,65 @@ def status end end - def display_status - display_message_for(status) + def description_for_status(method:, lookup_status:, lookup_hash:) + Rails.logger.error("##{method} called with invalid key: #{lookup_status}") unless lookup_status.in?(lookup_hash.keys) + lookup_hash[lookup_status] end - # a single archive_status can map to more than one #status - # multiple #status values map to the same end user message - def display_messages - @display_messages ||= begin - available = 'File found in archives but not yet staged for download. Attempt file download to initiate transfer from archives.' - requested = 'File transfer from archives has started. Please allow up to 1 hour for transfer to complete, then re-attempt download.' - { staging_available: available, # refined 503/unstaged status - staging_requested: requested, # refined 503/unstaged status - staged_after_request: requested, # refined 200/staged status -- don't consider available for download until "local" status, copied from SDA cache to scratch - staged_without_request: available, # refined 200/staged status -- requires user request to start downloading workflow - local: 'File is available for immediate download', - not_found: 'File not found in archives', - no_response: 'File archives server is not responding', - unexpected: 'Unexpected response from file archives server', - too_many_requests: 'File is available in archives, but too many transfer requests are running. Please try again later.' 
} - end - end - - def display_message_for(current_status) - Rails.logger.error("#display_message_for called with invalid key: #{current_status}") unless current_status.in?(display_messages.keys) - display_messages[current_status] - end - - def request_action - request_action_for(status) - end - - def request_action_for(current_status) - request_actions[current_status] + # used in descriptive fields, above action button + def display_status(current_status = status) + description_for_status(method: :display_status, lookup_status: current_status, lookup_hash: Settings.archive_api.status_messages.to_hash.with_indifferent_access) end - def request_actions - @request_actions ||= begin - available = 'Request file from archives' - requested = 'File transfer from archives has started' - { staging_available: available, # refined 503/unstaged status - staging_requested: requested, # refined 503/unstaged status - staged_after_request: requested, # refined 200/staged status -- don't consider available for download until "local" status, copied from SDA cache to scratch - staged_without_request: available, # refined 200/staged status -- requires user request to start downloading workflow - local: 'Download', - not_found: 'File not found in archives', - no_response: 'File archives server is not responding', - unexpected: 'Unexpected response from file archives server', - too_many_requests: 'File is available in archives, but too many transfer requests are running. Please try again later.' } - end + # used for button text + def request_action(current_status = status) + description_for_status(method: :request_action, lookup_status: current_status, lookup_hash: Settings.archive_api.request_actions.to_hash.with_indifferent_access) end def request_actionable?(request_status = status) request_status.in? 
[:staging_available, :staged_without_request, :local] end + # used for :notice and :alert messages in controller flash + def flash_message(current_status = status) + description_for_status(method: :flash_message, lookup_status: current_status, lookup_hash: Settings.archive_api.flash_messages.to_hash.with_indifferent_access) + end + # requests staging (if available and not requested yet) # returns describing status, action taken (if any), and descriptive message # @return Hash - def get! + def get!(request_hash = {}) current_status = status + request_hash.merge!({ status: current_status }) case current_status when :local - { status: current_status, action: nil, file_path: local_path, filename: local_filename, message: display_message_for(current_status) } + create_or_update_job_file!({ latest_user_download: Time.now, downloads: [request_hash] }) + { status: current_status, action: nil, file_path: local_path, filename: local_filename, message: display_status(current_status) } when :staging_available, :staged_without_request - stage_request!(current_status) + stage_request!(request_hash) when :staging_requested, :staged_after_request # no action -- wait for DownloadArchivalFilesTask to stage and download - { status: current_status, action: nil, message: display_message_for(:staging_requested) } + create_or_update_job_file!({ requests: [request_hash] }) + { status: current_status, action: nil, message: display_status(:staging_requested) } when :not_found, :no_response, :unexpected - { status: current_status, action: nil, message: display_message_for(current_status) } + create_or_update_job_file!({ requests: [request_hash] }) + { status: current_status, action: nil, message: display_status(current_status) } else Rails.logger.warn("Unexpected archive file status: #{current_status}") + create_or_update_job_file!({ requests: [request_hash] }) { status: current_status, action: nil, message: 'Unknown file status' } end end + def log_denied_attempt!(request_hash = {}, update_only: 
false) + create_or_update_job_file!({ denials: [request_hash] }, update_only: update_only) + end + # bypasses status in job file via checking directly def downloaded? File.exist?(local_path) end - # called by ArchiveController after successful user download - def downloaded! - create_or_update_job_file!({ user_downloaded: Time.now }) - end - def staged? archive_status.in? [:staged_without_request, :staged_after_request] end @@ -225,13 +198,14 @@ def archive_request(method: Net::HTTP::Head) # if not yet staged: requests for staging (if possible) # @return Hash - def stage_request!(current_status) + def stage_request!(request_hash = {}) Rails.logger.warn("Staging request for #{archive_url} made in status: #{status}") if staged? # log :staged_without_request cases if block_new_jobs? - { status: current_status, action: :throttled, message: display_message_for(:too_many_requests), alert: true } + log_denied_attempt!(request_hash.merge({ reason: 'block_new_jobs' })) # FIXME: update_only false or true here? + { status: request_hash[:status], action: :throttled, message: display_status(:too_many_requests), alert: true } else - create_or_update_job_file! - { status: current_status, action: :create_or_update_job_file!, message: display_message_for(:staging_requested) } + create_or_update_job_file!({ requests: [request_hash.merge({ action: 'create_or_update_job_file!'})] }) + { status: request_hash[:status], action: :create_or_update_job_file!, message: display_status(:staging_requested) } end end @@ -241,35 +215,37 @@ def job_file_path # @return nil, Symbol [:staging_available, :staging_requested, :staged_after_request, :local] def job_status - return unless job_file? - current_job_parameters[:status] + archive_file_worker&.job_status end def job_file? File.exist?(job_file_path) end - def current_job_parameters - return {} unless job_file? 
- YAML.load_file(job_file_path) + # avoid memoization for current results + def archive_file_worker + @archive_worker ||= begin + return unless job_file? + ArchiveFileWorker.new(job_file_path, logger: Rails.logger) + end end def default_job_parameters { url: archive_url, filename: local_filename, file_path: local_path, collection: collection, object: object, status: status, created_at: Time.now } end - def create_or_update_job_file!(new_params = nil) + def create_or_update_job_file!(new_params = nil, update_only: false) if job_file? unless new_params # only update an existing file with new, non-default job parameters Rails.logger.warn("Ignoring duplicate call to create default job parameters file for #{archive_url}") return end - new_params = current_job_parameters.merge(new_params) - else + archive_file_worker.update_job_yaml(new_params) + elsif !update_only new_params ||= {} new_params = default_job_parameters.merge(new_params) + new_params = new_params.merge(updated_at: Time.now) + File.write(job_file_path, new_params.to_yaml) end - new_params = new_params.merge(updated_at: Time.now) - File.write(job_file_path, new_params.to_yaml) end end diff --git a/app/models/archive_file_worker.rb b/app/models/archive_file_worker.rb index bfde737..113472d 100644 --- a/app/models/archive_file_worker.rb +++ b/app/models/archive_file_worker.rb @@ -14,6 +14,11 @@ def job_yaml YAML.load_file(yaml_path) end + # @return Symbol [:staging_available, :staging_requested, :staged_after_request, :local] + def job_status + job_yaml[:status] + end + # memoization is okay here -- collection, object values are stable def archive_file @archive_file ||= ::ArchiveFile.new(collection: job_yaml[:collection], object: job_yaml[:object]) @@ -34,17 +39,19 @@ def self.too_many_jobs? job_files.select { |job_file| YAML.load_file(job_file)[:status].in?(max_settings[:statuses]) }.size >= max_settings[:limit] end.any? end + delegate :too_many_jobs?, to: :class def self.too_much_space_used? 
- file_paths = ArchiveFileWorker.job_files.map { |job_file| YAML.load_file(job_file)[:file_path] } + file_paths = job_files.map { |job_file| YAML.load_file(job_file)[:file_path] } size_used = file_paths.map { |path| (File.size(path) if File.file?(path)).to_i }.sum - size_used > Settings.archive_api.maximum_disk_space + size_used >= Settings.archive_api.maximum_disk_space end delegate :too_much_space_used?, to: :class def self.block_new_jobs? too_many_jobs? || too_much_space_used? end + delegate :block_new_jobs?, to: :class def process_file # if the file is not currently open by another process @@ -62,7 +69,7 @@ def process_file when :staging_requested stage_file # TODO: reconsider? when :staged_after_request, :staged_without_request - download_file + file else process_error("unexpected file status: #{current_status}") end @@ -99,17 +106,17 @@ def stage_file logger.info("Staging request submitted") end - def download_file + def file logger.info("Download initiated for #{yaml_path}") update_job_yaml({ status: :staged_after_request }) if too_much_space_used? logger.warn("Disk quota exceeded. 
Blocking file download until space is available.") else - update_job_yaml({ download_started: Time.now }) + update_job_yaml({ transfer_started: Time.now }) system(curl_command(output: true)) - FileUtils.mv(download_path, file_path) - update_job_yaml({ status: :local, download_completed: Time.now }) + FileUtils.mv(path, file_path) + update_job_yaml({ status: :local, transfer_completed: Time.now }) logger.info("Download completed at #{file_path}") end end @@ -118,14 +125,14 @@ def file_path job_yaml[:file_path] end - def download_path + def path file_path = '.datacore.yml' end def curl_command(output: false) header = "Authorization: #{Settings.archive_api.username}:#{Settings.archive_api.password}" if output - "curl -H '#{header}' #{job_yaml[:url]} --output #{download_path}" + "curl -H '#{header}' #{job_yaml[:url]} --output #{path}" else "curl -H '#{header}' #{job_yaml[:url]}" end @@ -145,8 +152,8 @@ def clean_local_file end def delete_file? - return false unless job_yaml[:user_downloaded] || job_yaml[:download_completed] - return true if job_yaml[:user_downloaded] && ((Time.now - job_yaml[:user_downloaded]).to_i > TIMEOUT_AFTER_DOWNLOAD.to_i) - return true if job_yaml[:download_completed] && ((Time.now - job_yaml[:download_completed]).to_i > TIMEOUT_BEFORE_DOWNLOAD.to_i) + return false unless job_yaml[:latest_user_download] || job_yaml[:transfer_completed] + return true if job_yaml[:latest_user_download] && ((Time.now - job_yaml[:latest_user_download]).to_i > TIMEOUT_AFTER_DOWNLOAD.to_i) + return true if job_yaml[:transfer_completed] && ((Time.now - job_yaml[:transfer_completed]).to_i > TIMEOUT_BEFORE_DOWNLOAD.to_i) end end diff --git a/app/presenters/concerns/datacore/presents_archive_file.rb b/app/presenters/concerns/datacore/presents_archive_file.rb new file mode 100644 index 0000000..79e2aa6 --- /dev/null +++ b/app/presenters/concerns/datacore/presents_archive_file.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +module Datacore + module PresentsArchiveFile 
+ + # archive files bypass fedora storage + def archive_file? + mime_type.match(/^message\/external-body\;.*access-type=URL/).present? + end + + def archive_request_url + return '/' unless archive_file? + mime_type.split('"').last + end + + def archive_status_url + archive_request_url.sub('/request/', '/status/') + end + + def archive_file + @archive_file ||= + if archive_file? + # nested objects should be stored in format + # /sda/request/collection/subdir%2Ffilename + # but normalize if unencoded / slipped in + collection_and_object = mime_type.split('"').last.sub('/sda/request/', '').split('/') + collection = collection_and_object.first + object = collection_and_object[1, collection_and_object.size].join('%2F') + ArchiveFile.new(collection: collection, object: object) + end + end + delegate :status, :request_action, :request_actionable?, to: :archive_file, allow_nil: true + alias_method :archive_status, :status + end +end diff --git a/app/presenters/hyrax/ds_file_set_presenter.rb b/app/presenters/hyrax/ds_file_set_presenter.rb index e427127..4370a83 100644 --- a/app/presenters/hyrax/ds_file_set_presenter.rb +++ b/app/presenters/hyrax/ds_file_set_presenter.rb @@ -1,8 +1,8 @@ # frozen_string_literal: true module Hyrax - class DsFileSetPresenter < Hyrax::FileSetPresenter + include ::Datacore::PresentsArchiveFile delegate :doi, :doi_the_correct_one, :doi_minted?, @@ -28,40 +28,6 @@ class DsFileSetPresenter < Hyrax::FileSetPresenter # @solr_document[ Solrizer.solr_name( 'doi', :symbol ) ].first == ::Deepblue::DoiBehavior::DOI_PENDING # end - # archive files bypass fedora storage - def archive_file? - mime_type.match(/^message\/external-body\;.*access-type=URL/).present? - end - - def archive_request_url - return '/' unless archive_file? - mime_type.split('"').last - end - - def archive_status_url - archive_request_url.sub('/request/', '/status/') - end - - def archive_file - @archive_file ||= - if archive_file? 
- collection, object = mime_type.split('"').last.sub('/sda/request/', '').split('/') - ArchiveFile.new(collection: collection, object: object) - end - end - - def archive_status - @archive_status ||= archive_file.status - end - - def request_action - @request_action ||= archive_file.request_action - end - - def request_actionable? - @request_actionable ||= archive_file.request_actionable?(archive_status) - end - def relative_url_root rv = ::DeepBlueDocs::Application.config.relative_url_root return rv if rv @@ -118,7 +84,5 @@ def file_name( parent_presenter, link_to ) def file_size_too_large_to_download? !@solr_document.file_size.nil? && @solr_document.file_size >= DeepBlueDocs::Application.config.max_work_file_size_to_download end - end - end diff --git a/app/views/hyrax/file_sets/show.html.erb b/app/views/hyrax/file_sets/show.html.erb index ad7bcc6..a0cc700 100644 --- a/app/views/hyrax/file_sets/show.html.erb +++ b/app/views/hyrax/file_sets/show.html.erb @@ -33,15 +33,24 @@   <% else %> <%# TODO: render 'show_descriptions' See https://github.com/samvera/hyrax/issues/1481 %> - <%# FIXME: hide details if archive_file? %> <%= render 'show_details' %> <% if @presenter.archive_file? %> Archival file status: <%= @presenter.archive_file&.display_status %>
- class="btn btn-primary"> + <% if Settings.recaptcha.use? %> + <% if flash[:alert]&.match /recaptcha/ %> + <%= recaptcha_tags %> + <% else %> + <%= recaptcha_v3(action: 'sda_request', site_key: Settings.recaptcha.v3.site_key) %> + <% end %> + <% end %> + + class="btn <%= @presenter.request_actionable? ? 'btn-primary' : 'btn-danger' %>">
<% unless @presenter.request_actionable? %> - Refresh this page to check for updated download availability + Refresh this page to update archive retrieval status and file availability for download <% end %> <% end %> <%= render 'hyrax/users/activity_log', events: @presenter.events %> diff --git a/config/settings.yml b/config/settings.yml index 059061e..2553337 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -113,6 +113,7 @@ ingest: outbox: tmp/ingest/out archive_api: + require_user_authentication: false url: archive.url.invalid/%s/%s username: archiveusername.invalid password: archivepassword.invalid @@ -131,3 +132,39 @@ archive_api: - :local # retained downloaded files timeout_before_download: 86400 # 24 hours timeout_after_download: 172800 # 48 hours + # used in descriptive fields, above action button + # a single archive_status can map to more than one #status + # multiple #status values map to the same end user message + status_messages: &status_messages + staging_available: &available "File found in archives but not yet staged for download. Request retrieval from archives to make available for download." + staging_requested: &requested "File retrieval from archives has started, and the file can be downloaded once the transfer is complete. Please allow up to 1 hour for the transfer to complete." + staged_after_request: *requested + staged_without_request: *available + local: "File is available for immediate download" + not_found: "File not found in archives. Unable to request file." + no_response: "File archives server is not responding. Unable to request file." + unexpected: "Unexpected response from file archives server. Unable to request file." + too_many_requests: "File is available in archives, but too many transfer requests are currently running. Please try again later." 
+ # used for button text + request_actions: + staging_available: &available 'Initiate file retrieval from archives' + staging_requested: &requested 'File retrieval from archives is in process' + staged_after_request: *requested + staged_without_request: *available + local: 'Download' + not_found: &unavailable 'File is not available' + no_response: *unavailable + unexpected: *unavailable + too_many_requests: *unavailable + # used for :notice and :alert messages in controller flash + flash_messages: *status_messages + +recaptcha: + use?: false + minimum_score: 0.5 + v2: + site_key: 'your_recaptcha_v2_site_key' + secret_key: 'your_recaptcha_v2_secret_key' + v3: + site_key: 'your_recaptcha_v3_site_key' + secret_key: 'your_recaptcha_v3_secret_key' diff --git a/lib/tasks/download_archival_files.rake b/lib/tasks/download_archival_files.rake index 8f44ad5..bfc6ded 100644 --- a/lib/tasks/download_archival_files.rake +++ b/lib/tasks/download_archival_files.rake @@ -11,12 +11,12 @@ namespace :datacore do desc "Download archival files" task download_archival_files: :environment do - DataCore::DownloadArchivalFilesTask.new.run + Datacore::DownloadArchivalFilesTask.new.run end end -module DataCore +module Datacore class DownloadArchivalFilesTask include ActionView::Helpers::NumberHelper diff --git a/lib/tasks/ingest_files.rake b/lib/tasks/ingest_files.rake index 2370b70..84f9de0 100644 --- a/lib/tasks/ingest_files.rake +++ b/lib/tasks/ingest_files.rake @@ -5,12 +5,12 @@ namespace :datacore do desc "Ingest dataset files from directory for previously created datasets." task ingest_directory: :environment do - DataCore::IngestFilesFromDirectoryTask.new.run + Datacore::IngestFilesFromDirectoryTask.new.run end end -module DataCore +module Datacore class IngestFilesFromDirectoryTask include ActionView::Helpers::NumberHelper