From 33530ff4ae4f7f0ab9e0a6272bd1f30b5b6b9466 Mon Sep 17 00:00:00 2001 From: Matthew Bernhardt Date: Mon, 26 Aug 2024 15:52:23 -0400 Subject: [PATCH 1/2] Add search_events:url_loader rake task ** Why are these changes being introduced: We have needed a rake task to load search events and terms from remote urls in Heroku lately, rather than from local files. ** Relevant ticket(s): n/a ** How does this address that need: This adds a search_events:url_loader task, cobbled together from the existing search_events:csv_loader task and suggested_resources:reload tasks. There are also some rudimentary tests to confirm the two cases where the task raises errors. ** Document any side effects to this change: This elects not to resolve some architectural differences between our existing tasks (search_events tasks are pretty self-contained, while suggested_resources relies on model methods). A general refactor of our rake tasks might be needed soon, but right now I'm not sure what direction that resolution should take. --- lib/tasks/search_event_loader.rake | 35 ++++++++++++++++ test/tasks/search_event_loader_rake_test.rb | 37 +++++++++++++++++ ...arch_events_url_loader_from_remote_csv.yml | 41 +++++++++++++++++++ 3 files changed, 113 insertions(+) create mode 100644 test/tasks/search_event_loader_rake_test.rb create mode 100644 test/vcr_cassettes/search_events_url_loader_from_remote_csv.yml diff --git a/lib/tasks/search_event_loader.rake b/lib/tasks/search_event_loader.rake index d57b0a5..1b87f8b 100644 --- a/lib/tasks/search_event_loader.rake +++ b/lib/tasks/search_event_loader.rake @@ -28,4 +28,39 @@ namespace :search_events do term.search_events.create!(source: args.source, created_at: row.last) end end + + # url loader can bulk load SearchEvents and Terms. + # + # @note This is not for use in production. It is intended for use in review apps on Heroku, to load records in + # preparation for demonstrations. 
+ # + # @note the csv should be formated as `term phrase`, `timestamp`. A dataclip is available that can export in this + # format. + # + # @example + # bin/rake search_events:url_loader['https://example.org/file.csv', 'some-source-to-use-for-all-loaded-records'] + # + # @param addr [String] a URL for a CSV file to load + # @param source [String] source name to load the data under + desc 'Load search_events from url' + task :url_loader, %i[addr source] => :environment do |_task, args| + raise ArgumentError.new, 'URL is required' if args.addr.blank? + raise ArgumentError.new, 'Source is required' if args.source.blank? + + Rails.logger.info("Term count before import: #{Term.count}") + Rails.logger.info("SearchEvent count before import: #{SearchEvent.count}") + + url = URI.parse(args.addr) + Rails.logger.info("Loading data from #{url}") + + file = url.open.read + data = CSV.parse(file) + data.each do |row| + term = Term.create_or_find_by!(phrase: row.first) + term.search_events.create!(source: args.source, created_at: row.last) + end + + Rails.logger.info("Term count after import: #{Term.count}") + Rails.logger.info("SearchEvent count after import: #{SearchEvent.count}") + end end diff --git a/test/tasks/search_event_loader_rake_test.rb b/test/tasks/search_event_loader_rake_test.rb new file mode 100644 index 0000000..964d750 --- /dev/null +++ b/test/tasks/search_event_loader_rake_test.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +require 'test_helper' +require 'rake' + +class SearchEventLoaderRakeTest < ActiveSupport::TestCase + def setup + Tacos::Application.load_tasks if Rake::Task.tasks.empty? 
+ Rake::Task['search_events:url_loader'].reenable + end + + test 'url_reload can accept a url and source parameter' do + records_before = SearchEvent.count + VCR.use_cassette('search_events:url_loader from remote csv') do + remote_file = 'http://static.lndo.site/search_events.csv' + Rake::Task['search_events:url_loader'].invoke(remote_file, 'test') + end + + assert_not_equal records_before, SearchEvent.count + end + + test 'url_reload errors without any parameters' do + error = assert_raises(ArgumentError) do + Rake::Task['search_events:url_loader'].invoke() + end + assert_equal 'URL is required', error.message + end + + test 'url_reload errors without a source parameter' do + error = assert_raises(ArgumentError) do + remote_file = 'http://static.lndo.site/search_events.csv' + Rake::Task['search_events:url_loader'].invoke(remote_file) + end + assert_equal 'Source is required', error.message + end + +end diff --git a/test/vcr_cassettes/search_events_url_loader_from_remote_csv.yml b/test/vcr_cassettes/search_events_url_loader_from_remote_csv.yml new file mode 100644 index 0000000..cb7cf14 --- /dev/null +++ b/test/vcr_cassettes/search_events_url_loader_from_remote_csv.yml @@ -0,0 +1,41 @@ +--- +http_interactions: +- request: + method: get + uri: http://static.lndo.site/search_events.csv + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Accept-Ranges: + - bytes + Content-Length: + - '92' + Content-Type: + - text/csv + Date: + - Wed, 28 Aug 2024 21:47:08 GMT + Etag: + - '"5c-620c52829ded5"' + Last-Modified: + - Wed, 28 Aug 2024 21:36:54 GMT + Server: + - Apache/2.4.54 (Debian) + body: + encoding: UTF-8 + string: | + first search term,1996-04-13 19:30:00.000000 + another search term,1999-05-15 19:30:00.000000 + recorded_at: Wed, 28 Aug 2024 21:47:08 GMT +recorded_with: VCR 6.2.0 From 
42da8dd18dc72401d66dd5481cbd453796931055 Mon Sep 17 00:00:00 2001 From: Jeremy Prevost Date: Thu, 29 Aug 2024 09:46:00 -0400 Subject: [PATCH 2/2] Refactor to a single task that can load local or remote files --- lib/tasks/search_event_loader.rake | 49 ++++++--------------- test/tasks/search_event_loader_rake_test.rb | 17 ++++--- 2 files changed, 21 insertions(+), 45 deletions(-) diff --git a/lib/tasks/search_event_loader.rake b/lib/tasks/search_event_loader.rake index 1b87f8b..b511cc0 100644 --- a/lib/tasks/search_event_loader.rake +++ b/lib/tasks/search_event_loader.rake @@ -14,6 +14,9 @@ namespace :search_events do # @example # bin/rails search_events:csv_loader['local_path_to_file.csv', 'some-source-to-use-for-all-loaded-records'] # + # @example + # bin/rails search_events:csv_loader['https://SERVER/remote_path_to_file.csv', 'some-source-to-use-for-all-loaded-records'] + # # @param path [String] local file path to a CSV file to load # @param source [String] source name to load the data under desc 'Load search_events from csv' @@ -21,46 +24,20 @@ namespace :search_events do raise ArgumentError.new, 'Path is required' if args.path.blank? raise ArgumentError.new, 'Source is required' if args.source.blank? - Rails.logger.info("Loading data from #{args.path}") - - CSV.foreach(args.path) do |row| - term = Term.create_or_find_by!(phrase: row.first) - term.search_events.create!(source: args.source, created_at: row.last) + # does the file look like a path or a URI + if URI(args.path).scheme + Rails.logger.info("Loading data from remote file #{args.path}") + data = URI.parse(args.path).open('rb', &:read) + else + Rails.logger.info("Loading data from local file #{args.path}") + data = File.read(args.path) end - end - # url loader can bulk load SearchEvents and Terms. - # - # @note This is not for use in production. It is intended for use in review apps on Heroku, to load records in - # preparation for demonstrations. 
- # - # @note the csv should be formated as `term phrase`, `timestamp`. A dataclip is available that can export in this - # format. - # - # @example - # bin/rake search_events:url_loader['https://example.org/file.csv', 'some-source-to-use-for-all-loaded-records'] - # - # @param addr [String] a URL for a CSV file to load - # @param source [String] source name to load the data under - desc 'Load search_events from url' - task :url_loader, %i[addr source] => :environment do |_task, args| - raise ArgumentError.new, 'URL is required' if args.addr.blank? - raise ArgumentError.new, 'Source is required' if args.source.blank? - - Rails.logger.info("Term count before import: #{Term.count}") - Rails.logger.info("SearchEvent count before import: #{SearchEvent.count}") - - url = URI.parse(args.addr) - Rails.logger.info("Loading data from #{url}") - - file = url.open.read - data = CSV.parse(file) - data.each do |row| + # not ideal, we should consider streaming the file rather than loading it fully into memory + # if you run into issues with this, consider loading subsets (such as a single month) at a time + CSV.parse(data) do |row| term = Term.create_or_find_by!(phrase: row.first) term.search_events.create!(source: args.source, created_at: row.last) end - - Rails.logger.info("Term count after import: #{Term.count}") - Rails.logger.info("SearchEvent count after import: #{SearchEvent.count}") end end diff --git a/test/tasks/search_event_loader_rake_test.rb b/test/tasks/search_event_loader_rake_test.rb index 964d750..cc2e4e8 100644 --- a/test/tasks/search_event_loader_rake_test.rb +++ b/test/tasks/search_event_loader_rake_test.rb @@ -6,32 +6,31 @@ class SearchEventLoaderRakeTest < ActiveSupport::TestCase def setup Tacos::Application.load_tasks if Rake::Task.tasks.empty? 
- Rake::Task['search_events:url_loader'].reenable + Rake::Task['search_events:csv_loader'].reenable end - test 'url_reload can accept a url and source parameter' do + test 'csv_loader can accept a url and source parameter' do records_before = SearchEvent.count VCR.use_cassette('search_events:url_loader from remote csv') do remote_file = 'http://static.lndo.site/search_events.csv' - Rake::Task['search_events:url_loader'].invoke(remote_file, 'test') + Rake::Task['search_events:csv_loader'].invoke(remote_file, 'test') end assert_not_equal records_before, SearchEvent.count end - test 'url_reload errors without any parameters' do + test 'csv_loader errors without any parameters' do error = assert_raises(ArgumentError) do - Rake::Task['search_events:url_loader'].invoke() + Rake::Task['search_events:csv_loader'].invoke end - assert_equal 'URL is required', error.message + assert_equal 'Path is required', error.message end - test 'url_reload errors without a source parameter' do + test 'csv_loader errors without a source parameter' do error = assert_raises(ArgumentError) do remote_file = 'http://static.lndo.site/search_events.csv' - Rake::Task['search_events:url_loader'].invoke(remote_file) + Rake::Task['search_events:csv_loader'].invoke(remote_file) end assert_equal 'Source is required', error.message end - end