# Patch summary (leading commentary placed before the first `diff --git` header).
#
# Files touched:
#   * lib/tasks/search_event_loader.rake (modified)
#       - Adds a second @example showing the task invoked with an https:// URL.
#       - Replaces the single "Loading data from ..." log line with a branch on
#         `URI(args.path).scheme`: when a scheme is present the data is fetched with
#         `URI.parse(args.path).open('rb', &:read)`, otherwise it is read with
#         `File.read(args.path)`. Each branch logs its own message.
#       - Replaces `CSV.foreach(args.path)` with `CSV.parse(data)`, so the whole
#         payload is held in memory before rows are processed (the patch's own
#         comment calls this out and suggests loading smaller subsets).
#       - Row handling is unchanged: `row.first` is the Term phrase, `row.last` is
#         the search event's `created_at`, and `args.source` is the event source.
#   * test/tasks/search_event_loader_rake_test.rb (new, 36 lines)
#       - `setup` loads the application's rake tasks if none are loaded and
#         re-enables `search_events:csv_loader` so each test can invoke it.
#       - Covers: invoking with a URL inside a VCR cassette changes
#         `SearchEvent.count`; invoking with no arguments raises ArgumentError
#         'Path is required'; invoking with only a path raises ArgumentError
#         'Source is required'.
#   * test/vcr_cassettes/search_events_url_loader_from_remote_csv.yml (new, 41 lines)
#       - Recorded GET of http://static.lndo.site/search_events.csv returning
#         200 text/csv with two "phrase,timestamp" rows.
#
# NOTE(review): the new @example uses `remote_path_to_file.json` although the task
#   parses CSV -- presumably `.csv` was intended; confirm.
# NOTE(review): the unchanged `@param path` text still says "local file path" even
#   though a URL is now accepted -- consider updating it in the same change.
# NOTE(review): `URI(args.path)` looks like it will raise URI::InvalidURIError for
#   local paths that are not valid URIs (e.g. containing spaces), and any
#   scheme-like prefix (e.g. a drive letter such as `C:`) would take the remote
#   branch -- verify, and consider an explicit http/https check.
# NOTE(review): calling `.open` on a URI relies on open-uri; no `require 'open-uri'`
#   is visible in these hunks -- confirm it is loaded elsewhere.
# NOTE(review): line breaks in this copy of the patch appear to have been
#   collapsed; the hunk bodies need their original newlines to be applied.
diff --git a/lib/tasks/search_event_loader.rake b/lib/tasks/search_event_loader.rake index d57b0a5..b511cc0 100644 --- a/lib/tasks/search_event_loader.rake +++ b/lib/tasks/search_event_loader.rake @@ -14,6 +14,9 @@ namespace :search_events do # @example # bin/rails search_events:csv_loader['local_path_to_file.csv', 'some-source-to-use-for-all-loaded-records'] # + # @example + # bin/rails search_events:csv_loader['https://SERVER/remote_path_to_file.json', 'some-source-to-use-for-all-loaded-records'] + # # @param path [String] local file path to a CSV file to load # @param source [String] source name to load the data under desc 'Load search_events from csv' @@ -21,9 +24,18 @@ namespace :search_events do raise ArgumentError.new, 'Path is required' if args.path.blank? raise ArgumentError.new, 'Source is required' if args.source.blank? - Rails.logger.info("Loading data from #{args.path}") + # does the file look like a path or a URI + if URI(args.path).scheme + Rails.logger.info("Loading data from remote file #{args.path}") + data = URI.parse(args.path).open('rb', &:read) + else + Rails.logger.info("Loading data from local file #{args.path}") + data = File.read(args.path) + end - CSV.foreach(args.path) do |row| + # not ideal, we should consider streaming the file rather than loading it fully into memory + # if you run into issues with this, consider loading subsets (such as a single month) at a time + CSV.parse(data) do |row| term = Term.create_or_find_by!(phrase: row.first) term.search_events.create!(source: args.source, created_at: row.last) end diff --git a/test/tasks/search_event_loader_rake_test.rb b/test/tasks/search_event_loader_rake_test.rb new file mode 100644 index 0000000..cc2e4e8 --- /dev/null +++ b/test/tasks/search_event_loader_rake_test.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +require 'test_helper' +require 'rake' + +class SearchEventLoaderRakeTest < ActiveSupport::TestCase + def setup + Tacos::Application.load_tasks if Rake::Task.tasks.empty? 
+ Rake::Task['search_events:csv_loader'].reenable + end + + test 'csv_loader can accept a url and source parameter' do + records_before = SearchEvent.count + VCR.use_cassette('search_events:url_loader from remote csv') do + remote_file = 'http://static.lndo.site/search_events.csv' + Rake::Task['search_events:csv_loader'].invoke(remote_file, 'test') + end + + assert_not_equal records_before, SearchEvent.count + end + + test 'csv_loader errors without any parameters' do + error = assert_raises(ArgumentError) do + Rake::Task['search_events:csv_loader'].invoke + end + assert_equal 'Path is required', error.message + end + + test 'csv_loader errors without a source parameter' do + error = assert_raises(ArgumentError) do + remote_file = 'http://static.lndo.site/search_events.csv' + Rake::Task['search_events:csv_loader'].invoke(remote_file) + end + assert_equal 'Source is required', error.message + end +end diff --git a/test/vcr_cassettes/search_events_url_loader_from_remote_csv.yml b/test/vcr_cassettes/search_events_url_loader_from_remote_csv.yml new file mode 100644 index 0000000..cb7cf14 --- /dev/null +++ b/test/vcr_cassettes/search_events_url_loader_from_remote_csv.yml @@ -0,0 +1,41 @@ +--- +http_interactions: +- request: + method: get + uri: http://static.lndo.site/search_events.csv + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Accept-Ranges: + - bytes + Content-Length: + - '92' + Content-Type: + - text/csv + Date: + - Wed, 28 Aug 2024 21:47:08 GMT + Etag: + - '"5c-620c52829ded5"' + Last-Modified: + - Wed, 28 Aug 2024 21:36:54 GMT + Server: + - Apache/2.4.54 (Debian) + body: + encoding: UTF-8 + string: | + first search term,1996-04-13 19:30:00.000000 + another search term,1999-05-15 19:30:00.000000 + recorded_at: Wed, 28 Aug 2024 21:47:08 GMT +recorded_with: VCR 6.2.0