From 0c64fb0f06520e42b59dba9c965731140c812d66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nishiki=20=28=E9=8C=A6=E8=8F=AF=29?= Date: Sat, 22 Jun 2024 12:47:54 -0700 Subject: [PATCH] fix: company sync upserts (#21) # Overview This adds a convenience upsert method that should be called per data broker registry sync process. This revises the upsert process to anchor by sanitized website/domain first before sanitizing and updating other attributes. It also prevents category overrides to retain priority of sources (e.g. if a company was first sourced from CPPA, it won't say it was sourced from DataBrokersWatch.org if there was new information from there). --- app/models/company.rb | 18 +++++++++++ .../california_data_brokers_requestable.rb | 14 ++++----- .../company/data_brokers_watch_requestable.rb | 16 +++++----- app/models/concerns/company/sanitizable.rb | 6 ++-- test/models/company_test.rb | 30 +++++++++++++++++++ 5 files changed, 66 insertions(+), 18 deletions(-) diff --git a/app/models/company.rb b/app/models/company.rb index a3f28f7..f96000e 100644 --- a/app/models/company.rb +++ b/app/models/company.rb @@ -43,4 +43,22 @@ def source I18n.t("models.company.source", source: humanized_category) end + + # Convenience method to upsert companies when multiple sources are being used. + # Anchors on the website to determine if the company already exists. Sanitizes + # the website before checking. + # + # @param website [String] Unsanitized website of the company to upsert + # @param attributes [Hash] Attributes to upsert company with + # @return [Company] + def self.upsert_by_website(website:, **attributes) + sanitized_website = Company.new(website: website).domainify_website! + company = Company.find_by(website: sanitized_website) + + if company.present? + company.update(name: attributes[:name], email: attributes[:email]) + else + create(website: website, **attributes) + end + end end diff --git a/app/models/concerns/company/california_data_brokers_requestable.rb b/app/models/concerns/company/california_data_brokers_requestable.rb index 5785d8c..5ec7e47 100644 --- a/app/models/concerns/company/california_data_brokers_requestable.rb +++ b/app/models/concerns/company/california_data_brokers_requestable.rb @@ -18,16 +18,16 @@ def update_california_data_brokers registry_csv = Net::HTTP.get(REGISTRY_CSV_URI) CSV.parse(registry_csv, headers: true, row_sep: "\r\n") do |row| - email = row["Business primary contact email address"] - name = row["Business name"] website = row["Business primary website"] - next if email.blank? || name.blank? || website.blank? + name = row["Business name"] + email = row["Business primary contact email address"] + next if website.blank? || name.blank? || email.blank? - company = Company.find_or_initialize_by(email: email) - company.update \ - category: Company::CATEGORIES[:california_data_broker], + upsert_by_website \ + website: website, name: name, - website: website + email: email, + category: Company::CATEGORIES[:california_data_broker] rescue ActiveRecord::RecordNotUnique end end diff --git a/app/models/concerns/company/data_brokers_watch_requestable.rb b/app/models/concerns/company/data_brokers_watch_requestable.rb index c2821a5..8965870 100644 --- a/app/models/concerns/company/data_brokers_watch_requestable.rb +++ b/app/models/concerns/company/data_brokers_watch_requestable.rb @@ -19,17 +19,17 @@ def update_data_brokers_watch_companies companies = response_json.dig("DataBrokers") companies.each do |company| - emails = company.dig("Emails").split(";") - email = Company.most_likely_email(emails) - name = company.dig("Company Name") || company.dig("Domain") website = company.dig("Domain") - next if email.blank? || name.blank? || website.blank? + name = company.dig("Company Name") || company.dig("Domain") + emails = company.dig("Emails").split(";") + email = most_likely_email(emails) + next if website.blank? || name.blank? || email.blank? - company = Company.find_or_initialize_by(email: email) - company.update \ - category: Company::CATEGORIES[:data_brokers_watch], + upsert_by_website \ + website: website, name: name, - website: website + email: email, + category: Company::CATEGORIES[:data_brokers_watch] rescue ActiveRecord::RecordNotUnique end end diff --git a/app/models/concerns/company/sanitizable.rb b/app/models/concerns/company/sanitizable.rb index a2001c6..cb56eb7 100644 --- a/app/models/concerns/company/sanitizable.rb +++ b/app/models/concerns/company/sanitizable.rb @@ -9,7 +9,7 @@ module Company::Sanitizable # Ensures we're stripping leading and trailing whitespace from the name. # - # @return [void] + # @return [String] def strip_name! name.strip! end @@ -19,7 +19,7 @@ def strip_name! # a regex extraction to account for incorrectly formatted URLs from third-party # sources. # - # @return [void] + # @return [String] def domainify_website! stripped_website = website.strip extracted_website = stripped_website.scan(/[\w|\d\.]+/).last @@ -29,7 +29,7 @@ def domainify_website! # Ensures we're normalizing emails before saving. # - # @return [void] + # @return [String] def downcase_email! self.email = email.strip.downcase end diff --git a/test/models/company_test.rb b/test/models/company_test.rb index 7ea7143..1ed4227 100644 --- a/test/models/company_test.rb +++ b/test/models/company_test.rb @@ -77,4 +77,34 @@ class CompanyTest < ActiveSupport::TestCase assert_equal "localhost", company.website end + + test ".upsert_by_website doesn't override category" do + company = Company.create \ + category: "california_data_broker", + email: "test-company@localhost", + name: "Test Company", + website: "https://localhost" + + Company.upsert_by_website \ + category: "data_brokers_watch", + email: "test-company-updated-email@localhost", + name: "Test Company with updated name", + website: "https://localhost" + + assert_equal "california_data_broker", company.reload.category + assert_equal "Test Company with updated name", company.name + assert_equal "test-company-updated-email@localhost", company.email + end + + test ".upsert_by_website successfully creates a new company" do + assert_equal 1, Company.count # Account for fixture + + Company.upsert_by_website \ + category: "california_data_broker", + email: "test-company@localhost", + name: "Test Company", + website: "https://localhost" + + assert_equal 2, Company.count + end end