Skip to content

Commit

Permalink
fix: company sync upserts (#21)
Browse files Browse the repository at this point in the history
# Overview

This adds a convenience upsert method that should be called per data
broker registry sync process. This revises the upsert process to anchor
by sanitized website/domain first before sanitizing and updating other
attributes. It also prevents category overrides to retain priority of
sources (e.g. if a company was first sourced from CPPA, it won't say it
was sourced from DataBrokersWatch.org if there was new information from
there).
  • Loading branch information
nshki authored Jun 22, 2024
1 parent 9cd6ae2 commit 0c64fb0
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 18 deletions.
18 changes: 18 additions & 0 deletions app/models/company.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,22 @@ def source

I18n.t("models.company.source", source: humanized_category)
end

# Convenience method to upsert companies when multiple sources are being used.
# Anchors on the website to determine if the company already exists. Sanitizes
# the website before checking.
#
# When a company with the sanitized website already exists, only its name and
# email are refreshed. Its category is deliberately left alone so that the
# source that first registered the company keeps priority. Attributes that the
# caller did not supply are not touched, so an omitted name or email is never
# overwritten with nil.
#
# Both branches return the record itself. Persistence failures are not raised
# here; inspect `errors` / `persisted?` on the returned record if needed.
#
# @param website [String] Unsanitized website of the company to upsert
# @param attributes [Hash] Attributes to upsert company with
# @return [Company]
def self.upsert_by_website(website:, **attributes)
  # Run the raw website through the same sanitizer the model uses so the
  # lookup key matches what is stored.
  sanitized_website = Company.new(website: website).domainify_website!
  company = Company.find_by(website: sanitized_website)

  if company
    # `update` returns a boolean, so return the record explicitly to honour
    # the documented return type.
    company.update(attributes.slice(:name, :email))
    company
  else
    create(website: website, **attributes)
  end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,16 @@ def update_california_data_brokers
registry_csv = Net::HTTP.get(REGISTRY_CSV_URI)

CSV.parse(registry_csv, headers: true, row_sep: "\r\n") do |row|
email = row["Business primary contact email address"]
name = row["Business name"]
website = row["Business primary website"]
next if email.blank? || name.blank? || website.blank?
name = row["Business name"]
email = row["Business primary contact email address"]
next if website.blank? || name.blank? || email.blank?

company = Company.find_or_initialize_by(email: email)
company.update \
category: Company::CATEGORIES[:california_data_broker],
upsert_by_website \
website: website,
name: name,
website: website
email: email,
category: Company::CATEGORIES[:california_data_broker]
rescue ActiveRecord::RecordNotUnique
end
end
Expand Down
16 changes: 8 additions & 8 deletions app/models/concerns/company/data_brokers_watch_requestable.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@ def update_data_brokers_watch_companies
companies = response_json.dig("DataBrokers")

companies.each do |company|
emails = company.dig("Emails").split(";")
email = Company.most_likely_email(emails)
name = company.dig("Company Name") || company.dig("Domain")
website = company.dig("Domain")
next if email.blank? || name.blank? || website.blank?
name = company.dig("Company Name") || company.dig("Domain")
emails = company.dig("Emails").split(";")
email = most_likely_email(emails)
next if website.blank? || name.blank? || email.blank?

company = Company.find_or_initialize_by(email: email)
company.update \
category: Company::CATEGORIES[:data_brokers_watch],
upsert_by_website \
website: website,
name: name,
website: website
email: email,
category: Company::CATEGORIES[:data_brokers_watch]
rescue ActiveRecord::RecordNotUnique
end
end
Expand Down
6 changes: 3 additions & 3 deletions app/models/concerns/company/sanitizable.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ module Company::Sanitizable

# Ensures we're stripping leading and trailing whitespace from the name.
#
# Assigns a stripped copy instead of calling `String#strip!`, because
# `strip!` returns nil when there is nothing to strip and would therefore
# break the documented return value.
#
# @return [String] the stripped name
def strip_name!
  self.name = name.strip
end
Expand All @@ -19,7 +19,7 @@ def strip_name!
# a regex extraction to account for incorrectly formatted URLs from third-party
# sources.
#
# @return [void]
# @return [String]
def domainify_website!
stripped_website = website.strip
extracted_website = stripped_website.scan(/[\w|\d\.]+/).last
Expand All @@ -29,7 +29,7 @@ def domainify_website!

# Normalizes the email ahead of saving: surrounding whitespace is trimmed and
# the result is lowercased before being written back to the attribute.
#
# @return [String] the normalized email
def downcase_email!
  normalized_email = email.strip.downcase
  self.email = normalized_email
end
Expand Down
30 changes: 30 additions & 0 deletions test/models/company_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,34 @@ class CompanyTest < ActiveSupport::TestCase

assert_equal "localhost", company.website
end

test ".upsert_by_website doesn't override category" do
  # Seed a company that was first sourced from the California registry.
  company = Company.create(
    website: "https://localhost",
    name: "Test Company",
    email: "test-company@localhost",
    category: "california_data_broker"
  )

  # Upsert the same website from a different source with fresh details.
  Company.upsert_by_website(
    website: "https://localhost",
    name: "Test Company with updated name",
    email: "test-company-updated-email@localhost",
    category: "data_brokers_watch"
  )

  # Reload once, then check that the category stuck while name/email moved.
  reloaded_company = company.reload
  assert_equal "california_data_broker", reloaded_company.category
  assert_equal "Test Company with updated name", company.name
  assert_equal "test-company-updated-email@localhost", company.email
end

test ".upsert_by_website successfully creates a new company" do
  # Account for fixture
  assert_equal 1, Company.count

  # No existing company matches this website, so a new row is expected.
  Company.upsert_by_website(
    website: "https://localhost",
    name: "Test Company",
    email: "test-company@localhost",
    category: "california_data_broker"
  )

  assert_equal 2, Company.count
end
end

0 comments on commit 0c64fb0

Please sign in to comment.