-
Notifications
You must be signed in to change notification settings - Fork 0
/
DetermineCompany.rb
140 lines (113 loc) · 4.48 KB
/
DetermineCompany.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
require 'bundler/setup'
require './config.rb'
require './database.rb'
require './models/person.rb'
require 'public_suffix'
require 'levenshtein'
require './strategies/linkedin.rb'
require './strategies/whois.rb'
require './strategies/website.rb'
require './strategies/domain.rb'
# Assigns a company name to every unclassified Person based on the domain of
# the person's e-mail address.
#
# For each person the e-mail is validated, the domain is handed to every
# configured strategy, and the candidate names that come back are merged and
# weighted. The best supported name is written to the person's company_name;
# people whose e-mail cannot be used get a status_code explaining why.
class DetermineCompany
  # Loose shape check: something, an "@", and a host containing a dot.
  EMAIL_FORMAT = /.*@.+\..+/
  # Captures everything after the last "@" on the matched line.
  EMAIL_DOMAIN = /.*@(.*)/
  # A domain consisting solely of digits and dots is treated as an IP address.
  IP_ADDRESS = /\A[0-9.]+\z/
  # Two names with an edit distance below this value count as the same name.
  MAX_TYPO_DISTANCE = 3
  # Number of leading characters of a company name that are persisted.
  # NOTE(review): the original comment mentioned a column limit of 45 while the
  # code has always kept 41 characters (indices 0..40) - confirm which is
  # intended before raising this.
  COMPANY_NAME_LIMIT = 41

  private_constant :EMAIL_FORMAT, :EMAIL_DOMAIN, :IP_ADDRESS,
                   :MAX_TYPO_DISTANCE, :COMPANY_NAME_LIMIT

  # @param options [Hash]
  #   :verbose               - print progress and results to stdout when truthy
  #   :clean_sweep           - wipe company_name/status_code of all people first
  #   :known_email_providers - root domains (e.g. "gmail.com") to skip;
  #                            defaults to an empty list when omitted
  #   :strategies            - optional list of objects responding to
  #                            execute(domain); defaults to LinkedIn, Whois,
  #                            Website and Domain
  def initialize(options = {})
    @verbose = options[:verbose]
    @clean_sweep = options[:clean_sweep]
    # Fall back to an empty list so the provider check never hits nil.
    @known_email_providers = options[:known_email_providers] || []
    # Maps an e-mail domain to the company name chosen for it during this run.
    @cache = {}
    @strategies = options[:strategies] ||
                  [LinkedIn.new, Whois.new, Website.new, Domain.new]
  end

  # Classifies every unclassified person and, in verbose mode, prints a
  # summary afterwards.
  def execute
    clean_previous_results! if @clean_sweep
    puts "Starting classification..." if @verbose

    Person.unclassified.find_each do |person|
      next unless valid? person

      company_name = company_for(domain_of(person.email))
      # No strategy produced a usable answer: leave the person untouched.
      accept person, company_name if company_name
    end

    report_summary if @verbose
  end

  private

  # Resets company_name and status_code on every Person.
  def clean_previous_results!
    print "Clearing previous sessions..." if @verbose
    Person.update_all company_name: nil, status_code: nil
    puts 'done' if @verbose
  end

  # Returns the part of the e-mail address after the "@".
  def domain_of(email)
    EMAIL_DOMAIN.match(email)[1]
  end

  # Returns the company name for a domain, or nil when no strategy yields a
  # non-blank answer. Successful answers are cached per domain so the
  # strategies run only once for each domain that resolves.
  def company_for(domain)
    cached = @cache[domain]
    return cached if cached

    # A strategy may return a single name, a list of names or nothing.
    candidates = @strategies.map { |strategy| strategy.execute(domain) }
                            .flatten
                            .reject(&:blank?)
    return nil if candidates.empty?

    @cache[domain] =
      candidates.size == 1 ? candidates.first : most_supported_name(candidates)
  end

  # Merges identical and near-identical names into weighted buckets and
  # returns the name with the largest weight.
  #
  # Candidates are visited shortest first, so a longer name that contains an
  # already seen shorter one (or differs from it by a small typo) is counted
  # towards the shorter name, e.g.
  # ["Microsoft", "Microsoft India", "IBM", "Microsoft"] becomes
  # {"IBM" => 1, "Microsoft" => 3} and "Microsoft" wins.
  def most_supported_name(candidates)
    weights = Hash.new(0)
    candidates.sort_by(&:length).each do |candidate|
      bucket = weights.keys.find { |name| same_company?(candidate, name) }
      weights[bucket || candidate] += 1
    end
    weights.max_by { |_name, weight| weight }.first
  end

  # True when candidate contains name or is within a small edit distance.
  def same_company?(candidate, name)
    candidate.include?(name) ||
      Levenshtein.distance(candidate, name) < MAX_TYPO_DISTANCE
  end

  # Prints how many people carry a company name. An empty table is reported
  # as 0% instead of dividing zero by zero (NaN cannot be rounded).
  def report_summary
    classified = Person.classified.count
    total = Person.count
    percentage = total.zero? ? 0 : (classified.to_f / total * 100).round
    puts "Classification finished. #{classified}/#{total} successful (#{percentage}%)"
  end

  # Checks whether the person's e-mail can be used for classification.
  # On failure the reason is stored on the person and false is returned.
  def valid?(person)
    # Only proceed if we have an e-mail address we can parse
    unless EMAIL_FORMAT.match(person.email)
      reject(person, :email_format_invalid, 'not a parseable e-mail address')
      return false
    end

    domain = domain_of(person.email)

    # Do not process IP addresses
    if IP_ADDRESS.match(domain)
      reject(person, :domain_IP_address, 'is an IP-address')
      return false
    end

    # Determine the domain root based on the list of public suffixes
    begin
      root_domain = PublicSuffix.parse(domain).domain
    rescue PublicSuffix::DomainInvalid
      reject(person, :invalid_TLD, 'invalid TLD')
      return false
    end

    # Do not process e-mail addresses from known providers
    # (e.g. @gmail.com, @hotmail.com)
    if @known_email_providers.include? root_domain
      reject(person, :known_email_provider, 'is a known e-mail provider')
      return false
    end

    true
  end

  # Stores the (truncated) company name on the person.
  def accept(person, company_name)
    company_name = company_name[0, COMPANY_NAME_LIMIT]
    # Write to database
    person.update_attribute :company_name, company_name
    # Output result
    puts "#{person.email} works at #{company_name}" if @verbose
  end

  # Stores the rejection reason on the person.
  def reject(person, error_code, message)
    person.update_attribute 'status_code', error_code
    puts "#{person.email} rejected: #{message} (#{error_code})" if @verbose
  end
end