
Commit

updated examples
fgregg committed Jun 26, 2024
1 parent a767fa5 commit 32dd501
Showing 15 changed files with 996 additions and 843 deletions.
34 changes: 16 additions & 18 deletions csv_example/csv_evaluation.py
@@ -1,48 +1,46 @@
-from future.utils import viewitems
-
-import csv
 import collections
+import csv
 import itertools
 
 
 def evaluateDuplicates(found_dupes, true_dupes):
     true_positives = found_dupes.intersection(true_dupes)
     false_positives = found_dupes.difference(true_dupes)
     uncovered_dupes = true_dupes.difference(found_dupes)
 
-    print('found duplicate')
+    print("found duplicate")
     print(len(found_dupes))
 
-    print('precision')
+    print("precision")
     print(1 - len(false_positives) / float(len(found_dupes)))
 
-    print('recall')
+    print("recall")
     print(len(true_positives) / float(len(true_dupes)))
 
 
-def dupePairs(filename, rowname) :
+def dupePairs(filename, rowname):
     dupe_d = collections.defaultdict(list)
 
     with open(filename) as f:
-        reader = csv.DictReader(f, delimiter=',', quotechar='"')
+        reader = csv.DictReader(f, delimiter=",", quotechar='"')
         for row in reader:
-            dupe_d[row[rowname]].append(row['Id'])
+            dupe_d[row[rowname]].append(row["Id"])
 
-    if 'x' in dupe_d :
-        del dupe_d['x']
+    if "x" in dupe_d:
+        del dupe_d["x"]
 
     dupe_s = set([])
-    for (unique_id, cluster) in viewitems(dupe_d) :
+    for unique_id, cluster in dupe_d.items():
         if len(cluster) > 1:
             for pair in itertools.combinations(cluster, 2):
                 dupe_s.add(frozenset(pair))
 
     return dupe_s
 
-manual_clusters = 'csv_example_input_with_true_ids.csv'
-dedupe_clusters = 'csv_example_output.csv'
-
-true_dupes = dupePairs(manual_clusters, 'True Id')
-test_dupes = dupePairs(dedupe_clusters, 'Cluster ID')
+manual_clusters = "csv_example_input_with_true_ids.csv"
+dedupe_clusters = "csv_example_output.csv"
 
-evaluateDuplicates(test_dupes, true_dupes)
+true_dupes = dupePairs(manual_clusters, "True Id")
+test_dupes = dupePairs(dedupe_clusters, "Cluster ID")
+
+evaluateDuplicates(test_dupes, true_dupes)
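
For readers skimming the diff, the precision and recall this script prints are plain set arithmetic over pairs of record ids. A minimal sketch with made-up pair sets (hypothetical data, not the example CSVs), mirroring the formulas in evaluateDuplicates:

found_dupes = {frozenset(("1", "2")), frozenset(("3", "4")), frozenset(("5", "6"))}
true_dupes = {frozenset(("1", "2")), frozenset(("3", "4")), frozenset(("7", "8"))}

true_positives = found_dupes.intersection(true_dupes)   # {1-2, 3-4}
false_positives = found_dupes.difference(true_dupes)    # {5-6}

precision = 1 - len(false_positives) / float(len(found_dupes))  # 1 - 1/3 ~= 0.67
recall = len(true_positives) / float(len(true_dupes))           # 2/3 ~= 0.67

print("precision", precision)
print("recall", recall)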
68 changes: 36 additions & 32 deletions csv_example/csv_example.py
@@ -14,11 +14,11 @@
 For larger datasets, see our [mysql_example](mysql_example.html)
 """
 
-import os
 import csv
-import re
 import logging
 import optparse
+import os
+import re
 
 import dedupe
 from unidecode import unidecode
@@ -30,8 +30,8 @@ def preProcess(column):
     Things like casing, extra spaces, quotes and new lines can be ignored.
     """
     column = unidecode(column)
-    column = re.sub(' +', ' ', column)
-    column = re.sub('\n', ' ', column)
+    column = re.sub(" +", " ", column)
+    column = re.sub("\n", " ", column)
     column = column.strip().strip('"').strip("'").lower().strip()
     # If data is missing, indicate that by setting the value to `None`
     if not column:
@@ -50,13 +50,13 @@ def readData(filename):
         reader = csv.DictReader(f)
         for row in reader:
             clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
-            row_id = int(row['Id'])
+            row_id = int(row["Id"])
             data_d[row_id] = dict(clean_row)
 
     return data_d
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
 
     # ## Logging
 
@@ -65,9 +65,13 @@ def readData(filename):
     # line. You don't need it if you don't want that. To enable verbose
     # logging, run `python examples/csv_example/csv_example.py -v`
     optp = optparse.OptionParser()
-    optp.add_option('-v', '--verbose', dest='verbose', action='count',
-                    help='Increase verbosity (specify multiple times for more)'
-                    )
+    optp.add_option(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        action="count",
+        help="Increase verbosity (specify multiple times for more)",
+    )
     (opts, args) = optp.parse_args()
     log_level = logging.WARNING
     if opts.verbose:
@@ -79,29 +83,29 @@ def readData(filename):
 
     # ## Setup
 
-    input_file = 'csv_example_messy_input.csv'
-    output_file = 'csv_example_output.csv'
-    settings_file = 'csv_example_learned_settings'
-    training_file = 'csv_example_training.json'
+    input_file = "csv_example_messy_input.csv"
+    output_file = "csv_example_output.csv"
+    settings_file = "csv_example_learned_settings"
+    training_file = "csv_example_training.json"
 
-    print('importing data ...')
+    print("importing data ...")
    data_d = readData(input_file)
 
     # If a settings file already exists, we'll just load that and skip training
     if os.path.exists(settings_file):
-        print('reading from', settings_file)
-        with open(settings_file, 'rb') as f:
+        print("reading from", settings_file)
+        with open(settings_file, "rb") as f:
             deduper = dedupe.StaticDedupe(f)
     else:
         # ## Training
 
         # Define the fields dedupe will pay attention to
         fields = [
-            {'field': 'Site name', 'type': 'String'},
-            {'field': 'Address', 'type': 'String'},
-            {'field': 'Zip', 'type': 'Exact', 'has missing': True},
-            {'field': 'Phone', 'type': 'String', 'has missing': True},
-            ]
+            dedupe.variables.String("Site name"),
+            dedupe.variables.String("Address"),
+            dedupe.variables.Exact("Zip", has_missing=True),
+            dedupe.variables.String("Phone", has_missing=True),
+        ]
 
         # Create a new deduper object and pass our data model to it.
         deduper = dedupe.Dedupe(fields)
@@ -110,8 +114,8 @@ def readData(filename):
         # look for it and load it in.
         # __Note:__ if you want to train from scratch, delete the training_file
         if os.path.exists(training_file):
-            print('reading labeled examples from ', training_file)
-            with open(training_file, 'rb') as f:
+            print("reading labeled examples from ", training_file)
+            with open(training_file, "rb") as f:
                 deduper.prepare_training(data_d, f)
         else:
             deduper.prepare_training(data_d)
@@ -122,7 +126,7 @@ def readData(filename):
         # or not.
         # use 'y', 'n' and 'u' keys to flag duplicates
         # press 'f' when you are finished
-        print('starting active labeling...')
+        print("starting active labeling...")
 
         dedupe.console_label(deduper)
 
@@ -131,24 +135,24 @@ def readData(filename):
         deduper.train()
 
         # When finished, save our training to disk
-        with open(training_file, 'w') as tf:
+        with open(training_file, "w") as tf:
             deduper.write_training(tf)
 
         # Save our weights and predicates to disk. If the settings file
         # exists, we will skip all the training and learning next time we run
         # this file.
-        with open(settings_file, 'wb') as sf:
+        with open(settings_file, "wb") as sf:
             deduper.write_settings(sf)
 
     # ## Clustering
 
     # `partition` will return sets of records that dedupe
     # believes are all referring to the same entity.
 
-    print('clustering...')
+    print("clustering...")
     clustered_dupes = deduper.partition(data_d, 0.5)
 
-    print('# duplicate sets', len(clustered_dupes))
+    print("# duplicate sets", len(clustered_dupes))
 
     # ## Writing Results
 
@@ -160,18 +164,18 @@ def readData(filename):
         for record_id, score in zip(records, scores):
             cluster_membership[record_id] = {
                 "Cluster ID": cluster_id,
-                "confidence_score": score
+                "confidence_score": score,
             }
 
-    with open(output_file, 'w') as f_output, open(input_file) as f_input:
+    with open(output_file, "w") as f_output, open(input_file) as f_input:
 
         reader = csv.DictReader(f_input)
-        fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames
+        fieldnames = ["Cluster ID", "confidence_score"] + reader.fieldnames
 
         writer = csv.DictWriter(f_output, fieldnames=fieldnames)
         writer.writeheader()
 
         for row in reader:
-            row_id = int(row['Id'])
+            row_id = int(row["Id"])
             row.update(cluster_membership[row_id])
             writer.writerow(row)
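
The most substantive change in csv_example.py is the move from dict-based field definitions to the dedupe.variables classes, shown in the fields hunk above. A minimal before/after sketch, limited to the two variable types that appear in this diff (String and Exact):

import dedupe

# Old style, removed in this commit: plain dicts.
# fields = [{'field': 'Zip', 'type': 'Exact', 'has missing': True}]

# New style, added in this commit: variable classes with keyword arguments.
fields = [
    dedupe.variables.String("Site name"),
    dedupe.variables.Exact("Zip", has_missing=True),
]
deduper = dedupe.Dedupe(fields)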