
Commit

updated examples
fgregg committed Jun 26, 2024
1 parent a767fa5 commit 32dd501
Showing 15 changed files with 996 additions and 843 deletions.
34 changes: 16 additions & 18 deletions csv_example/csv_evaluation.py
@@ -1,48 +1,46 @@
-from future.utils import viewitems
-
-import csv
 import collections
+import csv
 import itertools
 
 
 def evaluateDuplicates(found_dupes, true_dupes):
     true_positives = found_dupes.intersection(true_dupes)
     false_positives = found_dupes.difference(true_dupes)
     uncovered_dupes = true_dupes.difference(found_dupes)
 
-    print('found duplicate')
+    print("found duplicate")
     print(len(found_dupes))
 
-    print('precision')
+    print("precision")
     print(1 - len(false_positives) / float(len(found_dupes)))
 
-    print('recall')
+    print("recall")
     print(len(true_positives) / float(len(true_dupes)))
 
 
-def dupePairs(filename, rowname) :
+def dupePairs(filename, rowname):
     dupe_d = collections.defaultdict(list)
 
     with open(filename) as f:
-        reader = csv.DictReader(f, delimiter=',', quotechar='"')
+        reader = csv.DictReader(f, delimiter=",", quotechar='"')
         for row in reader:
-            dupe_d[row[rowname]].append(row['Id'])
+            dupe_d[row[rowname]].append(row["Id"])
 
-    if 'x' in dupe_d :
-        del dupe_d['x']
+    if "x" in dupe_d:
+        del dupe_d["x"]
 
     dupe_s = set([])
-    for (unique_id, cluster) in viewitems(dupe_d) :
+    for unique_id, cluster in dupe_d.items():
         if len(cluster) > 1:
             for pair in itertools.combinations(cluster, 2):
                 dupe_s.add(frozenset(pair))
 
     return dupe_s
 
-manual_clusters = 'csv_example_input_with_true_ids.csv'
-dedupe_clusters = 'csv_example_output.csv'
-
-true_dupes = dupePairs(manual_clusters, 'True Id')
-test_dupes = dupePairs(dedupe_clusters, 'Cluster ID')
+manual_clusters = "csv_example_input_with_true_ids.csv"
+dedupe_clusters = "csv_example_output.csv"
 
-evaluateDuplicates(test_dupes, true_dupes)
+true_dupes = dupePairs(manual_clusters, "True Id")
+test_dupes = dupePairs(dedupe_clusters, "Cluster ID")
+
+evaluateDuplicates(test_dupes, true_dupes)
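
For readers skimming the diff, the precision and recall this script prints are plain set arithmetic over pairs of record ids. A minimal sketch with made-up pair sets (hypothetical data, not the example CSVs), mirroring the formulas in evaluateDuplicates:

found_dupes = {frozenset(("1", "2")), frozenset(("3", "4")), frozenset(("5", "6"))}
true_dupes = {frozenset(("1", "2")), frozenset(("3", "4")), frozenset(("7", "8"))}

true_positives = found_dupes.intersection(true_dupes)   # {1-2, 3-4}
false_positives = found_dupes.difference(true_dupes)    # {5-6}

precision = 1 - len(false_positives) / float(len(found_dupes))  # 1 - 1/3 ~= 0.67
recall = len(true_positives) / float(len(true_dupes))           # 2/3 ~= 0.67

print("precision", precision)
print("recall", recall)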
68 changes: 36 additions & 32 deletions csv_example/csv_example.py
@@ -14,11 +14,11 @@
 For larger datasets, see our [mysql_example](mysql_example.html)
 """
 
-import os
 import csv
-import re
 import logging
 import optparse
+import os
+import re
 
 import dedupe
 from unidecode import unidecode
@@ -30,8 +30,8 @@ def preProcess(column):
     Things like casing, extra spaces, quotes and new lines can be ignored.
     """
     column = unidecode(column)
-    column = re.sub(' +', ' ', column)
-    column = re.sub('\n', ' ', column)
+    column = re.sub(" +", " ", column)
+    column = re.sub("\n", " ", column)
     column = column.strip().strip('"').strip("'").lower().strip()
     # If data is missing, indicate that by setting the value to `None`
     if not column:
@@ -50,13 +50,13 @@ def readData(filename):
         reader = csv.DictReader(f)
         for row in reader:
             clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
-            row_id = int(row['Id'])
+            row_id = int(row["Id"])
             data_d[row_id] = dict(clean_row)
 
     return data_d
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
 
     # ## Logging
 
@@ -65,9 +65,13 @@ def readData(filename):
     # line. You don't need it if you don't want that. To enable verbose
     # logging, run `python examples/csv_example/csv_example.py -v`
     optp = optparse.OptionParser()
-    optp.add_option('-v', '--verbose', dest='verbose', action='count',
-                    help='Increase verbosity (specify multiple times for more)'
-                    )
+    optp.add_option(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        action="count",
+        help="Increase verbosity (specify multiple times for more)",
+    )
     (opts, args) = optp.parse_args()
     log_level = logging.WARNING
     if opts.verbose:
@@ -79,29 +83,29 @@ def readData(filename):
 
     # ## Setup
 
-    input_file = 'csv_example_messy_input.csv'
-    output_file = 'csv_example_output.csv'
-    settings_file = 'csv_example_learned_settings'
-    training_file = 'csv_example_training.json'
+    input_file = "csv_example_messy_input.csv"
+    output_file = "csv_example_output.csv"
+    settings_file = "csv_example_learned_settings"
+    training_file = "csv_example_training.json"
 
-    print('importing data ...')
+    print("importing data ...")
    data_d = readData(input_file)
 
     # If a settings file already exists, we'll just load that and skip training
     if os.path.exists(settings_file):
-        print('reading from', settings_file)
-        with open(settings_file, 'rb') as f:
+        print("reading from", settings_file)
+        with open(settings_file, "rb") as f:
             deduper = dedupe.StaticDedupe(f)
     else:
         # ## Training
 
         # Define the fields dedupe will pay attention to
         fields = [
-            {'field': 'Site name', 'type': 'String'},
-            {'field': 'Address', 'type': 'String'},
-            {'field': 'Zip', 'type': 'Exact', 'has missing': True},
-            {'field': 'Phone', 'type': 'String', 'has missing': True},
-            ]
+            dedupe.variables.String("Site name"),
+            dedupe.variables.String("Address"),
+            dedupe.variables.Exact("Zip", has_missing=True),
+            dedupe.variables.String("Phone", has_missing=True),
+        ]
 
         # Create a new deduper object and pass our data model to it.
         deduper = dedupe.Dedupe(fields)
@@ -110,8 +114,8 @@ def readData(filename):
         # look for it and load it in.
         # __Note:__ if you want to train from scratch, delete the training_file
         if os.path.exists(training_file):
-            print('reading labeled examples from ', training_file)
-            with open(training_file, 'rb') as f:
+            print("reading labeled examples from ", training_file)
+            with open(training_file, "rb") as f:
                 deduper.prepare_training(data_d, f)
         else:
             deduper.prepare_training(data_d)
@@ -122,7 +126,7 @@ def readData(filename):
         # or not.
         # use 'y', 'n' and 'u' keys to flag duplicates
         # press 'f' when you are finished
-        print('starting active labeling...')
+        print("starting active labeling...")
 
         dedupe.console_label(deduper)
 
@@ -131,24 +135,24 @@ def readData(filename):
         deduper.train()
 
         # When finished, save our training to disk
-        with open(training_file, 'w') as tf:
+        with open(training_file, "w") as tf:
             deduper.write_training(tf)
 
         # Save our weights and predicates to disk. If the settings file
         # exists, we will skip all the training and learning next time we run
         # this file.
-        with open(settings_file, 'wb') as sf:
+        with open(settings_file, "wb") as sf:
             deduper.write_settings(sf)
 
     # ## Clustering
 
     # `partition` will return sets of records that dedupe
     # believes are all referring to the same entity.
 
-    print('clustering...')
+    print("clustering...")
     clustered_dupes = deduper.partition(data_d, 0.5)
 
-    print('# duplicate sets', len(clustered_dupes))
+    print("# duplicate sets", len(clustered_dupes))
 
     # ## Writing Results
 
@@ -160,18 +164,18 @@ def readData(filename):
         for record_id, score in zip(records, scores):
             cluster_membership[record_id] = {
                 "Cluster ID": cluster_id,
-                "confidence_score": score
+                "confidence_score": score,
             }
 
-    with open(output_file, 'w') as f_output, open(input_file) as f_input:
+    with open(output_file, "w") as f_output, open(input_file) as f_input:
 
         reader = csv.DictReader(f_input)
-        fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames
+        fieldnames = ["Cluster ID", "confidence_score"] + reader.fieldnames
 
         writer = csv.DictWriter(f_output, fieldnames=fieldnames)
         writer.writeheader()
 
         for row in reader:
-            row_id = int(row['Id'])
+            row_id = int(row["Id"])
             row.update(cluster_membership[row_id])
             writer.writerow(row)
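
The most substantive change in csv_example.py is the move from dict-based field definitions to the dedupe.variables classes, shown in the fields hunk above. A minimal before/after sketch, limited to the two variable types that appear in this diff (String and Exact):

import dedupe

# Old style, removed in this commit: plain dicts.
# fields = [{'field': 'Zip', 'type': 'Exact', 'has missing': True}]

# New style, added in this commit: variable classes with keyword arguments.
fields = [
    dedupe.variables.String("Site name"),
    dedupe.variables.Exact("Zip", has_missing=True),
]
deduper = dedupe.Dedupe(fields)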