-
Notifications
You must be signed in to change notification settings - Fork 216
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 15 changed files with 996 additions and 843 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,48 +1,46 @@ | ||
from future.utils import viewitems | ||
|
||
import csv | ||
import collections | ||
import csv | ||
import itertools | ||
|
||
|
||
def evaluateDuplicates(found_dupes, true_dupes): | ||
true_positives = found_dupes.intersection(true_dupes) | ||
false_positives = found_dupes.difference(true_dupes) | ||
uncovered_dupes = true_dupes.difference(found_dupes) | ||
|
||
print('found duplicate') | ||
print("found duplicate") | ||
print(len(found_dupes)) | ||
|
||
print('precision') | ||
print("precision") | ||
print(1 - len(false_positives) / float(len(found_dupes))) | ||
|
||
print('recall') | ||
print("recall") | ||
print(len(true_positives) / float(len(true_dupes))) | ||
|
||
|
||
def dupePairs(filename, rowname) : | ||
def dupePairs(filename, rowname): | ||
dupe_d = collections.defaultdict(list) | ||
|
||
with open(filename) as f: | ||
reader = csv.DictReader(f, delimiter=',', quotechar='"') | ||
reader = csv.DictReader(f, delimiter=",", quotechar='"') | ||
for row in reader: | ||
dupe_d[row[rowname]].append(row['Id']) | ||
dupe_d[row[rowname]].append(row["Id"]) | ||
|
||
if 'x' in dupe_d : | ||
del dupe_d['x'] | ||
if "x" in dupe_d: | ||
del dupe_d["x"] | ||
|
||
dupe_s = set([]) | ||
for (unique_id, cluster) in viewitems(dupe_d) : | ||
for unique_id, cluster in dupe_d.items(): | ||
if len(cluster) > 1: | ||
for pair in itertools.combinations(cluster, 2): | ||
dupe_s.add(frozenset(pair)) | ||
|
||
return dupe_s | ||
|
||
manual_clusters = 'csv_example_input_with_true_ids.csv' | ||
dedupe_clusters = 'csv_example_output.csv' | ||
|
||
true_dupes = dupePairs(manual_clusters, 'True Id') | ||
test_dupes = dupePairs(dedupe_clusters, 'Cluster ID') | ||
manual_clusters = "csv_example_input_with_true_ids.csv" | ||
dedupe_clusters = "csv_example_output.csv" | ||
|
||
evaluateDuplicates(test_dupes, true_dupes) | ||
true_dupes = dupePairs(manual_clusters, "True Id") | ||
test_dupes = dupePairs(dedupe_clusters, "Cluster ID") | ||
|
||
evaluateDuplicates(test_dupes, true_dupes) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.