
Commit 6ec0195

Merge pull request #58 from mitre/further_household_fixes
Additional households fixes based on testing
dehall authored Apr 26, 2023
2 parents f6d11fc + 4b9256a commit 6ec0195
Showing 3 changed files with 148 additions and 64 deletions.
households.py (34 changes: 28 additions & 6 deletions)
@@ -75,6 +75,19 @@ def parse_arguments():
" Smaller numbers may result in out of memory errors. Larger numbers"
" may increase runtime. Default is 4",
)
parser.add_argument(
"--exact_addresses",
action="store_true",
help="Use exact matches on address as the definition of a household."
" By default the inference process will split up addresses into"
" street, number, suffix, etc, and considers phone # and family name"
" when making a determination which records belong to which household."
" Enabling this feature causes the process to use the entire address"
" as a single string for comparisons, and only the address. "
" If addresses have not been standardized/validated, this setting"
" will likely increase false negatives (records not being included "
" in households where they should be).",
)
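# Hypothetical invocation with the new flag (a sketch only; the script's
# input-file arguments are not shown in this diff):
#   python households.py --exact_addresses --pairsfile pairs.csv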
parser.add_argument(
"--pairsfile",
help="Location of matching pairs file",
@@ -179,7 +192,7 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
# so it can be traversed sort of like a graph from any given patient
# note the key is patient position within the pii_lines dataframe
pos_to_pairs = get_household_matches(
pii_lines, args.split_factor, args.debug, args.pairsfile
pii_lines, args.split_factor, args.debug, args.exact_addresses, args.pairsfile
)

mapping_file = Path(args.mappingfile)
@@ -207,12 +220,13 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
pii_lines["written_to_file"] = False
hclk_position = 0
lines_processed = 0
hh_sizes = []
five_percent = int(len(pii_lines) / 20)
# Match households
for position, line in pii_lines.sample(frac=1).iterrows():
for position, _line in pii_lines.sample(frac=1).iterrows():
# sample(frac=1) shuffles the entire dataframe
# note that "position" is the index and still relative to the original

line = pii_lines.loc[position]
lines_processed += 1

# guard five_percent > 0: files under 20 rows would otherwise divide by zero
if args.debug and five_percent > 0 and (lines_processed % five_percent) == 0:
@@ -223,20 +237,22 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar

if line["written_to_file"]:
continue
line["written_to_file"] = True

if position in pos_to_pairs:
pat_positions = bfs_traverse_matches(pos_to_pairs, position)
# map those row numbers to PATIDs
pat_ids = list(
map(lambda p: pii_lines.at[p, "record_id"], pat_positions)
)
# mark all these rows as written to file
pii_lines.loc[pat_positions, ["written_to_file"]] = True
else:
pat_positions = [position]
pat_ids = [line[0]]

# mark all these rows as written to file
pii_lines.loc[pat_positions, ["written_to_file"]] = True

hh_sizes.append(len(pat_positions))

string_pat_positions = [str(p) for p in pat_positions]
pat_string = ",".join(string_pat_positions)
mapping_writer.writerow([hclk_position, pat_string])
@@ -258,6 +274,12 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
]
hclk_position += 1
pii_writer.writerow(output_row)

hh_sizes_series = pd.Series(hh_sizes, dtype=int)

print("Household size stats:")
print(hh_sizes_series.describe())
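# for reference, describe() prints count/mean/std/min/quartile/max stats;
# illustrative output only (these numbers are made up):
#   count    5000.000000
#   mean        2.400000
#   50%         2.000000
#   max        11.000000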

return n_households
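# A minimal sketch (not necessarily the repo's implementation) of the
# bfs_traverse_matches call used above: starting from one record's position,
# walk the pos_to_pairs graph breadth-first and collect every position
# connected through matching pairs, i.e. the full household. Assumes
# pos_to_pairs maps a position to the list of matched pairs containing it.
from collections import deque

def bfs_traverse_matches_sketch(pos_to_pairs, start):
    visited = {start}
    queue = deque([start])
    while queue:
        pos = queue.popleft()
        for pair in pos_to_pairs.get(pos, []):
            for p in pair:
                if p not in visited:
                    visited.add(p)
                    queue.append(p)
    return sorted(visited)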


households/matching.py (170 changes: 114 additions & 56 deletions)
@@ -13,10 +13,22 @@
from definitions import TIMESTAMP_FMT

MATCH_THRESHOLD = 0.85
FN_WEIGHT = 0.2
PHONE_WEIGHT = 0.15
ADDR_WEIGHT = 0.35
ZIP_WEIGHT = 0.3
FN_WEIGHT = 0.25
PHONE_WEIGHT = 0.2
ADDR_WEIGHT = 0.55
# ZIP_WEIGHT = 0.25
# zip is not used in weighting since all candidate pairs match on zip

# a separate address threshold so that pairs with medium-low scores across all fields
# don't wind up getting matched anyway
ADDR_THRESHOLD = 0.95
# using address_distance() below:
# "205 GARDEN ST" v "206 GARDEN ST" --> 0.8333
# "205 GARDEN ST" v "205 GAREDN ST" --> 0.98444
# "205 GARDEN STREET" v "205 GAREDN ST" --> 0.9666
# "205 GARDEN ST APT 5F" v "205 GARDEN ST APT 5J" --> 0.9472
# so 0.95 should give us a good balance of not linking all apartments together
# while still allowing some room for typos and variation
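# illustrative arithmetic for one hypothetical candidate pair
# (similarity values below are made up, not from real data):
#   family_name  0.90 * FN_WEIGHT    (0.25) = 0.2250
#   phone_number 0.80 * PHONE_WEIGHT (0.20) = 0.1600
#   address      0.96 * ADDR_WEIGHT  (0.55) = 0.5280
#   weighted sum                            = 0.9130 > MATCH_THRESHOLD (0.85)
# and the raw address score 0.96 also clears ADDR_THRESHOLD (0.95),
# so this pair would survive both filters and count as a match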


def addr_parse(addr):
@@ -53,6 +65,23 @@ def address_distance(addr1, addr2):
score = 0
secondary_score = 0

a1 = addr1["household_street_address"]
a2 = addr2["household_street_address"]

if not a1 or not a2:
# if either is blank they get a score of 0
# this matches textdistance.jaro_winkler("", x)
# but textdistance.jaro_winkler("", "") is normally 1
# without this, 2 missing addresses could be a "perfect match"
# which is not what we want
return 0

if a1 == a2:
# if the strings are exactly identical,
# don't waste time with detailed comparisons
# this matches textdistance.jaro_winkler(x, x)
return 1

# Change weights based on existence of second level address
if (
not addr1["prefix"]
@@ -213,15 +242,10 @@ def address_distance(addr1, addr2):

# See if simple string compare of all things combined
# with a 0.6 adjustment is better
a1 = addr1["household_street_address"]
a2 = addr2["household_street_address"]
if a1 and a2:
score = max(
score,
textdistance.jaro_winkler(a1, a2)
* (weight_number + weight_street_name)
* 0.6,
) + (secondary_score * weight_secondary)
score = max(
score,
textdistance.jaro_winkler(a1, a2) * (weight_number + weight_street_name) * 0.6,
) + (secondary_score * weight_secondary)
return score
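# a sketch of reproducing the example scores listed near ADDR_THRESHOLD,
# assuming addr_parse returns the component dict this function expects:
#   address_distance(addr_parse("205 GARDEN ST"), addr_parse("206 GARDEN ST"))
#   # ~0.8333 per the notes above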


@@ -271,7 +295,9 @@ def explode_address(row):
return parsed


def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None):
def get_household_matches(
pii_lines, split_factor=4, debug=False, exact_addresses=False, pairsfile=None
):
if pairsfile:
if debug:
print(f"[{datetime.now()}] Loading matching pairs file")
@@ -283,28 +309,42 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
print(f"[{datetime.now()}] Done loading matching pairs")

else:
# break out the address into number, street, suffix, etc,
# so we can prefilter matches based on those
addr_cols = pii_lines.apply(
explode_address,
axis="columns",
result_type="expand",
)
pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")

if exact_addresses:
pii_lines_exploded = pii_lines
else:
# break out the address into number, street, suffix, etc,
# so we can prefilter matches based on those
addr_cols = pii_lines.apply(
explode_address,
axis="columns",
result_type="expand",
)
pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")

if debug:
print(f"[{datetime.now()}] Done pre-processing PII file")

candidate_links = get_candidate_links(pii_lines_exploded, split_factor, debug)
gc.collect()

matching_pairs = get_matching_pairs(
pii_lines_exploded, candidate_links, split_factor, debug
candidate_links = get_candidate_links(
pii_lines_exploded, split_factor, exact_addresses, debug
)
del candidate_links
del pii_lines_exploded
gc.collect()

if exact_addresses:
# the candidate links are already all the pairs with matching [address, zip]
matching_pairs = candidate_links
else:
matching_pairs = get_matching_pairs(
pii_lines_exploded,
candidate_links,
split_factor,
exact_addresses,
debug,
)
del pii_lines_exploded
del candidate_links
gc.collect()

if debug:
timestamp = datetime.now().strftime(TIMESTAMP_FMT)
pairs_path = Path("temp-data") / f"households_pairs-{timestamp}.csv"
@@ -347,21 +387,25 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
return pos_to_pairs
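# illustrative shape of the return value (inferred from how households.py
# traverses it): if rows 3 and 9 matched, pos_to_pairs contains both
# 3 -> [(3, 9)] and 9 -> [(3, 9)], so a household can be walked
# starting from any of its member records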


def get_candidate_links(pii_lines, split_factor=4, debug=False):
def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=False):
# indexing step defines the pairs of records for comparison
# indexer.full() does a full n^2 comparison, but we can do better
indexer = recordlinkage.Index()
# use two block indexes to reduce the number of candidates
# use block indexes to reduce the number of candidates
# while still retaining enough candidates to identify real households.
# a block only on zip could work, but seems to run into memory issues
# note sortedneighborhood on zip probably doesn't make sense
# (zip codes in a geographic area will be too similar)
# but if data is dirty then blocks may discard typos

indexer.block(["household_zip", "street", "number"])
indexer.block(["household_zip", "family_name"])
if exact_addresses:
indexer.block(["household_zip", "household_street_address"])
else:
indexer.block(["household_zip", "street", "number"])
indexer.block(["household_zip", "family_name"])
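# for reference: recordlinkage's block() only pairs rows that agree
# exactly on every listed column, so with exact_addresses two rows must
# share zip AND the full address string to become a candidate pair;
# otherwise they must share zip+street+number, or zip+family name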

candidate_links = None
# start with an empty index we can append to
candidate_links = pd.MultiIndex.from_tuples([], names=[0, 1])

# break up the dataframe into subframes,
# and iterate over every pair of subframes.
@@ -404,20 +448,26 @@ def get_candidate_links(pii_lines, split_factor=4, debug=False):
pairs_subset = pairs_subset[pairs_subset[0] < pairs_subset[1]]
pairs_subset = pd.MultiIndex.from_frame(pairs_subset)

if candidate_links is None:
candidate_links = pairs_subset
else:
candidate_links = candidate_links.append(pairs_subset)
candidate_links = candidate_links.append(pairs_subset)

gc.collect()

# rows with blank address match ("" == "") so drop those here
# TODO: ideally we wouldn't compare blank address lines in the first place
# but the indexing and splitting bits get complicated if we drop them earlier
blank_addresses = pii_lines[pii_lines["household_street_address"] == ""].index
candidate_links = candidate_links.drop(blank_addresses, level=0, errors="ignore")
candidate_links = candidate_links.drop(blank_addresses, level=1, errors="ignore")
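# e.g. dropping a blank-address row index 42 at level=0 removes pairs
# like (42, 99), and dropping at level=1 removes pairs like (7, 42),
# so a blank address cannot appear on either side of a candidate pair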

if debug:
print(f"[{datetime.now()}] Found {len(candidate_links)} candidate pairs")

return candidate_links


def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
def get_matching_pairs(
pii_lines, candidate_links, split_factor, exact_addresses, debug
):
# Comparison step performs the defined comparison algorithms
# against the candidate pairs
compare_cl = recordlinkage.Compare()
@@ -428,24 +478,35 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
compare_cl.string(
"phone_number", "phone_number", method="jarowinkler", label="phone_number"
)
compare_cl.add(
AddressComparison(
"exploded_address",
"exploded_address",
if exact_addresses:
compare_cl.string(
"household_street_address",
"household_street_address",
method="jarowinkler",
label="household_street_address",
)
)
compare_cl.string(
"household_zip", "household_zip", method="levenshtein", label="household_zip"
)
else:
compare_cl.add(
AddressComparison(
"exploded_address",
"exploded_address",
label="household_street_address",
)
)

# NOTE: zip code is DISABLED because our indexes block on zip code
# compare_cl.string(
# "household_zip", "household_zip", method="levenshtein", label="household_zip"
# )
# note: hamming distance is not implemented in this library,
# but levenshtein is. the two metrics are likely similar enough
# that it's not worth implementing hamming again
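# e.g. for the 5-digit zips "02139" and "02193" both metrics give an
# edit distance of 2; they only diverge when an insert/delete pair is
# cheaper than independent substitutions, which is rare for zip codes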

if debug:
print(f"[{datetime.now()}] Starting detailed comparison of indexed pairs")

matching_pairs = None
# start with an empty index we can append to
matching_pairs = pd.MultiIndex.from_tuples([], names=[0, 1])
# we know that we could support len(subset_A) in memory above,
# so use the same amount here
len_subset_A = int(len(pii_lines) / split_factor)
@@ -470,18 +531,18 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):

features = compare_cl.compute(subset_links, relevant_pii_lines)

# first filter by address similarity
features = features[features["household_street_address"] > ADDR_THRESHOLD]

features["family_name"] *= FN_WEIGHT
features["phone_number"] *= PHONE_WEIGHT
features["household_street_address"] *= ADDR_WEIGHT
features["household_zip"] *= ZIP_WEIGHT
# features["household_zip"] *= ZIP_WEIGHT

# filter the matches down based on the cumulative score
matches = features[features.sum(axis=1) > MATCH_THRESHOLD]

if matching_pairs is None:
matching_pairs = matches.index
else:
matching_pairs = matching_pairs.append(matches.index)
matching_pairs = matching_pairs.append(matches.index)
# matching pairs are bi-directional and not duplicated,
# ex if (1,9) is in the list then (9,1) won't be

@@ -492,9 +553,6 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
del matches
gc.collect()

# drop exploded address because it's not used past this point
pii_lines.drop(columns=["exploded_address"], inplace=True)

if debug:
print(f"[{datetime.now()}] Found {len(matching_pairs)} matching pairs")

(diff for the third changed file did not load)
