Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(linking): implement possible match short-term #111

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions assets/initial_algorithms.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"label": "dibbs-basic",
"description": "The DIBBs Default Algorithm. Based on field experimentation and statistical analysis, this deterministic two-pass algorithm combines geographical and personal information to maximize linkage quality while minimizing false positives",
"is_default": true,
"include_multiple_matches": true,
"belongingness_ratio": [0.75, 0.9],
"passes": [
{
"blocking_keys": [
Expand All @@ -15,7 +17,6 @@
"LAST_NAME": "func:recordlinker.linking.matchers.feature_match_exact"
},
"rule": "func:recordlinker.linking.matchers.eval_perfect_match",
"cluster_ratio": 0.9,
"kwargs": {
"thresholds": {
"FIRST_NAME": 0.9,
Expand All @@ -39,7 +40,6 @@
"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_exact"
},
"rule": "func:recordlinker.linking.matchers.eval_perfect_match",
"cluster_ratio": 0.9,
"kwargs": {
"thresholds": {
"FIRST_NAME": 0.9,
Expand All @@ -57,6 +57,8 @@
"label": "dibbs-enhanced",
"description": "The DIBBs Log-Odds Algorithm. This optional algorithm uses statistical correction to adjust the links between incoming records and previously processed patients (it does so by taking advantage of the fact that some fields are more informative than others—e.g., two records matching on MRN is stronger evidence that they should be linked than if the records matched on zip code). It can be used if additional granularity in matching links is desired. However, while the DIBBs Log-Odds Algorithm can create higher-quality links, it is dependent on statistical updating and pre-calculated population analysis, which requires some work on the part of the user. For those cases where additional precision or stronger matching criteria are required, the Log-Odds algorithm is detailed below.",
"is_default": false,
"include_multiple_matches": true,
"belongingness_ratio": [0.75, 0.9],
"passes": [
{
"blocking_keys": [
Expand All @@ -69,7 +71,6 @@
"LAST_NAME": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare"
},
"rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff",
"cluster_ratio": 0.9,
"kwargs": {
"similarity_measure": "JaroWinkler",
"thresholds": {
Expand Down Expand Up @@ -106,7 +107,6 @@
"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare"
},
"rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff",
"cluster_ratio": 0.9,
"kwargs": {
"similarity_measure": "JaroWinkler",
"thresholds": {
Expand Down
30 changes: 23 additions & 7 deletions src/recordlinker/linking/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def link_record_against_mpi(
session: orm.Session,
algorithm: models.Algorithm,
external_person_id: typing.Optional[str] = None,
) -> tuple[bool, uuid.UUID, uuid.UUID]:
) -> tuple[models.Patient, models.Person | None, list[dict]]:
"""
Runs record linkage on a single incoming record (extracted from a FHIR
bundle) using an existing database as an MPI. Uses a flexible algorithm
Expand All @@ -81,10 +81,10 @@ def link_record_against_mpi(
# Membership scores need to persist across linkage passes so that we can
# find the highest scoring match across all passes
scores: dict[models.Person, float] = collections.defaultdict(float)
# the minimum ratio of matches needed to be considered a cluster member
belongingness_ratio_lower_bound, belongingness_ratio_upper_bound = algorithm.belongingness_ratio
for algorithm_pass in algorithm.passes:
with TRACER.start_as_current_span("link.pass"):
# the minimum ratio of matches needed to be considered a cluster member
cluster_ratio = algorithm_pass.cluster_ratio
# initialize a dictionary to hold the clusters of patients for each person
clusters: dict[models.Person, list[models.Patient]] = collections.defaultdict(list)
# block on the pii_record and the algorithm's blocking criteria, then
Expand All @@ -107,17 +107,33 @@ def link_record_against_mpi(
if compare(record, patient, algorithm_pass):
matched_count += 1
# calculate the match ratio for this person cluster
match_ratio = matched_count / len(patients)
if match_ratio >= cluster_ratio:
belongingness_ratio = matched_count / len(patients)
if belongingness_ratio >= belongingness_ratio_lower_bound:
# The match ratio is larger than the minimum cluster threshold,
# optionally update the max score for this person
scores[person] = max(scores[person], match_ratio)
scores[person] = max(scores[person], belongingness_ratio)

matched_person: typing.Optional[models.Person] = None
if scores:
# Find the person with the highest matching score
matched_person, _ = max(scores.items(), key=lambda i: i[1])

sorted_scores = [{"person": k, "belongingness_ratio": v} for k, v in sorted(scores.items(), key=lambda item: item[1])]
if not scores:
# No match
matched_person = models.Person() # Create new Person Cluster
results = []
elif sorted_scores[0]["belongingness_ratio"] >= belongingness_ratio_upper_bound:
# Match (1 or many)
matched_person = sorted_scores[0]["person"]
results = [x for x in sorted_scores if x["belongingness_ratio"] >= belongingness_ratio_upper_bound] # Multiple matches
if not algorithm.include_multiple_matches:
results = results[0:0] # 1 Match
else:
# Possible match
matched_person = None
results = sorted_scores

with TRACER.start_as_current_span("insert"):
patient = mpi_service.insert_patient(
session,
Expand All @@ -128,4 +144,4 @@ def link_record_against_mpi(
)

    # return the inserted patient, the person cluster it was linked to (None for a
    # possible match), and the list of scored person-cluster results
return (bool(matched_person), patient.person.reference_id, patient.reference_id)
return (patient, patient.person, results)
2 changes: 0 additions & 2 deletions src/recordlinker/linking/mpi_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ def insert_patient(
"""
Insert a new patient record into the database.
"""
# create a new Person record if one isn't provided
person = person or models.Person()

patient = models.Patient(person=person, record=record, external_patient_id=external_patient_id)

Expand Down
18 changes: 17 additions & 1 deletion src/recordlinker/models/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,27 @@ class Algorithm(Base):
is_default: orm.Mapped[bool] = orm.mapped_column(default=False, index=True)
label: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255), unique=True)
description: orm.Mapped[str] = orm.mapped_column(sqltypes.Text(), nullable=True)
include_multiple_matches: orm.Mapped[bool] = orm.mapped_column(sqltypes.Boolean, default=True)
belongingness_ratio_lower_bound: orm.Mapped[float] = orm.mapped_column(sqltypes.Float, default=1.0)
belongingness_ratio_upper_bound: orm.Mapped[float] = orm.mapped_column(sqltypes.Float, default=1.0)
passes: orm.Mapped[list["AlgorithmPass"]] = orm.relationship(
back_populates="algorithm", cascade="all, delete-orphan"
)

@property
def belongingness_ratio(self) -> tuple[float, float]:
"""
        Get the Belongingness Ratio Threshold Range for this algorithm.
"""
return (self.belongingness_ratio_lower_bound, self.belongingness_ratio_upper_bound)

@belongingness_ratio.setter # type: ignore
def belongingness_ratio(self, value: tuple[float, float]):
"""
        Set the Belongingness Ratio Threshold Range for this algorithm.
"""
self.belongingness_ratio_lower_bound, self.belongingness_ratio_upper_bound = value

@classmethod
def from_dict(cls, **data: dict) -> "Algorithm":
"""
Expand Down Expand Up @@ -81,7 +98,6 @@ class AlgorithmPass(Base):
blocking_keys: orm.Mapped[list[str]] = orm.mapped_column(sqltypes.JSON)
_evaluators: orm.Mapped[dict[str, str]] = orm.mapped_column("evaluators", sqltypes.JSON)
_rule: orm.Mapped[str] = orm.mapped_column("rule", sqltypes.String(255))
cluster_ratio: orm.Mapped[float] = orm.mapped_column(sqltypes.Float)
kwargs: orm.Mapped[dict] = orm.mapped_column(sqltypes.JSON, default=dict)

@property
Expand Down
2 changes: 1 addition & 1 deletion src/recordlinker/models/mpi.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class Patient(Base):
__tablename__ = "mpi_patient"

id: orm.Mapped[int] = orm.mapped_column(get_bigint_pk(), autoincrement=True, primary_key=True)
person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey(f"{Person.__tablename__}.id"))
person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey(f"{Person.__tablename__}.id"), nullable=True)
person: orm.Mapped["Person"] = orm.relationship(back_populates="patients")
# NOTE: We're using a protected attribute here to store the data string, as we
# want getter/setter access to the data dictionary to trigger updating the
Expand Down
18 changes: 9 additions & 9 deletions src/recordlinker/routes/link_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,16 @@ async def link_piirecord(
# link the record
try:
# Make a copy of record_to_link so we don't modify the original
(found_match, new_person_id, patient_reference_id) = link.link_record_against_mpi(
(patient, person, results) = link.link_record_against_mpi(
record=input.record,
session=db_session,
algorithm=algorithm,
external_person_id=input.external_person_id,
)
return schemas.LinkResponse(
is_match=found_match,
patient_reference_id=patient_reference_id,
person_reference_id=new_person_id,
patient_reference_id=patient.reference_id,
person_reference_id=person.reference_id,
results=results
)

except ValueError:
Expand Down Expand Up @@ -173,16 +173,16 @@ async def link_fhir(
# link the record
try:
# Make a copy of pii_record so we don't modify the original
(found_match, new_person_id, patient_reference_id) = link.link_record_against_mpi(
(patient, person, results) = link.link_record_against_mpi(
record=pii_record,
session=db_session,
algorithm=algorithm,
external_person_id=external_id,
)
return schemas.LinkResponse(
is_match=found_match,
patient_reference_id=patient_reference_id,
person_reference_id=new_person_id,
return schemas.LinkResponse(
patient_reference_id=patient.reference_id,
person_reference_id=person.reference_id,
results=results
)

except ValueError:
Expand Down
19 changes: 17 additions & 2 deletions src/recordlinker/schemas/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ class AlgorithmPass(pydantic.BaseModel):
blocking_keys: list[str]
evaluators: dict[str, str]
rule: str
cluster_ratio: float
kwargs: dict[str, typing.Any] = {}

@pydantic.field_validator("blocking_keys", mode="before")
Expand All @@ -43,7 +42,7 @@ def validate_blocking_keys(cls, value):
@pydantic.field_validator("evaluators", mode="before")
def validate_evaluators(cls, value):
"""
Validated the evaluators into a list of feature comparison functions.
Validate the evaluators into a list of feature comparison functions.
"""
for k, v in value.items():
try:
Expand Down Expand Up @@ -77,8 +76,24 @@ class Algorithm(pydantic.BaseModel):
label: str = pydantic.Field(pattern=r"^[a-z0-9]+(?:-[a-z0-9]+)*$")
description: typing.Optional[str] = None
is_default: bool = False
include_multiple_matches: bool = True
belongingness_ratio: tuple[float, float]
passes: typing.Sequence[AlgorithmPass]

@pydantic.field_validator("belongingness_ratio", mode="before")
def validate_belongingness_ratio(cls, value):
"""
Validate the Belongingness Ratio Threshold Range.
"""
lower_bound, upper_bound = value
if lower_bound < 0 or lower_bound > 1:
raise ValueError(f"Invalid lower bound: {lower_bound}")
if upper_bound < 0 or upper_bound > 1:
raise ValueError(f"Invalid upper bound: {upper_bound}")
if lower_bound > upper_bound:
            raise ValueError(f"Invalid range. Lower bound must be less than or equal to upper bound: {value}")
return (lower_bound, upper_bound)


class AlgorithmSummary(Algorithm):
"""
Expand Down
38 changes: 31 additions & 7 deletions src/recordlinker/schemas/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,46 @@ class LinkInput(pydantic.BaseModel):
)


class LinkResult(pydantic.BaseModel):
    """
    Schema for a single possible-match result: a person cluster the incoming
    patient record may belong to, with its belongingness ratio.
    """

person_reference_id: uuid.UUID = pydantic.Field(
description="The identifier for a person that the patient may be linked to."
)

belongingness_ratio: float = pydantic.Field(
description="The percentage of patient records matched in this person cluster."
)


class LinkResponse(pydantic.BaseModel):
"""
Schema for responses from the link endpoint.
"""

is_match: bool = pydantic.Field(
description="A true value indicates that one or more existing records "
"matched with the provided record, and these results have been linked."
)

patient_reference_id: uuid.UUID = pydantic.Field(
description="The unique identifier for the patient that has been linked"
description="The unique identifier for the patient that has been linked."
)
person_reference_id: uuid.UUID = pydantic.Field(
description="The identifier for the person that the patient record has " "been linked to.",
person_reference_id: uuid.UUID | None = pydantic.Field(
description="The identifier for the person that the patient record has been linked to.",
)
    results: list[LinkResult] = pydantic.Field(
        description="The person clusters that the patient record could belong to, "
        "each with its belongingness ratio."
    )

@pydantic.computed_field
@property
def prediction(self) -> typing.Literal["match", "possible_match", "no_match"]:
if self.person_reference_id and self.results:
return "match"
elif not self.results:
return "no_match"
else:
return "possible_match"


class LinkFhirInput(pydantic.BaseModel):
"""
Expand Down
17 changes: 8 additions & 9 deletions tests/unit/linking/test_algorithm_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,12 @@ def test_load_algorithm_created(self, session):
data = schemas.Algorithm(
label="dibss-basic",
description="First algorithm",
belongingness_ratio=(0.75, 0.8),
passes=[
schemas.AlgorithmPass(
blocking_keys=["FIRST_NAME"],
evaluators={"ZIP": "func:recordlinker.linking.matchers.feature_match_any"},
rule="func:recordlinker.linking.matchers.eval_perfect_match",
cluster_ratio=0.8,
)
],
)
Expand All @@ -82,25 +82,26 @@ def test_load_algorithm_created(self, session):
assert obj.id == 1
assert obj.label == "dibss-basic"
assert obj.description == "First algorithm"
assert obj.belongingness_ratio == (0.75, 0.8)
assert len(obj.passes) == 1
assert obj.passes[0].algorithm_id == 1
assert obj.passes[0].blocking_keys == ["FIRST_NAME"]
assert obj.passes[0].evaluators == {
"ZIP": "func:recordlinker.linking.matchers.feature_match_any"
}
assert obj.passes[0].rule == "func:recordlinker.linking.matchers.eval_perfect_match"
assert obj.passes[0].cluster_ratio == 0.8


def test_load_algorithm_updated(self, session):
data = schemas.Algorithm(
label="dibss-basic",
description="First algorithm",
belongingness_ratio=(0.75, 0.8),
passes=[
schemas.AlgorithmPass(
blocking_keys=["FIRST_NAME"],
evaluators={"ZIP": "func:recordlinker.linking.matchers.feature_match_any"},
rule="func:recordlinker.linking.matchers.eval_perfect_match",
cluster_ratio=0.8,
rule="func:recordlinker.linking.matchers.eval_perfect_match"
)
],
)
Expand All @@ -114,14 +115,14 @@ def test_load_algorithm_updated(self, session):
assert obj.id == 1
assert obj.label == "dibss-basic"
assert obj.description == "Updated description"
assert obj.belongingness_ratio == (0.75, 0.8)
assert len(obj.passes) == 1
assert obj.passes[0].algorithm_id == 1
assert obj.passes[0].blocking_keys == ["LAST_NAME"]
assert obj.passes[0].evaluators == {
"ZIP": "func:recordlinker.linking.matchers.feature_match_any"
}
assert obj.passes[0].rule == "func:recordlinker.linking.matchers.eval_perfect_match"
assert obj.passes[0].cluster_ratio == 0.8


def test_delete_algorithm(session):
Expand All @@ -133,8 +134,7 @@ def test_delete_algorithm(session):
algorithm=algo1,
blocking_keys=["FIRST_NAME"],
evaluators={"ZIP": "func:recordlinker.linking.matchers.feature_match_any"},
rule="func:recordlinker.linking.matchers.eval_perfect_match",
cluster_ratio=0.8,
rule="func:recordlinker.linking.matchers.eval_perfect_match"
)
session.add(pass1)
session.commit()
Expand All @@ -153,8 +153,7 @@ def test_clear_algorithms(session):
algorithm=algo1,
blocking_keys=["FIRST_NAME"],
evaluators={"ZIP": "func:recordlinker.linking.matchers.feature_match_any"},
rule="func:recordlinker.linking.matchers.eval_perfect_match",
cluster_ratio=0.8,
rule="func:recordlinker.linking.matchers.eval_perfect_match"
)
session.add(pass1)
session.commit()
Expand Down
Loading
Loading