Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(linking): implement possible match short-term #111

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions assets/initial_algorithms.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"label": "dibbs-basic",
"description": "The DIBBs Default Algorithm. Based on field experimentation and statistical analysis, this deterministic two-pass algorithm combines geographical and personal information to maximize linkage quality while minimizing false positives",
"is_default": true,
"include_multiple_matches": true,
"belongingness_ratio": [0.75, 0.9],
"passes": [
{
"blocking_keys": [
Expand All @@ -15,7 +17,6 @@
"LAST_NAME": "func:recordlinker.linking.matchers.feature_match_exact"
},
"rule": "func:recordlinker.linking.matchers.eval_perfect_match",
"cluster_ratio": 0.9,
"kwargs": {
"thresholds": {
"FIRST_NAME": 0.9,
Expand All @@ -39,7 +40,6 @@
"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_exact"
},
"rule": "func:recordlinker.linking.matchers.eval_perfect_match",
"cluster_ratio": 0.9,
"kwargs": {
"thresholds": {
"FIRST_NAME": 0.9,
Expand All @@ -57,6 +57,8 @@
"label": "dibbs-enhanced",
"description": "The DIBBs Log-Odds Algorithm. This optional algorithm uses statistical correction to adjust the links between incoming records and previously processed patients (it does so by taking advantage of the fact that some fields are more informative than others—e.g., two records matching on MRN is stronger evidence that they should be linked than if the records matched on zip code). It can be used if additional granularity in matching links is desired. However, while the DIBBs Log-Odds Algorithm can create higher-quality links, it is dependent on statistical updating and pre-calculated population analysis, which requires some work on the part of the user. For those cases where additional precision or stronger matching criteria are required, the Log-Odds algorithm is detailed below.",
"is_default": false,
"include_multiple_matches": true,
"belongingness_ratio": [0.75, 0.9],
"passes": [
{
"blocking_keys": [
Expand All @@ -69,7 +71,6 @@
"LAST_NAME": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare"
},
"rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff",
"cluster_ratio": 0.9,
"kwargs": {
"similarity_measure": "JaroWinkler",
"thresholds": {
Expand Down Expand Up @@ -106,7 +107,6 @@
"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare"
},
"rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff",
"cluster_ratio": 0.9,
"kwargs": {
"similarity_measure": "JaroWinkler",
"thresholds": {
Expand Down
30 changes: 23 additions & 7 deletions src/recordlinker/linking/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def link_record_against_mpi(
session: orm.Session,
algorithm: models.Algorithm,
external_person_id: typing.Optional[str] = None,
) -> tuple[bool, uuid.UUID, uuid.UUID]:
) -> tuple[models.Patient, models.Person | None, list[dict]]:
"""
Runs record linkage on a single incoming record (extracted from a FHIR
bundle) using an existing database as an MPI. Uses a flexible algorithm
Expand All @@ -81,10 +81,10 @@ def link_record_against_mpi(
# Membership scores need to persist across linkage passes so that we can
# find the highest scoring match across all passes
scores: dict[models.Person, float] = collections.defaultdict(float)
# the minimum ratio of matches needed to be considered a cluster member
belongingness_ratio_lower_bound, belongingness_ratio_upper_bound = algorithm.belongingness_ratio
for algorithm_pass in algorithm.passes:
with TRACER.start_as_current_span("link.pass"):
# the minimum ratio of matches needed to be considered a cluster member
cluster_ratio = algorithm_pass.cluster_ratio
# initialize a dictionary to hold the clusters of patients for each person
clusters: dict[models.Person, list[models.Patient]] = collections.defaultdict(list)
# block on the pii_record and the algorithm's blocking criteria, then
Expand All @@ -107,17 +107,33 @@ def link_record_against_mpi(
if compare(record, patient, algorithm_pass):
matched_count += 1
# calculate the match ratio for this person cluster
match_ratio = matched_count / len(patients)
if match_ratio >= cluster_ratio:
belongingness_ratio = matched_count / len(patients)
if belongingness_ratio >= belongingness_ratio_lower_bound:
# The match ratio is larger than the minimum cluster threshold,
# optionally update the max score for this person
scores[person] = max(scores[person], match_ratio)
scores[person] = max(scores[person], belongingness_ratio)

matched_person: typing.Optional[models.Person] = None
if scores:
# Find the person with the highest matching score
matched_person, _ = max(scores.items(), key=lambda i: i[1])

sorted_scores = [{"person": k, "belongingness_ratio": v} for k, v in sorted(scores.items(), key=lambda item: item[1])]
if not scores:
# No match
matched_person = models.Person() # Create new Person Cluster
results = []
elif sorted_scores[0]["belongingness_ratio"] >= belongingness_ratio_upper_bound:
# Match (1 or many)
matched_person = sorted_scores[0]["person"]
results = [x for x in sorted_scores if x["belongingness_ratio"] >= belongingness_ratio_upper_bound] # Multiple matches
if not algorithm.include_multiple_matches:
results = results[0:0] # 1 Match
else:
# Possible match
matched_person = None
results = sorted_scores

with TRACER.start_as_current_span("insert"):
patient = mpi_service.insert_patient(
session,
Expand All @@ -128,4 +144,4 @@ def link_record_against_mpi(
)

    # return the inserted patient, the person cluster it was linked to (None for a
    # possible match), and the list of scored person-cluster results
return (bool(matched_person), patient.person.reference_id, patient.reference_id)
return (patient, patient.person, results)
2 changes: 0 additions & 2 deletions src/recordlinker/linking/mpi_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ def insert_patient(
"""
Insert a new patient record into the database.
"""
# create a new Person record if one isn't provided
person = person or models.Person()

patient = models.Patient(person=person, record=record, external_patient_id=external_patient_id)

Expand Down
18 changes: 17 additions & 1 deletion src/recordlinker/models/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,27 @@ class Algorithm(Base):
is_default: orm.Mapped[bool] = orm.mapped_column(default=False, index=True)
label: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255), unique=True)
description: orm.Mapped[str] = orm.mapped_column(sqltypes.Text(), nullable=True)
include_multiple_matches: orm.Mapped[bool] = orm.mapped_column(sqltypes.Boolean, default=True)
belongingness_ratio_lower_bound: orm.Mapped[float] = orm.mapped_column(sqltypes.Float, default=1.0)
belongingness_ratio_upper_bound: orm.Mapped[float] = orm.mapped_column(sqltypes.Float, default=1.0)
passes: orm.Mapped[list["AlgorithmPass"]] = orm.relationship(
back_populates="algorithm", cascade="all, delete-orphan"
)

@property
def belongingness_ratio(self) -> tuple[float, float]:
"""
        Get the Belongingness Ratio Threshold Range for this algorithm.
"""
return (self.belongingness_ratio_lower_bound, self.belongingness_ratio_upper_bound)

@belongingness_ratio.setter # type: ignore
def belongingness_ratio(self, value: tuple[float, float]):
"""
        Set the Belongingness Ratio Threshold Range for this algorithm.
"""
self.belongingness_ratio_lower_bound, self.belongingness_ratio_upper_bound = value

@classmethod
def from_dict(cls, **data: dict) -> "Algorithm":
"""
Expand Down Expand Up @@ -81,7 +98,6 @@ class AlgorithmPass(Base):
blocking_keys: orm.Mapped[list[str]] = orm.mapped_column(sqltypes.JSON)
_evaluators: orm.Mapped[dict[str, str]] = orm.mapped_column("evaluators", sqltypes.JSON)
_rule: orm.Mapped[str] = orm.mapped_column("rule", sqltypes.String(255))
cluster_ratio: orm.Mapped[float] = orm.mapped_column(sqltypes.Float)
kwargs: orm.Mapped[dict] = orm.mapped_column(sqltypes.JSON, default=dict)

@property
Expand Down
2 changes: 1 addition & 1 deletion src/recordlinker/models/mpi.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class Patient(Base):
__tablename__ = "mpi_patient"

id: orm.Mapped[int] = orm.mapped_column(get_bigint_pk(), autoincrement=True, primary_key=True)
person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey(f"{Person.__tablename__}.id"))
person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey(f"{Person.__tablename__}.id"), nullable=True)
person: orm.Mapped["Person"] = orm.relationship(back_populates="patients")
# NOTE: We're using a protected attribute here to store the data string, as we
# want getter/setter access to the data dictionary to trigger updating the
Expand Down
18 changes: 9 additions & 9 deletions src/recordlinker/routes/link_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,16 @@ async def link_piirecord(
# link the record
try:
# Make a copy of record_to_link so we don't modify the original
(found_match, new_person_id, patient_reference_id) = link.link_record_against_mpi(
(patient, person, results) = link.link_record_against_mpi(
record=input.record,
session=db_session,
algorithm=algorithm,
external_person_id=input.external_person_id,
)
return schemas.LinkResponse(
is_match=found_match,
patient_reference_id=patient_reference_id,
person_reference_id=new_person_id,
patient_reference_id=patient.reference_id,
person_reference_id=person.reference_id,
results=results
)

except ValueError:
Expand Down Expand Up @@ -173,16 +173,16 @@ async def link_fhir(
# link the record
try:
# Make a copy of pii_record so we don't modify the original
(found_match, new_person_id, patient_reference_id) = link.link_record_against_mpi(
(patient, person, results) = link.link_record_against_mpi(
record=pii_record,
session=db_session,
algorithm=algorithm,
external_person_id=external_id,
)
return schemas.LinkResponse(
is_match=found_match,
patient_reference_id=patient_reference_id,
person_reference_id=new_person_id,
return schemas.LinkResponse(
patient_reference_id=patient.reference_id,
person_reference_id=person.reference_id,
results=results
)

except ValueError:
Expand Down
19 changes: 17 additions & 2 deletions src/recordlinker/schemas/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ class AlgorithmPass(pydantic.BaseModel):
blocking_keys: list[str]
evaluators: dict[str, str]
rule: str
cluster_ratio: float
kwargs: dict[str, typing.Any] = {}

@pydantic.field_validator("blocking_keys", mode="before")
Expand All @@ -43,7 +42,7 @@ def validate_blocking_keys(cls, value):
@pydantic.field_validator("evaluators", mode="before")
def validate_evaluators(cls, value):
"""
Validated the evaluators into a list of feature comparison functions.
Validate the evaluators into a list of feature comparison functions.
"""
for k, v in value.items():
try:
Expand Down Expand Up @@ -77,8 +76,24 @@ class Algorithm(pydantic.BaseModel):
label: str = pydantic.Field(pattern=r"^[a-z0-9]+(?:-[a-z0-9]+)*$")
description: typing.Optional[str] = None
is_default: bool = False
include_multiple_matches: bool = True
belongingness_ratio: tuple[float, float]
passes: typing.Sequence[AlgorithmPass]

@pydantic.field_validator("belongingness_ratio", mode="before")
def validate_belongingness_ratio(cls, value):
"""
Validate the Belongingness Ratio Threshold Range.
"""
lower_bound, upper_bound = value
if lower_bound < 0 or lower_bound > 1:
raise ValueError(f"Invalid lower bound: {lower_bound}")
if upper_bound < 0 or upper_bound > 1:
raise ValueError(f"Invalid upper bound: {upper_bound}")
if lower_bound > upper_bound:
            raise ValueError(f"Invalid range. Lower bound must be less than or equal to upper bound: {value}")
return (lower_bound, upper_bound)


class AlgorithmSummary(Algorithm):
"""
Expand Down
38 changes: 31 additions & 7 deletions src/recordlinker/schemas/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,46 @@ class LinkInput(pydantic.BaseModel):
)


class LinkResult(pydantic.BaseModel):
    """
    Schema for a single possible-match result: a person cluster the incoming
    patient record may belong to, with its belongingness ratio.
    """

person_reference_id: uuid.UUID = pydantic.Field(
description="The identifier for a person that the patient may be linked to."
)

belongingness_ratio: float = pydantic.Field(
description="The percentage of patient records matched in this person cluster."
)


class LinkResponse(pydantic.BaseModel):
"""
Schema for responses from the link endpoint.
"""

is_match: bool = pydantic.Field(
description="A true value indicates that one or more existing records "
"matched with the provided record, and these results have been linked."
)

patient_reference_id: uuid.UUID = pydantic.Field(
description="The unique identifier for the patient that has been linked"
description="The unique identifier for the patient that has been linked."
)
person_reference_id: uuid.UUID = pydantic.Field(
description="The identifier for the person that the patient record has " "been linked to.",
person_reference_id: uuid.UUID | None = pydantic.Field(
description="The identifier for the person that the patient record has been linked to.",
)
    results: list[LinkResult] = pydantic.Field(
        description="The person clusters that the patient record could belong to, "
        "each with its belongingness ratio."
    )

@pydantic.computed_field
@property
def prediction(self) -> typing.Literal["match", "possible_match", "no_match"]:
if self.person_reference_id and self.results:
return "match"
elif not self.results:
return "no_match"
else:
return "possible_match"


class LinkFhirInput(pydantic.BaseModel):
"""
Expand Down
17 changes: 8 additions & 9 deletions tests/unit/linking/test_algorithm_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,12 @@ def test_load_algorithm_created(self, session):
data = schemas.Algorithm(
label="dibss-basic",
description="First algorithm",
belongingness_ratio=(0.75, 0.8),
passes=[
schemas.AlgorithmPass(
blocking_keys=["FIRST_NAME"],
evaluators={"ZIP": "func:recordlinker.linking.matchers.feature_match_any"},
rule="func:recordlinker.linking.matchers.eval_perfect_match",
cluster_ratio=0.8,
)
],
)
Expand All @@ -82,25 +82,26 @@ def test_load_algorithm_created(self, session):
assert obj.id == 1
assert obj.label == "dibss-basic"
assert obj.description == "First algorithm"
assert obj.belongingness_ratio == (0.75, 0.8)
assert len(obj.passes) == 1
assert obj.passes[0].algorithm_id == 1
assert obj.passes[0].blocking_keys == ["FIRST_NAME"]
assert obj.passes[0].evaluators == {
"ZIP": "func:recordlinker.linking.matchers.feature_match_any"
}
assert obj.passes[0].rule == "func:recordlinker.linking.matchers.eval_perfect_match"
assert obj.passes[0].cluster_ratio == 0.8


def test_load_algorithm_updated(self, session):
data = schemas.Algorithm(
label="dibss-basic",
description="First algorithm",
belongingness_ratio=(0.75, 0.8),
passes=[
schemas.AlgorithmPass(
blocking_keys=["FIRST_NAME"],
evaluators={"ZIP": "func:recordlinker.linking.matchers.feature_match_any"},
rule="func:recordlinker.linking.matchers.eval_perfect_match",
cluster_ratio=0.8,
rule="func:recordlinker.linking.matchers.eval_perfect_match"
)
],
)
Expand All @@ -114,14 +115,14 @@ def test_load_algorithm_updated(self, session):
assert obj.id == 1
assert obj.label == "dibss-basic"
assert obj.description == "Updated description"
assert obj.belongingness_ratio == (0.75, 0.8)
assert len(obj.passes) == 1
assert obj.passes[0].algorithm_id == 1
assert obj.passes[0].blocking_keys == ["LAST_NAME"]
assert obj.passes[0].evaluators == {
"ZIP": "func:recordlinker.linking.matchers.feature_match_any"
}
assert obj.passes[0].rule == "func:recordlinker.linking.matchers.eval_perfect_match"
assert obj.passes[0].cluster_ratio == 0.8


def test_delete_algorithm(session):
Expand All @@ -133,8 +134,7 @@ def test_delete_algorithm(session):
algorithm=algo1,
blocking_keys=["FIRST_NAME"],
evaluators={"ZIP": "func:recordlinker.linking.matchers.feature_match_any"},
rule="func:recordlinker.linking.matchers.eval_perfect_match",
cluster_ratio=0.8,
rule="func:recordlinker.linking.matchers.eval_perfect_match"
)
session.add(pass1)
session.commit()
Expand All @@ -153,8 +153,7 @@ def test_clear_algorithms(session):
algorithm=algo1,
blocking_keys=["FIRST_NAME"],
evaluators={"ZIP": "func:recordlinker.linking.matchers.feature_match_any"},
rule="func:recordlinker.linking.matchers.eval_perfect_match",
cluster_ratio=0.8,
rule="func:recordlinker.linking.matchers.eval_perfect_match"
)
session.add(pass1)
session.commit()
Expand Down
Loading
Loading