Skip to content

Commit

Permalink
Merge pull request #65 from MITLibraries/TIMX-388-TIMX-389-missing-re…
Browse files Browse the repository at this point in the history
…cords-and-final-records

TIMX 388, 389 - handle missing A or B records and construction of final records dataset
  • Loading branch information
ghukill authored Nov 13, 2024
2 parents 082b9f3 + afb5a6f commit d150081
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 23 deletions.
21 changes: 14 additions & 7 deletions abdiff/core/calc_ab_diffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def calc_record_diff(
*,
ignore_order: bool = True,
report_repetition: bool = True,
) -> tuple[str, set[str], bool]:
) -> tuple[str | None, set[str], bool]:
"""Calculate diff from two JSON byte strings.
The DeepDiff library has the property 'affected_root_keys' on the produced diff object
Expand All @@ -124,14 +124,21 @@ def calc_record_diff(
We also serialize the full diff to JSON via the to_json() method for storage and
possible further analysis.
If either A or B record is None, we immediately return (None, set(), True) which
indicates the records are different, but it would be inaccurate to provide any details
about the difference given that one is absent. If both records are None, we return
(None, set(), False) because truly no diff.
Returns tuple(ab_diff, modified_timdex_fields, has_diff):
- ab_diff: [str] - full diff as JSON
- modified_timdex_fields: list[str] - list of modified root keys (TIMDEX fields)
- has_diff: bool - True/False if any diff present
- ab_diff: full diff as JSON
- modified_timdex_fields: set of modified root keys (TIMDEX fields)
- has_diff: True/False if any diff present
"""
# Replace None with empty dict
record_a = record_a or {}
record_b = record_b or {}
# handle cases where record A and/or B is None
if record_a is None and record_b is None:
return None, set(), False # both absent, no diff
if record_a is None or record_b is None:
return None, set(), True # one absent, so diff, but no details needed

# Parse JSON strings or bytes into dictionaries
if isinstance(record_a, (str | bytes)):
Expand Down
21 changes: 17 additions & 4 deletions abdiff/core/calc_ab_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

logger = logging.getLogger(__name__)

NON_TIMDEX_FIELD_COLUMNS = ["timdex_record_id", "source", "has_diff"]
NON_TIMDEX_FIELD_COLUMNS = ["timdex_record_id", "source", "has_diff", "a_or_b_missing"]


def calc_ab_metrics(
Expand Down Expand Up @@ -65,22 +65,35 @@ def create_record_diff_matrix_dataset(
for i, batch in enumerate(
diffs_ds.to_batches(
batch_size=batch_size,
columns=["timdex_record_id", "source", "modified_timdex_fields", "has_diff"],
columns=[
"timdex_record_id",
"source",
"modified_timdex_fields",
"has_diff",
"record_a",
"record_b",
],
)
):
start_time = time.time()
batch_df = batch.to_pandas()

batch_metrics = []
for _, row in batch_df.iterrows():
has_diff = 1 if row["has_diff"] == "true" else 0
a_or_b_missing = (
1 if (pd.isna(row["record_a"]) or pd.isna(row["record_b"])) else 0
)

record_metrics = {
"timdex_record_id": row["timdex_record_id"],
"source": row["source"],
"has_diff": (1 if row["has_diff"] == "true" else 0),
"has_diff": has_diff,
"a_or_b_missing": a_or_b_missing,
}

# for each modified field (root key in diff), set column and value = 1 (True)
if row["modified_timdex_fields"] is not None:
if not a_or_b_missing and row["modified_timdex_fields"] is not None:
for field in row["modified_timdex_fields"]:
record_metrics[field] = 1

Expand Down
35 changes: 24 additions & 11 deletions abdiff/core/create_final_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import duckdb
import pyarrow as pa
import pyarrow.dataset as ds

from abdiff.config import Config
from abdiff.core.utils import (
Expand Down Expand Up @@ -38,13 +37,10 @@ def create_final_records(
"""
logger.info("Creating final records dataset from 'diffs' and 'metrics' datasets.")
run_data = read_run_json(run_directory)

diffs_dataset = load_dataset(diffs_dataset_path)
metrics_dataset = load_dataset(metrics_dataset_path)

metrics_timdex_field_columns = run_data["metrics"]["summary"]["fields_with_diffs"]

# get list of unique columns from metrics dataset, and create final dataset schema
metrics_dataset = load_dataset(metrics_dataset_path)
metrics_columns = (
pa.field(name, pa.int64())
for name in metrics_dataset.schema.names
Expand All @@ -59,6 +55,7 @@ def create_final_records(
pa.field("ab_diff", pa.string()),
pa.field("modified_timdex_fields", pa.list_(pa.string())),
pa.field("has_diff", pa.string()),
pa.field("a_or_b_missing", pa.int32()),
*metrics_columns, # type: ignore[arg-type]
)
)
Expand All @@ -67,7 +64,7 @@ def create_final_records(
records_dataset_path = str(Path(run_directory) / "records")
write_to_dataset(
get_final_records_iter(
diffs_dataset, metrics_dataset, metrics_timdex_field_columns
diffs_dataset_path, metrics_dataset_path, metrics_timdex_field_columns
),
base_dir=records_dataset_path,
schema=final_records_dataset_schema,
Expand All @@ -83,16 +80,32 @@ def create_final_records(


def get_final_records_iter(
diffs_dataset: ds.Dataset,
metrics_dataset: ds.Dataset,
diffs_dataset_path: str,
metrics_dataset_path: str,
metrics_timdex_field_columns: list[str],
) -> Generator[pa.RecordBatch, None, None]:

with duckdb.connect(":memory:") as conn:

# register datasets in DuckDB for use
conn.register("diffs", diffs_dataset.to_table())
conn.register("metrics", metrics_dataset.to_table())
# create views for diffs and metrics datasets to join
conn.execute(
f"""
create view diffs as
select * from read_parquet(
'{diffs_dataset_path}/**/*.parquet',
hive_partitioning=true
)
"""
)
conn.execute(
f"""
create view metrics as
select * from read_parquet(
'{metrics_dataset_path}/**/*.parquet',
hive_partitioning=true
)
"""
)

# prepare select columns
select_columns = ",".join(
Expand Down
2 changes: 1 addition & 1 deletion tests/test_calc_ab_diffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def test_calc_record_diff_array_repetition_is_reported_when_diff():
def test_calc_record_handles_missing_a_or_b():
a, b = None, {"color": "green"}
ab_diff, modified_timdex_fields, has_diff = calc_record_diff(a, b)
assert modified_timdex_fields == ["color"]
assert modified_timdex_fields == set()
assert has_diff


Expand Down
2 changes: 2 additions & 0 deletions tests/test_calc_ab_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def test_duckdb_context_creates_record_diff_matrix_view(
"timdex_record_id",
"source",
"has_diff",
"a_or_b_missing",
"color",
"fruit",
"number",
Expand All @@ -96,6 +97,7 @@ def test_duckdb_context_creates_record_diff_matrix_view(
"timdex_record_id": "abc123",
"source": "alma",
"has_diff": 1.0,
"a_or_b_missing": 0.0,
"color": 1.0,
"fruit": 0.0,
"number": 0.0,
Expand Down

0 comments on commit d150081

Please sign in to comment.