diff --git a/abdiff/core/calc_ab_diffs.py b/abdiff/core/calc_ab_diffs.py index ab8ee94..e311111 100644 --- a/abdiff/core/calc_ab_diffs.py +++ b/abdiff/core/calc_ab_diffs.py @@ -110,7 +110,7 @@ def calc_record_diff( *, ignore_order: bool = True, report_repetition: bool = True, -) -> tuple[str, set[str], bool]: +) -> tuple[str | None, set[str], bool]: """Calculate diff from two JSON byte strings. The DeepDiff library has the property 'affected_root_keys' on the produced diff object @@ -124,14 +124,21 @@ def calc_record_diff( We also serialize the full diff to JSON via the to_json() method for storage and possible further analysis. + If either A or B record is None, we immediately return (None, set(), True) which + indicates the records are different, but it would be inaccurate to provide any details + about the difference given that one is absent. If both records are None, we return + (None, set(), False) because truly no diff. + Returns tuple(ab_diff, modified_timdex_fields, has_diff): - - ab_diff: [str] - full diff as JSON - - modified_timdex_fields: list[str] - list of modified root keys (TIMDEX fields) - - has_diff: bool - True/False if any diff present + - ab_diff: full diff as JSON + - modified_timdex_fields: set of modified root keys (TIMDEX fields) + - has_diff: True/False if any diff present """ - # Replace None with empty dict - record_a = record_a or {} - record_b = record_b or {} + # handle cases where record A and/or B is None + if record_a is None and record_b is None: + return None, set(), False # both absent, no diff + if record_a is None or record_b is None: + return None, set(), True # one absent, so diff, but no details needed # Parse JSON strings or bytes into dictionaries if isinstance(record_a, (str | bytes)): diff --git a/abdiff/core/calc_ab_metrics.py b/abdiff/core/calc_ab_metrics.py index bc232b6..40fcff2 100644 --- a/abdiff/core/calc_ab_metrics.py +++ b/abdiff/core/calc_ab_metrics.py @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) -NON_TIMDEX_FIELD_COLUMNS = ["timdex_record_id", "source", "has_diff"] +NON_TIMDEX_FIELD_COLUMNS = ["timdex_record_id", "source", "has_diff", "a_or_b_missing"] def calc_ab_metrics( @@ -65,7 +65,14 @@ def create_record_diff_matrix_dataset( for i, batch in enumerate( diffs_ds.to_batches( batch_size=batch_size, - columns=["timdex_record_id", "source", "modified_timdex_fields", "has_diff"], + columns=[ + "timdex_record_id", + "source", + "modified_timdex_fields", + "has_diff", + "record_a", + "record_b", + ], ) ): start_time = time.time() @@ -73,14 +80,20 @@ def create_record_diff_matrix_dataset( batch_metrics = [] for _, row in batch_df.iterrows(): + has_diff = 1 if row["has_diff"] == "true" else 0 + a_or_b_missing = ( + 1 if (pd.isna(row["record_a"]) or pd.isna(row["record_b"])) else 0 + ) + record_metrics = { "timdex_record_id": row["timdex_record_id"], "source": row["source"], - "has_diff": (1 if row["has_diff"] == "true" else 0), + "has_diff": has_diff, + "a_or_b_missing": a_or_b_missing, } # for each modified field (root key in diff), set column and value = 1 (True) - if row["modified_timdex_fields"] is not None: + if not a_or_b_missing and row["modified_timdex_fields"] is not None: for field in row["modified_timdex_fields"]: record_metrics[field] = 1 diff --git a/abdiff/core/create_final_records.py b/abdiff/core/create_final_records.py index f7bc40b..b7517f8 100644 --- a/abdiff/core/create_final_records.py +++ b/abdiff/core/create_final_records.py @@ -4,7 +4,6 @@ import duckdb import pyarrow as pa -import pyarrow.dataset as ds from abdiff.config import Config from abdiff.core.utils import ( @@ -38,13 +37,10 @@ def create_final_records( """ logger.info("Creating final records dataset from 'diffs' and 'metrics' datasets.") run_data = read_run_json(run_directory) - - diffs_dataset = load_dataset(diffs_dataset_path) - metrics_dataset = load_dataset(metrics_dataset_path) - metrics_timdex_field_columns = run_data["metrics"]["summary"]["fields_with_diffs"] # get list of unique columns from metrics dataset, and create final dataset schema + metrics_dataset = load_dataset(metrics_dataset_path) metrics_columns = ( pa.field(name, pa.int64()) for name in metrics_dataset.schema.names @@ -59,6 +55,7 @@ def create_final_records( pa.field("ab_diff", pa.string()), pa.field("modified_timdex_fields", pa.list_(pa.string())), pa.field("has_diff", pa.string()), + pa.field("a_or_b_missing", pa.int32()), *metrics_columns, # type: ignore[arg-type] ) ) @@ -67,7 +64,7 @@ def create_final_records( records_dataset_path = str(Path(run_directory) / "records") write_to_dataset( get_final_records_iter( - diffs_dataset, metrics_dataset, metrics_timdex_field_columns + diffs_dataset_path, metrics_dataset_path, metrics_timdex_field_columns ), base_dir=records_dataset_path, schema=final_records_dataset_schema, @@ -83,16 +80,32 @@ def create_final_records( def get_final_records_iter( - diffs_dataset: ds.Dataset, - metrics_dataset: ds.Dataset, + diffs_dataset_path: str, + metrics_dataset_path: str, metrics_timdex_field_columns: list[str], ) -> Generator[pa.RecordBatch, None, None]: with duckdb.connect(":memory:") as conn: - # register datasets in DuckDB for use - conn.register("diffs", diffs_dataset.to_table()) - conn.register("metrics", metrics_dataset.to_table()) + # create views for diffs and metrics datasets to join + conn.execute( + f""" + create view diffs as + select * from read_parquet( + '{diffs_dataset_path}/**/*.parquet', + hive_partitioning=true + ) + """ + ) + conn.execute( + f""" + create view metrics as + select * from read_parquet( + '{metrics_dataset_path}/**/*.parquet', + hive_partitioning=true + ) + """ + ) # prepare select columns select_columns = ",".join( diff --git a/tests/test_calc_ab_diffs.py b/tests/test_calc_ab_diffs.py index 68d5537..48e53b0 100644 --- a/tests/test_calc_ab_diffs.py +++ b/tests/test_calc_ab_diffs.py @@ -77,7 +77,7 @@ def test_calc_record_diff_array_repetition_is_reported_when_diff(): def test_calc_record_handles_missing_a_or_b(): a, b = None, {"color": "green"} ab_diff, modified_timdex_fields, has_diff = calc_record_diff(a, b) - assert modified_timdex_fields == ["color"] + assert modified_timdex_fields == set() assert has_diff diff --git a/tests/test_calc_ab_metrics.py b/tests/test_calc_ab_metrics.py index 53c4970..045cdea 100644 --- a/tests/test_calc_ab_metrics.py +++ b/tests/test_calc_ab_metrics.py @@ -88,6 +88,7 @@ def test_duckdb_context_creates_record_diff_matrix_view( "timdex_record_id", "source", "has_diff", + "a_or_b_missing", "color", "fruit", "number", @@ -96,6 +97,7 @@ def test_duckdb_context_creates_record_diff_matrix_view( "timdex_record_id": "abc123", "source": "alma", "has_diff": 1.0, + "a_or_b_missing": 0.0, "color": 1.0, "fruit": 0.0, "number": 0.0,