From 6f8f4f755b29539cf38cf1682b684ad5371e3d98 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Tue, 7 Nov 2023 10:56:05 +0800 Subject: [PATCH] Revert "fix(python): Add `include_nulls` parameter to `update` (#11830)" This reverts commit d9c63161fa62ac67b560e40e5a85a6ee8c559a9b. --- py-polars/polars/dataframe/frame.py | 34 ++---------- py-polars/polars/lazyframe/frame.py | 56 ++++---------------- py-polars/tests/unit/operations/test_join.py | 22 -------- 3 files changed, 12 insertions(+), 100 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 2487d47b2b8fa..cfb35c023a134 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -9758,19 +9758,13 @@ def update( left_on: str | Sequence[str] | None = None, right_on: str | Sequence[str] | None = None, how: Literal["left", "inner", "outer"] = "left", - include_nulls: bool | None = False, ) -> DataFrame: """ - Update the values in this `DataFrame` with the values in `other`. - - By default, null values in the right dataframe are ignored. Use - `ignore_nulls=False` to overwrite values in this frame with null values in other - frame. + Update the values in this `DataFrame` with the non-null values in `other`. Notes ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. + This is syntactic sugar for a left/inner join + coalesce Warnings -------- @@ -9794,9 +9788,6 @@ def update( * 'inner' keeps only those rows where the key exists in both frames. * 'outer' will update existing rows where the key matches while also adding any new rows contained in the given frame. - include_nulls - If True, null values from the right dataframe will be used to update the - left dataframe. Examples -------- @@ -9872,29 +9863,10 @@ def update( │ 5 ┆ -66 │ └─────┴─────┘ - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update( - ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - """ return ( self.lazy() - .update(other.lazy(), on, left_on, right_on, how, include_nulls) + .update(other.lazy(), on, left_on, right_on, how) .collect(_eager=True) ) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index fd0f4d24d9f3c..70a9798ef8c60 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -5663,7 +5663,6 @@ def update( left_on: str | Sequence[str] | None = None, right_on: str | Sequence[str] | None = None, how: Literal["left", "inner", "outer"] = "left", - include_nulls: bool | None = False, ) -> Self: """ Update the values in this `LazyFrame` with the non-null values in `other`. @@ -5685,14 +5684,10 @@ def update( * 'inner' keeps only those rows where the key exists in both frames. * 'outer' will update existing rows where the key matches while also adding any new rows contained in the given frame. - include_nulls - If True, null values from the right dataframe will be used to update the - left dataframe. Notes ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. + This is syntactic sugar for a join + coalesce (upsert) operation. Examples -------- @@ -5768,25 +5763,6 @@ def update( │ 5 ┆ -66 │ └─────┴─────┘ - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update( - ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True - ... ).collect() - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - """ if how not in ("left", "inner", "outer"): raise ValueError( @@ -5835,38 +5811,24 @@ def update( # only use non-idx right columns present in left frame right_other = set(other.columns).intersection(self.columns) - set(right_on) - # When include_nulls is True, we need to distinguish records after the join that - # were originally null in the right frame, as opposed to records that were null - # because the key was missing from the right frame. - # Add a validity column to track whether row was matched or not. - if include_nulls: - validity = ("__POLARS_VALIDITY",) - other = other.with_columns(F.lit(True).alias(validity[0])) - else: - validity = () # type: ignore[assignment] - tmp_name = "__POLARS_RIGHT" - drop_columns = [*(f"{name}{tmp_name}" for name in right_other), *validity] result = ( self.join( - other.select(*right_on, *right_other, *validity), + other.select(*right_on, *right_other), left_on=left_on, right_on=right_on, how=how, suffix=tmp_name, ) .with_columns( - ( - # use left value only when right value failed to join - F.when(F.col(validity).is_null()) - .then(F.col(name)) - .otherwise(F.col(f"{name}{tmp_name}")) - if include_nulls - else F.coalesce([f"{name}{tmp_name}", F.col(name)]) - ).alias(name) - for name in right_other + [ + F.coalesce([f"{column_name}{tmp_name}", F.col(column_name)]).alias( + column_name + ) + for column_name in right_other + ] ) - .drop(drop_columns) + .drop([f"{name}{tmp_name}" for name in right_other]) ) if row_count_used: result = result.drop(row_count_name) diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index d33c5920593c9..97c30f5f113b1 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -562,28 +562,6 @@ def test_update() -> None: a.update(b.rename({"b": "a"}), how="outer", on="a").collect().to_series() ) - # check behavior of include_nulls=True - df = pl.DataFrame( - { - "A": [1, 2, 3, 4], - "B": [400, 500, 600, 700], - } - ) - new_df = pl.DataFrame( - { - "B": [-66, None, -99], - "C": [5, 3, 1], - } - ) - out = df.update(new_df, left_on="A", right_on="C", how="outer", include_nulls=True) - expected = pl.DataFrame( - { - "A": [1, 2, 3, 4, 5], - "B": [-99, 500, None, 700, -66], - } - ) - assert_frame_equal(out, expected) - # edge-case #11684 x = pl.DataFrame({"a": [0, 1]}) y = pl.DataFrame({"a": [2, 3]})