Skip to content

Commit

Permalink
Revert "fix(python): Add include_nulls parameter to update (pola-rs#11830)"
Browse files Browse the repository at this point in the history

This reverts commit d9c6316.
  • Loading branch information
reswqa committed Nov 7, 2023
1 parent d5cb4d1 commit 6f8f4f7
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 100 deletions.
34 changes: 3 additions & 31 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9758,19 +9758,13 @@ def update(
left_on: str | Sequence[str] | None = None,
right_on: str | Sequence[str] | None = None,
how: Literal["left", "inner", "outer"] = "left",
include_nulls: bool | None = False,
) -> DataFrame:
"""
Update the values in this `DataFrame` with the values in `other`.
By default, null values in the right dataframe are ignored. Use
`include_nulls=True` to overwrite values in this frame with null values in the
other frame.
Update the values in this `DataFrame` with the non-null values in `other`.
Notes
-----
This is syntactic sugar for a left/inner join, with an optional coalesce when
`include_nulls = False`.
This is syntactic sugar for a left/inner join + coalesce
Warnings
--------
Expand All @@ -9794,9 +9788,6 @@ def update(
* 'inner' keeps only those rows where the key exists in both frames.
* 'outer' will update existing rows where the key matches while also
adding any new rows contained in the given frame.
include_nulls
If True, null values from the right dataframe will be used to update the
left dataframe.
Examples
--------
Expand Down Expand Up @@ -9872,29 +9863,10 @@ def update(
│ 5 ┆ -66 │
└─────┴─────┘
Update `df` values including null values in `new_df`, using an outer join
strategy that defines explicit join columns in each frame:
>>> df.update(
... new_df, left_on="A", right_on="C", how="outer", include_nulls=True
... )
shape: (5, 2)
┌─────┬──────┐
│ A ┆ B │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪══════╡
│ 1 ┆ -99 │
│ 2 ┆ 500 │
│ 3 ┆ null │
│ 4 ┆ 700 │
│ 5 ┆ -66 │
└─────┴──────┘
"""
return (
self.lazy()
.update(other.lazy(), on, left_on, right_on, how, include_nulls)
.update(other.lazy(), on, left_on, right_on, how)
.collect(_eager=True)
)

Expand Down
56 changes: 9 additions & 47 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5663,7 +5663,6 @@ def update(
left_on: str | Sequence[str] | None = None,
right_on: str | Sequence[str] | None = None,
how: Literal["left", "inner", "outer"] = "left",
include_nulls: bool | None = False,
) -> Self:
"""
Update the values in this `LazyFrame` with the non-null values in `other`.
Expand All @@ -5685,14 +5684,10 @@ def update(
* 'inner' keeps only those rows where the key exists in both frames.
* 'outer' will update existing rows where the key matches while also
adding any new rows contained in the given frame.
include_nulls
If True, null values from the right dataframe will be used to update the
left dataframe.
Notes
-----
This is syntactic sugar for a left/inner join, with an optional coalesce when
`include_nulls = False`.
This is syntactic sugar for a join + coalesce (upsert) operation.
Examples
--------
Expand Down Expand Up @@ -5768,25 +5763,6 @@ def update(
│ 5 ┆ -66 │
└─────┴─────┘
Update `df` values including null values in `new_df`, using an outer join
strategy that defines explicit join columns in each frame:
>>> lf.update(
... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True
... ).collect()
shape: (5, 2)
┌─────┬──────┐
│ A ┆ B │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪══════╡
│ 1 ┆ -99 │
│ 2 ┆ 500 │
│ 3 ┆ null │
│ 4 ┆ 700 │
│ 5 ┆ -66 │
└─────┴──────┘
"""
if how not in ("left", "inner", "outer"):
raise ValueError(
Expand Down Expand Up @@ -5835,38 +5811,24 @@ def update(
# only use non-idx right columns present in left frame
right_other = set(other.columns).intersection(self.columns) - set(right_on)

# When include_nulls is True, we need to distinguish records after the join that
# were originally null in the right frame, as opposed to records that were null
# because the key was missing from the right frame.
# Add a validity column to track whether row was matched or not.
if include_nulls:
validity = ("__POLARS_VALIDITY",)
other = other.with_columns(F.lit(True).alias(validity[0]))
else:
validity = () # type: ignore[assignment]

tmp_name = "__POLARS_RIGHT"
drop_columns = [*(f"{name}{tmp_name}" for name in right_other), *validity]
result = (
self.join(
other.select(*right_on, *right_other, *validity),
other.select(*right_on, *right_other),
left_on=left_on,
right_on=right_on,
how=how,
suffix=tmp_name,
)
.with_columns(
(
# use left value only when right value failed to join
F.when(F.col(validity).is_null())
.then(F.col(name))
.otherwise(F.col(f"{name}{tmp_name}"))
if include_nulls
else F.coalesce([f"{name}{tmp_name}", F.col(name)])
).alias(name)
for name in right_other
[
F.coalesce([f"{column_name}{tmp_name}", F.col(column_name)]).alias(
column_name
)
for column_name in right_other
]
)
.drop(drop_columns)
.drop([f"{name}{tmp_name}" for name in right_other])
)
if row_count_used:
result = result.drop(row_count_name)
Expand Down
22 changes: 0 additions & 22 deletions py-polars/tests/unit/operations/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,28 +562,6 @@ def test_update() -> None:
a.update(b.rename({"b": "a"}), how="outer", on="a").collect().to_series()
)

# check behavior of include_nulls=True
df = pl.DataFrame(
{
"A": [1, 2, 3, 4],
"B": [400, 500, 600, 700],
}
)
new_df = pl.DataFrame(
{
"B": [-66, None, -99],
"C": [5, 3, 1],
}
)
out = df.update(new_df, left_on="A", right_on="C", how="outer", include_nulls=True)
expected = pl.DataFrame(
{
"A": [1, 2, 3, 4, 5],
"B": [-99, 500, None, 700, -66],
}
)
assert_frame_equal(out, expected)

# edge-case #11684
x = pl.DataFrame({"a": [0, 1]})
y = pl.DataFrame({"a": [2, 3]})
Expand Down

0 comments on commit 6f8f4f7

Please sign in to comment.