diff --git a/docs/api-reference/expressions.md b/docs/api-reference/expressions.md index c01dd8fe6..c487ff0ff 100644 --- a/docs/api-reference/expressions.md +++ b/docs/api-reference/expressions.md @@ -10,6 +10,7 @@ - cast - diff - drop_nulls + - fill_null - filter - is_between - is_in diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 84e66e404..e1171c31f 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -11,6 +11,7 @@ - diff - drop_nulls - dtype + - fill_null - filter - is_between - is_in diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 6b002a41d..180fa4da8 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -178,6 +178,9 @@ def is_between( def is_null(self) -> Self: return register_expression_call(self, "is_null") + def fill_null(self, value: Any) -> Self: + return register_expression_call(self, "fill_null", value) + def is_in(self, other: Any) -> Self: return register_expression_call(self, "is_in", other) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 0bbe52657..28747366c 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -335,6 +335,10 @@ def is_null(self) -> PandasSeries: ser = self._series return self._from_series(ser.isna()) + def fill_null(self, value: Any) -> PandasSeries: + ser = self._series + return self._from_series(ser.fillna(value)) + def drop_nulls(self) -> PandasSeries: ser = self._series return self._from_series(ser.dropna()) diff --git a/narwhals/expression.py b/narwhals/expression.py index 915766f9d..e7100d4e2 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -604,6 +604,10 @@ def is_null(self) -> Expr: """ Returns a boolean Series indicating which values are null. + Notes: + pandas and Polars handle null values differently. Polars distinguishes + between NaN and Null, whereas pandas doesn't. + Examples: >>> import pandas as pd >>> import polars as pl @@ -657,6 +661,64 @@ def is_null(self) -> Expr: """ return self.__class__(lambda plx: self._call(plx).is_null()) + def fill_null(self, value: Any) -> Expr: + """ + Fill null values with given value. + + Notes: + pandas and Polars handle null values differently. Polars distinguishes + between NaN and Null, whereas pandas doesn't. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> df_pd = pd.DataFrame( + ... { + ... 'a': [2, 4, None, 3, 5], + ... 'b': [2.0, 4.0, float("nan"), 3.0, 5.0] + ... } + ... ) + >>> df_pl = pl.DataFrame( + ... { + ... 'a': [2, 4, None, 3, 5], + ... 'b': [2.0, 4.0, float("nan"), 3.0, 5.0] + ... } + ... ) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... df = df.with_columns(nw.col('a', 'b').fill_null(0)) + ... return nw.to_native(df) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + a b + 0 2.0 2.0 + 1 4.0 4.0 + 2 0.0 0.0 + 3 3.0 3.0 + 4 5.0 5.0 + + >>> func(df_pl) # nan != null for polars + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 2 ┆ 2.0 │ + │ 4 ┆ 4.0 │ + │ 0 ┆ NaN │ + │ 3 ┆ 3.0 │ + │ 5 ┆ 5.0 │ + └─────┴─────┘ + """ + return self.__class__(lambda plx: self._call(plx).fill_null(value)) + # --- partial reduction --- def drop_nulls(self) -> Expr: return self.__class__(lambda plx: self._call(plx).drop_nulls()) diff --git a/narwhals/series.py b/narwhals/series.py index 16d9a9560..ef549c6fc 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -196,6 +196,9 @@ def sort(self, *, descending: bool = False) -> Self: def is_null(self) -> Self: return self._from_series(self._series.is_null()) + def fill_null(self, value: Any) -> Self: + return self._from_series(self._series.fill_null(value)) + def is_between( self, lower_bound: Any, upper_bound: Any, closed: str = "both" ) -> Self: diff --git a/tests/expr/fill_null_test.py b/tests/expr/fill_null_test.py new file mode 100644 index 000000000..f48815495 --- /dev/null +++ b/tests/expr/fill_null_test.py @@ -0,0 +1,32 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [0.0, None, 2, 3, 4], + "b": [1.0, None, None, 5, 3], + "c": [5.0, None, 3, 2, 1], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_over_single(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.with_columns(nw.all().fill_null(99)) + expected = { + "a": [0.0, 99, 2, 3, 4], + "b": [1.0, 99, 99, 5, 3], + "c": [5.0, 99, 3, 2, 1], + } + compare_dicts(result, expected) + result = df.with_columns( + a=df["a"].fill_null(99), + b=df["b"].fill_null(99), + c=df["c"].fill_null(99), + ) + compare_dicts(result, expected)