From 89fe3f4c370210537aff0deffaf6a90626fbbf69 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Fri, 1 Nov 2024 19:13:28 +0100 Subject: [PATCH 1/6] is-finite for eager --- docs/api-reference/expr.md | 1 + docs/api-reference/series.md | 1 + narwhals/_arrow/expr.py | 3 ++ narwhals/_arrow/series.py | 5 ++++ narwhals/_pandas_like/expr.py | 3 ++ narwhals/_pandas_like/series.py | 5 ++++ narwhals/expr.py | 51 +++++++++++++++++++++++++++++++++ narwhals/series.py | 46 +++++++++++++++++++++++++++++ 8 files changed, 115 insertions(+) diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index 7188b2c36..99cbd8762 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -21,6 +21,7 @@ - clip - is_between - is_duplicated + - is_finite - is_first_distinct - is_in - is_last_distinct diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index e8572dda8..2a8dba1af 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -26,6 +26,7 @@ - is_between - is_duplicated - is_empty + - is_finite - is_first_distinct - is_in - is_last_distinct diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 35e936d72..3dab97678 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -372,6 +372,9 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: def mode(self: Self) -> Self: return reuse_series_implementation(self, "mode") + def is_finite(self: Self) -> Self: + return reuse_series_implementation(self, "is_finite") + @property def dt(self: Self) -> ArrowExprDateTimeNamespace: return ArrowExprDateTimeNamespace(self) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 70009df43..c20276b1b 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -721,6 +721,11 @@ def mode(self: Self) -> ArrowSeries: plx.col(col_token) == plx.col(col_token).max() )[self.name] + def is_finite(self: Self) -> Self: + import pyarrow.compute as pc # ignore-banned-import + + return self._from_native_series(pc.is_finite(self._native_series)) + def __iter__(self: Self) -> Iterator[Any]: yield from self._native_series.__iter__() diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index a58597eea..f5512219d 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -387,6 +387,9 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: def mode(self: Self) -> Self: return reuse_series_implementation(self, "mode") + def is_finite(self: Self) -> Self: + return reuse_series_implementation(self, "is_finite") + @property def str(self: Self) -> PandasLikeExprStringNamespace: return PandasLikeExprStringNamespace(self) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 35df78e2f..c2c525786 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -686,6 +686,11 @@ def mode(self: Self) -> Self: def __iter__(self: Self) -> Iterator[Any]: yield from self._native_series.__iter__() + def is_finite(self: Self) -> Self: + import numpy as np # ignore-banned-import + + return self._from_native_series(np.isfinite(self._native_series)) + @property def str(self) -> PandasLikeSeriesStringNamespace: return PandasLikeSeriesStringNamespace(self) diff --git a/narwhals/expr.py b/narwhals/expr.py index 6c2d28962..077e4841d 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -2313,6 +2313,57 @@ def mode(self: Self) -> Self: """ return self.__class__(lambda plx: self._call(plx).mode()) + def is_finite(self: Self) -> Self: + """ + Returns a boolean Series indicating which values are finite. + + Returns: + Expression of `Boolean` data type. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = { + ... "a": [1.0, 2], + ... "b": [3.0, float("inf")], + ... } + + We define a library agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.all().is_finite()) + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> func(pd.DataFrame(data)) + a b + 0 True True + 1 True False + + >>> func(pl.DataFrame(data)) + shape: (2, 2) + ┌──────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + >>> func(pa.table(data)) + pyarrow.Table + a: bool + b: bool + ---- + a: [[true,true]] + b: [[true,false]] + """ + return self.__class__(lambda plx: self._call(plx).is_finite()) + @property def str(self: Self) -> ExprStringNamespace[Self]: return ExprStringNamespace(self) diff --git a/narwhals/series.py b/narwhals/series.py index 6f5223202..79460408a 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2525,6 +2525,52 @@ def mode(self: Self) -> Self: """ return self._from_compliant_series(self._compliant_series.mode()) + def is_finite(self: Self) -> Self: + """ + Returns a boolean Series indicating which values are finite. + + Returns: + Expression of `Boolean` data type. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = [1.0, float("inf")] + + We define a library agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... return s.is_finite() + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> func(pd.Series(data)) + 0 True + 1 False + dtype: bool + + >>> func(pl.Series(data)) # doctest: +NORMALIZE_WHITESPACE + shape: (2,) + Series: '' [bool] + [ + true + false + ] + + >>> func(pa.chunked_array([data])) # doctest: +ELLIPSIS + + [ + [ + true, + false + ] + ] + """ + return self._from_compliant_series(self._compliant_series.is_finite()) + def __iter__(self: Self) -> Iterator[Any]: yield from self._compliant_series.__iter__() From cc3e72205b180431e2112d0b8859ab552f29a403 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Fri, 1 Nov 2024 19:33:27 +0100 Subject: [PATCH 2/6] add dask and test --- narwhals/_dask/expr.py | 9 +++++++++ tests/expr_and_series/is_finite_test.py | 27 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 tests/expr_and_series/is_finite_test.py diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index db29f6c4d..0adb58814 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -725,6 +725,15 @@ def func(_input: Any, dtype: DType | type[DType]) -> Any: returns_scalar=False, ) + def is_finite(self: Self) -> Self: + import dask.array as da # ignore-banned-import + + return self._from_call( + lambda _input: da.isfinite(_input), + "is_finite", + returns_scalar=False, + ) + class DaskExprStringNamespace: def __init__(self, expr: DaskExpr) -> None: diff --git a/tests/expr_and_series/is_finite_test.py b/tests/expr_and_series/is_finite_test.py new file mode 100644 index 000000000..6fee3311d --- /dev/null +++ b/tests/expr_and_series/is_finite_test.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import Constructor +from tests.utils import ConstructorEager +from tests.utils import assert_equal_data + +data = {"a": [float("nan"), float("inf"), 2.0]} +expected = {"a": [False, False, True]} + + +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") +def test_is_finite_expr(constructor: Constructor) -> None: + df = nw.from_native(constructor(data)) + result = df.select(nw.col("a").is_finite()) + + assert_equal_data(result, expected) + + +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") +def test_is_finite_series(constructor_eager: ConstructorEager) -> None: + df = nw.from_native(constructor_eager(data), eager_only=True) + result = {"a": df["a"].is_finite()} + + assert_equal_data(result, expected) From f603f6bcc134680d22b9ba1a1da5b74995d4b95c Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 9 Nov 2024 20:16:03 +0100 Subject: [PATCH 3/6] pandas treat nulls as nan --- narwhals/_pandas_like/series.py | 4 +- narwhals/expr.py | 54 +++++++++++++------------ narwhals/series.py | 21 +++++++--- tests/expr_and_series/is_finite_test.py | 14 +++++-- 4 files changed, 59 insertions(+), 34 deletions(-) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index a881f71c2..35d781699 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -722,7 +722,9 @@ def __iter__(self: Self) -> Iterator[Any]: def is_finite(self: Self) -> Self: import numpy as np # ignore-banned-import - return self._from_native_series(np.isfinite(self._native_series)) + return self._from_native_series( + np.isfinite(self._native_series) & ~self._native_series.isna() + ) @property def str(self) -> PandasLikeSeriesStringNamespace: diff --git a/narwhals/expr.py b/narwhals/expr.py index 45d8d4575..a2563eb07 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1003,7 +1003,8 @@ def replace_strict( ... def func(df): ... return df.with_columns( ... b=nw.col("a").replace_strict( - ... [0,1,2,3], ['zero', 'one', 'two', 'three'] + ... [0, 1, 2, 3], + ... ["zero", "one", "two", "three"], ... return_dtype=nw.String, ... ) ... ) @@ -1301,8 +1302,8 @@ def is_null(self) -> Self: Returns a boolean Series indicating which values are null. Notes: - pandas and Polars handle null values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas, Polars and PyArrow handle null values differently. Polars and PyArrow + distinguish between NaN and Null, whereas pandas doesn't. Examples: >>> import pandas as pd @@ -2387,7 +2388,12 @@ def mode(self: Self) -> Self: def is_finite(self: Self) -> Self: """ - Returns a boolean Series indicating which values are finite. + Returns boolean values indicating which original values are finite. + + Warning: + Different backend handle null values differently. `is_finite` will return + False for NaN and Null's in the pandas and Dask backend, while for Polars and + PyArrow null values are kept as such. Returns: Expression of `Boolean` data type. @@ -2397,42 +2403,40 @@ def is_finite(self: Self) -> Self: >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> data = { - ... "a": [1.0, 2], - ... "b": [3.0, float("inf")], - ... } + >>> data = {"a": [float("nan"), float("inf"), 2.0, None]} We define a library agnostic function: >>> @nw.narwhalify ... def func(df): - ... return df.select(nw.all().is_finite()) + ... return df.select(nw.col("a").is_finite()) We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> func(pd.DataFrame(data)) - a b - 0 True True - 1 True False - + a + 0 False + 1 False + 2 True + 3 False >>> func(pl.DataFrame(data)) - shape: (2, 2) - ┌──────┬───────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪═══════╡ - │ true ┆ true │ - │ true ┆ false │ - └──────┴───────┘ + shape: (4, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + │ null │ + └───────┘ >>> func(pa.table(data)) pyarrow.Table a: bool - b: bool ---- - a: [[true,true]] - b: [[true,false]] + a: [[false,false,true,null]] """ return self.__class__(lambda plx: self._call(plx).is_finite()) diff --git a/narwhals/series.py b/narwhals/series.py index 94bcd9982..223d492c7 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2602,6 +2602,11 @@ def is_finite(self: Self) -> Self: """ Returns a boolean Series indicating which values are finite. + Warning: + Different backend handle null values differently. `is_finite` will return + False for NaN and Null's in the pandas and Dask backend, while for Polars and + PyArrow null values are kept as such. + Returns: Expression of `Boolean` data type. @@ -2610,7 +2615,7 @@ def is_finite(self: Self) -> Self: >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> data = [1.0, float("inf")] + >>> data = [float("nan"), float("inf"), 2.0, None] We define a library agnostic function: @@ -2621,24 +2626,30 @@ def is_finite(self: Self) -> Self: We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> func(pd.Series(data)) - 0 True + 0 False 1 False + 2 True + 3 False dtype: bool >>> func(pl.Series(data)) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) + shape: (4,) Series: '' [bool] [ - true false + false + true + null ] >>> func(pa.chunked_array([data])) # doctest: +ELLIPSIS [ [ + false, + false, true, - false + null ] ] """ diff --git a/tests/expr_and_series/is_finite_test.py b/tests/expr_and_series/is_finite_test.py index 6fee3311d..423e89a4b 100644 --- a/tests/expr_and_series/is_finite_test.py +++ b/tests/expr_and_series/is_finite_test.py @@ -7,20 +7,28 @@ from tests.utils import ConstructorEager from tests.utils import assert_equal_data -data = {"a": [float("nan"), float("inf"), 2.0]} -expected = {"a": [False, False, True]} +data = {"a": [float("nan"), float("inf"), 2.0, None]} @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") def test_is_finite_expr(constructor: Constructor) -> None: + if "polars" in str(constructor) or "pyarrow_table" in str(constructor): + expected = {"a": [False, False, True, None]} + else: + expected = {"a": [False, False, True, False]} + df = nw.from_native(constructor(data)) result = df.select(nw.col("a").is_finite()) - assert_equal_data(result, expected) @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") def test_is_finite_series(constructor_eager: ConstructorEager) -> None: + if "polars" in str(constructor_eager) or "pyarrow_table" in str(constructor_eager): + expected = {"a": [False, False, True, None]} + else: + expected = {"a": [False, False, True, False]} + df = nw.from_native(constructor_eager(data), eager_only=True) result = {"a": df["a"].is_finite()} From 28d329afe7e4740372e71d18b936fe607fdaf2b4 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 9 Nov 2024 20:18:08 +0100 Subject: [PATCH 4/6] rm dask from series warning --- narwhals/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/series.py b/narwhals/series.py index 223d492c7..05a531124 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2604,7 +2604,7 @@ def is_finite(self: Self) -> Self: Warning: Different backend handle null values differently. `is_finite` will return - False for NaN and Null's in the pandas and Dask backend, while for Polars and + False for NaN and Null's in the pandas backend, while for Polars and PyArrow null values are kept as such. Returns: From d6afe0147ee5240c798403ab2f21b5fadf315e80 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 12 Nov 2024 22:36:32 +0100 Subject: [PATCH 5/6] pin numpy instead --- tests/expr_and_series/is_finite_test.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/expr_and_series/is_finite_test.py b/tests/expr_and_series/is_finite_test.py index 7a4d0f184..e63a36dcf 100644 --- a/tests/expr_and_series/is_finite_test.py +++ b/tests/expr_and_series/is_finite_test.py @@ -1,10 +1,9 @@ from __future__ import annotations -import sys - import pytest import narwhals.stable.v1 as nw +from tests.utils import NUMPY_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data @@ -14,7 +13,7 @@ @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") def test_is_finite_expr(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if sys.version_info < (3, 9) and "pandas_pyarrow" in str(constructor): + if NUMPY_VERSION < (1, 25) and "pandas_pyarrow" in str(constructor): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) or "pyarrow_table" in str(constructor): @@ -31,7 +30,7 @@ def test_is_finite_expr(request: pytest.FixtureRequest, constructor: Constructor def test_is_finite_series( request: pytest.FixtureRequest, constructor_eager: ConstructorEager ) -> None: - if sys.version_info < (3, 9) and "pandas_pyarrow" in str(constructor_eager): + if NUMPY_VERSION < (1, 25) and "pandas_pyarrow" in str(constructor_eager): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor_eager) or "pyarrow_table" in str(constructor_eager): From 40a894dd349341d14f963e5b1765bf82ba8a134d Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 12 Nov 2024 22:42:50 +0100 Subject: [PATCH 6/6] nevermind its pandas version to pin --- tests/expr_and_series/is_finite_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/expr_and_series/is_finite_test.py b/tests/expr_and_series/is_finite_test.py index e63a36dcf..dd695aea4 100644 --- a/tests/expr_and_series/is_finite_test.py +++ b/tests/expr_and_series/is_finite_test.py @@ -3,7 +3,7 @@ import pytest import narwhals.stable.v1 as nw -from tests.utils import NUMPY_VERSION +from tests.utils import PANDAS_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data @@ -13,7 +13,7 @@ @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") def test_is_finite_expr(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if NUMPY_VERSION < (1, 25) and "pandas_pyarrow" in str(constructor): + if PANDAS_VERSION < (2, 2) and "pandas_pyarrow" in str(constructor): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) or "pyarrow_table" in str(constructor): @@ -30,7 +30,7 @@ def test_is_finite_expr(request: pytest.FixtureRequest, constructor: Constructor def test_is_finite_series( request: pytest.FixtureRequest, constructor_eager: ConstructorEager ) -> None: - if NUMPY_VERSION < (1, 25) and "pandas_pyarrow" in str(constructor_eager): + if PANDAS_VERSION < (2, 2) and "pandas_pyarrow" in str(constructor_eager): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor_eager) or "pyarrow_table" in str(constructor_eager):