From dc2b100b4ddf38c78ad098bc5bd827fd591af6f7 Mon Sep 17 00:00:00 2001 From: Magdalena Kowalczuk <74981211+anopsy@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:50:22 +0200 Subject: [PATCH 1/4] feat: add arrow value_counts (#576) --- narwhals/_arrow/series.py | 24 ++++++++++++++++++++++++ narwhals/_pandas_like/series.py | 2 +- tests/series_only/test_common.py | 23 ----------------------- tests/series_only/value_counts_test.py | 26 ++++++++++++++++++++++++++ utils/check_backend_completeness.py | 1 - 5 files changed, 51 insertions(+), 25 deletions(-) create mode 100644 tests/series_only/value_counts_test.py diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 8bed41892..b51ca401c 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -344,6 +344,30 @@ def item(self: Self, index: int | None = None) -> Any: return self._native_series[0].as_py() return self._native_series[index].as_py() + def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> Any: # noqa: ARG002 + """Parallel is unused, exists for compatibility""" + from narwhals._arrow.dataframe import ArrowDataFrame + + pc = get_pyarrow_compute() + pa = get_pyarrow() + + name_ = ( + "index" if self._native_series._name is None else self._native_series._name + ) + + val_count = pc.value_counts(self._native_series) + val_count = pa.Table.from_arrays( + [val_count.field("values"), val_count.field("counts")], names=[name_, "count"] + ) + + if sort: + val_count = val_count.sort_by([("count", "descending")]) + + return ArrowDataFrame( + val_count, + backend_version=self._backend_version, + ) + def zip_with(self: Self, mask: Self, other: Self) -> Self: pc = get_pyarrow_compute() diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 650225afd..de85df4ea 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -567,7 +567,7 @@ def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) 
-> A ).reset_index() val_count.columns = [name_, "count"] if sort: - val_count = val_count.sort_values(name_) + val_count = val_count.sort_values("count", ascending=False) return PandasLikeDataFrame( val_count, diff --git a/tests/series_only/test_common.py b/tests/series_only/test_common.py index bb5996e9c..499bd6520 100644 --- a/tests/series_only/test_common.py +++ b/tests/series_only/test_common.py @@ -114,29 +114,6 @@ def test_to_numpy() -> None: assert nw_series.shape == (3,) -def test_value_counts(request: Any, constructor_series: Any) -> None: - if "pyarrow_series" in str(constructor_series): - request.applymarker(pytest.mark.xfail) - - if "pandas_series_nullable" in str(constructor_series): # fails for py3.8 - pytest.skip() - - series = nw.from_native(constructor_series(data_dups).rename("b"), series_only=True) - - sorted_result = series.value_counts(sort=True) - assert sorted_result.columns == ["b", "count"] - - expected = np.array([[4, 2], [6, 1]]) - assert (sorted_result.to_numpy() == expected).all() - - unsorted_result = series.value_counts(sort=False) - assert unsorted_result.columns == ["b", "count"] - - a = unsorted_result.to_numpy() - - assert (a[a[:, 0].argsort()] == expected).all() - - @pytest.mark.parametrize( ("interpolation", "expected"), [ diff --git a/tests/series_only/value_counts_test.py b/tests/series_only/value_counts_test.py new file mode 100644 index 000000000..abda59798 --- /dev/null +++ b/tests/series_only/value_counts_test.py @@ -0,0 +1,26 @@ +import sys +from typing import Any + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import compare_dicts + +data = {"a": [4, 4, 6, 4, 1, 1]} + + +def test_value_counts(request: Any, constructor: Any) -> None: + if "pandas_nullable_constructor" in str(constructor) and sys.version_info < ( + 3, + 9, + ): # fails for py3.8 + request.applymarker(pytest.mark.xfail) + + series = nw.from_native(constructor(data), eager_only=True)["a"] + + sorted_result = 
series.value_counts(sort=True) + expected = {"a": [4, 1, 6], "count": [3, 2, 1]} + compare_dicts(sorted_result, expected) + + unsorted_result = series.value_counts(sort=False).sort("count", descending=True) + compare_dicts(unsorted_result, expected) diff --git a/utils/check_backend_completeness.py b/utils/check_backend_completeness.py index d51ecf642..ad486f4b2 100644 --- a/utils/check_backend_completeness.py +++ b/utils/check_backend_completeness.py @@ -21,7 +21,6 @@ "Series.shift", "Series.sort", "Series.unique", - "Series.value_counts", ] From d3c39dd3850a703c046ae996bc44e75e8dd54da8 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:59:12 +0200 Subject: [PATCH 2/4] feat: `Series.to_dummies`, arrow `sort` and `unique` (#577) --- docs/api-reference/series.md | 1 + narwhals/_arrow/expr.py | 6 +++ narwhals/_arrow/series.py | 33 +++++++++++++ narwhals/_pandas_like/series.py | 20 ++++++++ narwhals/_polars/series.py | 16 ++++++ narwhals/series.py | 73 ++++++++++++++++++++++++++++ tests/expr_and_series/sort_test.py | 31 ++++++++++++ tests/expr_and_series/unary_test.py | 6 +-- tests/expr_and_series/unique_test.py | 22 +++++++++ tests/series_only/to_dummy_test.py | 26 ++++++++++ utils/check_api_reference.py | 1 + utils/check_backend_completeness.py | 2 - 12 files changed, 230 insertions(+), 7 deletions(-) create mode 100644 tests/expr_and_series/sort_test.py create mode 100644 tests/expr_and_series/unique_test.py create mode 100644 tests/series_only/to_dummy_test.py diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 0d4094082..bf6078555 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -44,6 +44,7 @@ - std - sum - tail + - to_dummies - to_frame - to_list - to_numpy diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index d0082a31f..a04c70bb7 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -248,6 +248,12 @@ def 
is_first_distinct(self: Self) -> Self: def is_last_distinct(self: Self) -> Self: return reuse_series_implementation(self, "is_last_distinct") + def unique(self: Self) -> Self: + return reuse_series_implementation(self, "unique") + + def sort(self: Self, *, descending: bool = False) -> Self: + return reuse_series_implementation(self, "sort", descending=descending) + @property def dt(self: Self) -> ArrowExprDateTimeNamespace: return ArrowExprDateTimeNamespace(self) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index b51ca401c..53524278f 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -468,6 +468,39 @@ def is_sorted(self: Self, *, descending: bool = False) -> bool: else: return pc.all(pc.less_equal(ser[:-1], ser[1:])).as_py() # type: ignore[no-any-return] + def unique(self: Self) -> ArrowSeries: + pc = get_pyarrow_compute() + return self._from_native_series(pc.unique(self._native_series)) + + def sort(self: Self, *, descending: bool = False) -> ArrowSeries: + pc = get_pyarrow_compute() + series = self._native_series + order = "descending" if descending else "ascending" + sorted_indices = pc.array_sort_indices( + series, order=order, null_placement="at_start" + ) + + return self._from_native_series(pc.take(series, sorted_indices)) + + def to_dummies( + self: Self, *, separator: str = "_", drop_first: bool = False + ) -> ArrowDataFrame: + from narwhals._arrow.dataframe import ArrowDataFrame + + pa = get_pyarrow() + pc = get_pyarrow_compute() + series = self._native_series + unique_values = self.unique().sort()._native_series + columns = [pc.cast(pc.equal(series, v), pa.uint8()) for v in unique_values][ + int(drop_first) : + ] + names = [f"{self._name}{separator}{v}" for v in unique_values][int(drop_first) :] + + return ArrowDataFrame( + pa.Table.from_arrays(columns, names=names), + backend_version=self._backend_version, + ) + @property def shape(self) -> tuple[int]: return (len(self._native_series),) diff --git 
a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index de85df4ea..36d69f0dd 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -23,6 +23,7 @@ if TYPE_CHECKING: from typing_extensions import Self + from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.namespace import PandasLikeNamespace from narwhals.dtypes import DType @@ -596,6 +597,25 @@ def tail(self: Self, n: int) -> Self: def round(self: Self, decimals: int) -> Self: return self._from_native_series(self._native_series.round(decimals=decimals)) + def to_dummies( + self: Self, *, separator: str = "_", drop_first: bool = False + ) -> PandasLikeDataFrame: + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + + plx = self.__native_namespace__() + series = self._native_series + name = str(self._name) if self._name else "" + return PandasLikeDataFrame( + plx.get_dummies( + series, + prefix=name, + prefix_sep=separator, + drop_first=drop_first, + ).astype(int), + implementation=self._implementation, + backend_version=self._backend_version, + ) + @property def str(self) -> PandasLikeSeriesStringNamespace: return PandasLikeSeriesStringNamespace(self) diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 07f7a82d7..11bc121e5 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -13,6 +13,7 @@ import numpy as np from typing_extensions import Self + from narwhals._polars.dataframe import PolarsDataFrame from narwhals.dtypes import DType from narwhals._polars.namespace import PolarsNamespace @@ -168,6 +169,21 @@ def __rpow__(self, other: PolarsSeries | Any) -> Self: def __invert__(self) -> Self: return self._from_native_series(self._native_series.__invert__()) + def to_dummies( + self: Self, *, separator: str = "_", drop_first: bool = False + ) -> PolarsDataFrame: + from narwhals._polars.dataframe import PolarsDataFrame + + if self._backend_version < (0, 20, 15): # pragma: no 
cover + result = self._native_series.to_dummies(separator=separator) + result = result.select(result.columns[int(drop_first) :]) + else: + result = self._native_series.to_dummies( + separator=separator, drop_first=drop_first + ) + + return PolarsDataFrame(result, backend_version=self._backend_version) + @property def dt(self) -> PolarsSeriesDateTimeNamespace: return PolarsSeriesDateTimeNamespace(self) diff --git a/narwhals/series.py b/narwhals/series.py index 833a0b05c..9da9a6c5c 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1963,6 +1963,79 @@ def round(self: Self, decimals: int = 0) -> Self: """ return self._from_compliant_series(self._compliant_series.round(decimals)) + def to_dummies( + self: Self, *, separator: str = "_", drop_first: bool = False + ) -> DataFrame[Any]: + r""" + Get dummy/indicator variables. + + Arguments: + separator: Separator/delimiter used when generating column names. + drop_first: Remove the first category from the variable being encoded. + + Notes: + pandas and Polars handle null values differently. Polars distinguishes + between NaN and Null, whereas pandas doesn't. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data, name="a") + >>> s_pl = pl.Series("a", data) + + Let's define a dataframe-agnostic function that converts a series to dummy/indicator variables: + + >>> @nw.narwhalify + ... def func(s_any, drop_first: bool = False): + ...
return s_any.to_dummies(drop_first=drop_first) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + a_1 a_2 a_3 + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + + >>> func(s_pd, drop_first=True) + a_2 a_3 + 0 0 0 + 1 1 0 + 2 0 1 + + >>> func(s_pl) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + >>> func(s_pl, drop_first=True) + shape: (3, 2) + ┌─────┬─────┐ + │ a_2 ┆ a_3 │ + │ --- ┆ --- │ + │ u8 ┆ u8 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 0 │ + │ 0 ┆ 1 │ + └─────┴─────┘ + """ + + from narwhals.dataframe import DataFrame + + return DataFrame( + self._compliant_series.to_dummies(separator=separator, drop_first=drop_first), + level=self._level, + ) + @property def str(self) -> SeriesStringNamespace: return SeriesStringNamespace(self) diff --git a/tests/expr_and_series/sort_test.py b/tests/expr_and_series/sort_test.py new file mode 100644 index 000000000..22c0be6be --- /dev/null +++ b/tests/expr_and_series/sort_test.py @@ -0,0 +1,31 @@ +from typing import Any + +import numpy as np +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import compare_dicts + +data = {"a": [1, 3, 2], "b": [0, 2, -1]} + + +@pytest.mark.parametrize( + ("descending", "expected"), + [ + (True, {"a": [3, 2, 1], "b": [0, 2, -1]}), + (False, {"a": [1, 2, 3], "b": [0, 2, -1]}), + ], +) +def test_sort_expr(constructor: Any, descending: Any, expected: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.col("a").sort(descending=descending), "b") + compare_dicts(result, expected) + + +@pytest.mark.parametrize( + ("descending", "expected"), [(True, [3, 2, 1]), (False, [1, 2, 3])] +) +def test_sort_series(constructor_series: Any, descending: Any, expected: Any) -> None: + series = nw.from_native(constructor_series(data["a"]), series_only=True) + result = series.sort(descending=descending) + 
assert (result.to_numpy() == np.array(expected)).all() diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index 474ef8799..0d9b68740 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -1,14 +1,10 @@ from typing import Any -import pytest - import narwhals as nw from tests.utils import compare_dicts -def test_unary(request: Any, constructor_with_lazy: Any) -> None: - if "pyarrow_table" in str(constructor_with_lazy): - request.applymarker(pytest.mark.xfail) +def test_unary(constructor_with_lazy: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} result = ( nw.from_native(constructor_with_lazy(data)) diff --git a/tests/expr_and_series/unique_test.py b/tests/expr_and_series/unique_test.py new file mode 100644 index 000000000..f9d2198f6 --- /dev/null +++ b/tests/expr_and_series/unique_test.py @@ -0,0 +1,22 @@ +from typing import Any + +import numpy as np + +import narwhals.stable.v1 as nw +from tests.utils import compare_dicts + +data = {"a": [1, 1, 2]} + + +def test_unique_expr(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.col("a").unique()) + expected = {"a": [1, 2]} + compare_dicts(result, expected) + + +def test_unique_series(constructor_series: Any) -> None: + series = nw.from_native(constructor_series(data["a"]), series_only=True) + result = series.unique() + expected = np.array([1, 2]) + assert (result.to_numpy() == expected).all() diff --git a/tests/series_only/to_dummy_test.py b/tests/series_only/to_dummy_test.py new file mode 100644 index 000000000..5bd80cc30 --- /dev/null +++ b/tests/series_only/to_dummy_test.py @@ -0,0 +1,26 @@ +from typing import Any + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import compare_dicts + +data = [1, 2, 3] + + +@pytest.mark.parametrize("sep", ["_", "-"]) +def test_to_dummies(constructor_series: Any, sep: str) -> None: + s = 
nw.from_native(constructor_series(data), series_only=True).alias("a") + result = s.to_dummies(separator=sep) + expected = {f"a{sep}1": [1, 0, 0], f"a{sep}2": [0, 1, 0], f"a{sep}3": [0, 0, 1]} + + compare_dicts(result, expected) + + +@pytest.mark.parametrize("sep", ["_", "-"]) +def test_to_dummies_drop_first(constructor_series: Any, sep: str) -> None: + s = nw.from_native(constructor_series(data), series_only=True).alias("a") + result = s.to_dummies(drop_first=True, separator=sep) + expected = {f"a{sep}2": [0, 1, 0], f"a{sep}3": [0, 0, 1]} + + compare_dicts(result, expected) diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 0f9eff266..80ee5d7aa 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -148,6 +148,7 @@ .difference(expr) .difference( { + "to_dummies", "to_pandas", "to_list", "to_numpy", diff --git a/utils/check_backend_completeness.py b/utils/check_backend_completeness.py index ad486f4b2..1fe997494 100644 --- a/utils/check_backend_completeness.py +++ b/utils/check_backend_completeness.py @@ -19,8 +19,6 @@ "Series.quantile", "Series.round", "Series.shift", - "Series.sort", - "Series.unique", ] From 53ebffc36969dc03f0af5c34dd27b513f327ae63 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 22 Jul 2024 19:41:42 +0100 Subject: [PATCH 3/4] Bump version to 1.1.4 (#579) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index d412f5b07..6131e00f5 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.1.3' +'1.1.4' ``` then installation worked correctly! 
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 47be57220..b9175d192 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -46,7 +46,7 @@ from narwhals.utils import maybe_convert_dtypes from narwhals.utils import maybe_set_index -__version__ = "1.1.3" +__version__ = "1.1.4" __all__ = [ "selectors", diff --git a/pyproject.toml b/pyproject.toml index d00b10e89..ebbea35b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.1.3" +version = "1.1.4" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From 372f3c2d12a320cdefb34948811adf90a7278176 Mon Sep 17 00:00:00 2001 From: Aidos Kanapyanov <65722512+aidoskanapyanov@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:27:07 +0500 Subject: [PATCH 4/4] dep: add polars specifier to ibis-framework dependency (#591) --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index a9d6f04d8..e54a3edeb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ covdefaults -ibis-framework +ibis-framework[polars] pandas polars[timezones] pre-commit