From dc2b100b4ddf38c78ad098bc5bd827fd591af6f7 Mon Sep 17 00:00:00 2001
From: Magdalena Kowalczuk <74981211+anopsy@users.noreply.github.com>
Date: Mon, 22 Jul 2024 19:50:22 +0200
Subject: [PATCH 1/4] feat: add arrow value_counts  (#576)

---
 narwhals/_arrow/series.py              | 24 ++++++++++++++++++++++++
 narwhals/_pandas_like/series.py        |  2 +-
 tests/series_only/test_common.py       | 23 -----------------------
 tests/series_only/value_counts_test.py | 26 ++++++++++++++++++++++++++
 utils/check_backend_completeness.py    |  1 -
 5 files changed, 51 insertions(+), 25 deletions(-)
 create mode 100644 tests/series_only/value_counts_test.py

diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
index 8bed41892..b51ca401c 100644
--- a/narwhals/_arrow/series.py
+++ b/narwhals/_arrow/series.py
@@ -344,6 +344,30 @@ def item(self: Self, index: int | None = None) -> Any:
             return self._native_series[0].as_py()
         return self._native_series[index].as_py()
 
+    def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> Any:  # noqa: ARG002
+        """Count unique values; `parallel` is unused, kept for compatibility."""
+        from narwhals._arrow.dataframe import ArrowDataFrame
+
+        pc = get_pyarrow_compute()
+        pa = get_pyarrow()
+
+        name_ = (
+            "index" if self._native_series._name is None else self._native_series._name
+        )
+
+        val_count = pc.value_counts(self._native_series)
+        val_count = pa.Table.from_arrays(
+            [val_count.field("values"), val_count.field("counts")], names=[name_, "count"]
+        )
+
+        if sort:
+            val_count = val_count.sort_by([("count", "descending")])
+
+        return ArrowDataFrame(
+            val_count,
+            backend_version=self._backend_version,
+        )
+
     def zip_with(self: Self, mask: Self, other: Self) -> Self:
         pc = get_pyarrow_compute()
 
diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py
index 650225afd..de85df4ea 100644
--- a/narwhals/_pandas_like/series.py
+++ b/narwhals/_pandas_like/series.py
@@ -567,7 +567,7 @@ def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> A
         ).reset_index()
         val_count.columns = [name_, "count"]
         if sort:
-            val_count = val_count.sort_values(name_)
+            val_count = val_count.sort_values("count", ascending=False)
 
         return PandasLikeDataFrame(
             val_count,
diff --git a/tests/series_only/test_common.py b/tests/series_only/test_common.py
index bb5996e9c..499bd6520 100644
--- a/tests/series_only/test_common.py
+++ b/tests/series_only/test_common.py
@@ -114,29 +114,6 @@ def test_to_numpy() -> None:
     assert nw_series.shape == (3,)
 
 
-def test_value_counts(request: Any, constructor_series: Any) -> None:
-    if "pyarrow_series" in str(constructor_series):
-        request.applymarker(pytest.mark.xfail)
-
-    if "pandas_series_nullable" in str(constructor_series):  # fails for py3.8
-        pytest.skip()
-
-    series = nw.from_native(constructor_series(data_dups).rename("b"), series_only=True)
-
-    sorted_result = series.value_counts(sort=True)
-    assert sorted_result.columns == ["b", "count"]
-
-    expected = np.array([[4, 2], [6, 1]])
-    assert (sorted_result.to_numpy() == expected).all()
-
-    unsorted_result = series.value_counts(sort=False)
-    assert unsorted_result.columns == ["b", "count"]
-
-    a = unsorted_result.to_numpy()
-
-    assert (a[a[:, 0].argsort()] == expected).all()
-
-
 @pytest.mark.parametrize(
     ("interpolation", "expected"),
     [
diff --git a/tests/series_only/value_counts_test.py b/tests/series_only/value_counts_test.py
new file mode 100644
index 000000000..abda59798
--- /dev/null
+++ b/tests/series_only/value_counts_test.py
@@ -0,0 +1,26 @@
+import sys
+from typing import Any
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tests.utils import compare_dicts
+
+data = {"a": [4, 4, 6, 4, 1, 1]}
+
+
+def test_value_counts(request: Any, constructor: Any) -> None:
+    if "pandas_nullable_constructor" in str(constructor) and sys.version_info < (
+        3,
+        9,
+    ):  # fails for py3.8
+        request.applymarker(pytest.mark.xfail)
+
+    series = nw.from_native(constructor(data), eager_only=True)["a"]
+
+    sorted_result = series.value_counts(sort=True)
+    expected = {"a": [4, 1, 6], "count": [3, 2, 1]}
+    compare_dicts(sorted_result, expected)
+
+    unsorted_result = series.value_counts(sort=False).sort("count", descending=True)
+    compare_dicts(unsorted_result, expected)
diff --git a/utils/check_backend_completeness.py b/utils/check_backend_completeness.py
index d51ecf642..ad486f4b2 100644
--- a/utils/check_backend_completeness.py
+++ b/utils/check_backend_completeness.py
@@ -21,7 +21,6 @@
     "Series.shift",
     "Series.sort",
     "Series.unique",
-    "Series.value_counts",
 ]
 
 

From d3c39dd3850a703c046ae996bc44e75e8dd54da8 Mon Sep 17 00:00:00 2001
From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com>
Date: Mon, 22 Jul 2024 19:59:12 +0200
Subject: [PATCH 2/4] feat: `Series.to_dummies`, arrow `sort` and `unique`
 (#577)

---
 docs/api-reference/series.md         |  1 +
 narwhals/_arrow/expr.py              |  6 +++
 narwhals/_arrow/series.py            | 33 +++++++++++++
 narwhals/_pandas_like/series.py      | 20 ++++++++
 narwhals/_polars/series.py           | 16 ++++++
 narwhals/series.py                   | 73 ++++++++++++++++++++++++++++
 tests/expr_and_series/sort_test.py   | 31 ++++++++++++
 tests/expr_and_series/unary_test.py  |  6 +--
 tests/expr_and_series/unique_test.py | 22 +++++++++
 tests/series_only/to_dummy_test.py   | 26 ++++++++++
 utils/check_api_reference.py         |  1 +
 utils/check_backend_completeness.py  |  2 -
 12 files changed, 230 insertions(+), 7 deletions(-)
 create mode 100644 tests/expr_and_series/sort_test.py
 create mode 100644 tests/expr_and_series/unique_test.py
 create mode 100644 tests/series_only/to_dummy_test.py

diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md
index 0d4094082..bf6078555 100644
--- a/docs/api-reference/series.md
+++ b/docs/api-reference/series.md
@@ -44,6 +44,7 @@
         - std
         - sum
         - tail
+        - to_dummies
         - to_frame
         - to_list
         - to_numpy
diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
index d0082a31f..a04c70bb7 100644
--- a/narwhals/_arrow/expr.py
+++ b/narwhals/_arrow/expr.py
@@ -248,6 +248,12 @@ def is_first_distinct(self: Self) -> Self:
     def is_last_distinct(self: Self) -> Self:
         return reuse_series_implementation(self, "is_last_distinct")
 
+    def unique(self: Self) -> Self:
+        return reuse_series_implementation(self, "unique")
+
+    def sort(self: Self, *, descending: bool = False) -> Self:
+        return reuse_series_implementation(self, "sort", descending=descending)
+
     @property
     def dt(self: Self) -> ArrowExprDateTimeNamespace:
         return ArrowExprDateTimeNamespace(self)
diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
index b51ca401c..53524278f 100644
--- a/narwhals/_arrow/series.py
+++ b/narwhals/_arrow/series.py
@@ -468,6 +468,39 @@ def is_sorted(self: Self, *, descending: bool = False) -> bool:
         else:
             return pc.all(pc.less_equal(ser[:-1], ser[1:])).as_py()  # type: ignore[no-any-return]
 
+    def unique(self: Self) -> ArrowSeries:
+        pc = get_pyarrow_compute()
+        return self._from_native_series(pc.unique(self._native_series))
+
+    def sort(self: Self, *, descending: bool = False) -> ArrowSeries:
+        pc = get_pyarrow_compute()
+        series = self._native_series
+        order = "descending" if descending else "ascending"
+        sorted_indices = pc.array_sort_indices(
+            series, order=order, null_placement="at_start"
+        )
+
+        return self._from_native_series(pc.take(series, sorted_indices))
+
+    def to_dummies(
+        self: Self, *, separator: str = "_", drop_first: bool = False
+    ) -> ArrowDataFrame:
+        from narwhals._arrow.dataframe import ArrowDataFrame
+
+        pa = get_pyarrow()
+        pc = get_pyarrow_compute()
+        series = self._native_series
+        unique_values = self.unique().sort()._native_series
+        columns = [pc.cast(pc.equal(series, v), pa.uint8()) for v in unique_values][
+            int(drop_first) :
+        ]
+        names = [f"{self._name}{separator}{v}" for v in unique_values][int(drop_first) :]
+
+        return ArrowDataFrame(
+            pa.Table.from_arrays(columns, names=names),
+            backend_version=self._backend_version,
+        )
+
     @property
     def shape(self) -> tuple[int]:
         return (len(self._native_series),)
diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py
index de85df4ea..36d69f0dd 100644
--- a/narwhals/_pandas_like/series.py
+++ b/narwhals/_pandas_like/series.py
@@ -23,6 +23,7 @@
 if TYPE_CHECKING:
     from typing_extensions import Self
 
+    from narwhals._pandas_like.dataframe import PandasLikeDataFrame
     from narwhals._pandas_like.namespace import PandasLikeNamespace
     from narwhals.dtypes import DType
 
@@ -596,6 +597,25 @@ def tail(self: Self, n: int) -> Self:
     def round(self: Self, decimals: int) -> Self:
         return self._from_native_series(self._native_series.round(decimals=decimals))
 
+    def to_dummies(
+        self: Self, *, separator: str = "_", drop_first: bool = False
+    ) -> PandasLikeDataFrame:
+        from narwhals._pandas_like.dataframe import PandasLikeDataFrame
+
+        plx = self.__native_namespace__()
+        series = self._native_series
+        name = str(self._name) if self._name else ""
+        return PandasLikeDataFrame(
+            plx.get_dummies(
+                series,
+                prefix=name,
+                prefix_sep=separator,
+                drop_first=drop_first,
+            ).astype(int),
+            implementation=self._implementation,
+            backend_version=self._backend_version,
+        )
+
     @property
     def str(self) -> PandasLikeSeriesStringNamespace:
         return PandasLikeSeriesStringNamespace(self)
diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py
index 07f7a82d7..11bc121e5 100644
--- a/narwhals/_polars/series.py
+++ b/narwhals/_polars/series.py
@@ -13,6 +13,7 @@
     import numpy as np
     from typing_extensions import Self
 
+    from narwhals._polars.dataframe import PolarsDataFrame
     from narwhals.dtypes import DType
 
 from narwhals._polars.namespace import PolarsNamespace
@@ -168,6 +169,21 @@ def __rpow__(self, other: PolarsSeries | Any) -> Self:
     def __invert__(self) -> Self:
         return self._from_native_series(self._native_series.__invert__())
 
+    def to_dummies(
+        self: Self, *, separator: str = "_", drop_first: bool = False
+    ) -> PolarsDataFrame:
+        from narwhals._polars.dataframe import PolarsDataFrame
+
+        if self._backend_version < (0, 20, 15):  # pragma: no cover
+            result = self._native_series.to_dummies(separator=separator)
+            result = result.select(result.columns[int(drop_first) :])
+        else:
+            result = self._native_series.to_dummies(
+                separator=separator, drop_first=drop_first
+            )
+
+        return PolarsDataFrame(result, backend_version=self._backend_version)
+
     @property
     def dt(self) -> PolarsSeriesDateTimeNamespace:
         return PolarsSeriesDateTimeNamespace(self)
diff --git a/narwhals/series.py b/narwhals/series.py
index 833a0b05c..9da9a6c5c 100644
--- a/narwhals/series.py
+++ b/narwhals/series.py
@@ -1963,6 +1963,79 @@ def round(self: Self, decimals: int = 0) -> Self:
         """
         return self._from_compliant_series(self._compliant_series.round(decimals))
 
+    def to_dummies(
+        self: Self, *, separator: str = "_", drop_first: bool = False
+    ) -> DataFrame[Any]:
+        r"""
+        Get dummy/indicator variables.
+
+        Arguments:
+            separator: Separator/delimiter used when generating column names.
+            drop_first: Remove the first category from the variable being encoded.
+
+        Notes:
+            pandas and Polars handle null values differently. Polars distinguishes
+            between NaN and Null, whereas pandas doesn't.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> data = [1, 2, 3]
+            >>> s_pd = pd.Series(data, name="a")
+            >>> s_pl = pl.Series("a", data)
+
+            Let's define a dataframe-agnostic function that creates dummy variables:
+
+            >>> @nw.narwhalify
+            ... def func(s_any, drop_first: bool = False):
+            ...     return s_any.to_dummies(drop_first=drop_first)
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(s_pd)
+               a_1  a_2  a_3
+            0    1    0    0
+            1    0    1    0
+            2    0    0    1
+
+            >>> func(s_pd, drop_first=True)
+               a_2  a_3
+            0    0    0
+            1    1    0
+            2    0    1
+
+            >>> func(s_pl)
+            shape: (3, 3)
+            ┌─────┬─────┬─────┐
+            │ a_1 ┆ a_2 ┆ a_3 │
+            │ --- ┆ --- ┆ --- │
+            │ u8  ┆ u8  ┆ u8  │
+            ╞═════╪═════╪═════╡
+            │ 1   ┆ 0   ┆ 0   │
+            │ 0   ┆ 1   ┆ 0   │
+            │ 0   ┆ 0   ┆ 1   │
+            └─────┴─────┴─────┘
+            >>> func(s_pl, drop_first=True)
+            shape: (3, 2)
+            ┌─────┬─────┐
+            │ a_2 ┆ a_3 │
+            │ --- ┆ --- │
+            │ u8  ┆ u8  │
+            ╞═════╪═════╡
+            │ 0   ┆ 0   │
+            │ 1   ┆ 0   │
+            │ 0   ┆ 1   │
+            └─────┴─────┘
+        """
+
+        from narwhals.dataframe import DataFrame
+
+        return DataFrame(
+            self._compliant_series.to_dummies(separator=separator, drop_first=drop_first),
+            level=self._level,
+        )
+
     @property
     def str(self) -> SeriesStringNamespace:
         return SeriesStringNamespace(self)
diff --git a/tests/expr_and_series/sort_test.py b/tests/expr_and_series/sort_test.py
new file mode 100644
index 000000000..22c0be6be
--- /dev/null
+++ b/tests/expr_and_series/sort_test.py
@@ -0,0 +1,31 @@
+from typing import Any
+
+import numpy as np
+import pytest
+
+import narwhals.stable.v1 as nw
+from tests.utils import compare_dicts
+
+data = {"a": [1, 3, 2], "b": [0, 2, -1]}
+
+
+@pytest.mark.parametrize(
+    ("descending", "expected"),
+    [
+        (True, {"a": [3, 2, 1], "b": [0, 2, -1]}),
+        (False, {"a": [1, 2, 3], "b": [0, 2, -1]}),
+    ],
+)
+def test_sort_expr(constructor: Any, descending: Any, expected: Any) -> None:
+    df = nw.from_native(constructor(data), eager_only=True)
+    result = df.select(nw.col("a").sort(descending=descending), "b")
+    compare_dicts(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("descending", "expected"), [(True, [3, 2, 1]), (False, [1, 2, 3])]
+)
+def test_sort_series(constructor_series: Any, descending: Any, expected: Any) -> None:
+    series = nw.from_native(constructor_series(data["a"]), series_only=True)
+    result = series.sort(descending=descending)
+    assert (result.to_numpy() == np.array(expected)).all()
diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py
index 474ef8799..0d9b68740 100644
--- a/tests/expr_and_series/unary_test.py
+++ b/tests/expr_and_series/unary_test.py
@@ -1,14 +1,10 @@
 from typing import Any
 
-import pytest
-
 import narwhals as nw
 from tests.utils import compare_dicts
 
 
-def test_unary(request: Any, constructor_with_lazy: Any) -> None:
-    if "pyarrow_table" in str(constructor_with_lazy):
-        request.applymarker(pytest.mark.xfail)
+def test_unary(constructor_with_lazy: Any) -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
     result = (
         nw.from_native(constructor_with_lazy(data))
diff --git a/tests/expr_and_series/unique_test.py b/tests/expr_and_series/unique_test.py
new file mode 100644
index 000000000..f9d2198f6
--- /dev/null
+++ b/tests/expr_and_series/unique_test.py
@@ -0,0 +1,22 @@
+from typing import Any
+
+import numpy as np
+
+import narwhals.stable.v1 as nw
+from tests.utils import compare_dicts
+
+data = {"a": [1, 1, 2]}
+
+
+def test_unique_expr(constructor: Any) -> None:
+    df = nw.from_native(constructor(data), eager_only=True)
+    result = df.select(nw.col("a").unique())
+    expected = {"a": [1, 2]}
+    compare_dicts(result, expected)
+
+
+def test_unique_series(constructor_series: Any) -> None:
+    series = nw.from_native(constructor_series(data["a"]), series_only=True)
+    result = series.unique()
+    expected = np.array([1, 2])
+    assert (result.to_numpy() == expected).all()
diff --git a/tests/series_only/to_dummy_test.py b/tests/series_only/to_dummy_test.py
new file mode 100644
index 000000000..5bd80cc30
--- /dev/null
+++ b/tests/series_only/to_dummy_test.py
@@ -0,0 +1,26 @@
+from typing import Any
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tests.utils import compare_dicts
+
+data = [1, 2, 3]
+
+
+@pytest.mark.parametrize("sep", ["_", "-"])
+def test_to_dummies(constructor_series: Any, sep: str) -> None:
+    s = nw.from_native(constructor_series(data), series_only=True).alias("a")
+    result = s.to_dummies(separator=sep)
+    expected = {f"a{sep}1": [1, 0, 0], f"a{sep}2": [0, 1, 0], f"a{sep}3": [0, 0, 1]}
+
+    compare_dicts(result, expected)
+
+
+@pytest.mark.parametrize("sep", ["_", "-"])
+def test_to_dummies_drop_first(constructor_series: Any, sep: str) -> None:
+    s = nw.from_native(constructor_series(data), series_only=True).alias("a")
+    result = s.to_dummies(drop_first=True, separator=sep)
+    expected = {f"a{sep}2": [0, 1, 0], f"a{sep}3": [0, 0, 1]}
+
+    compare_dicts(result, expected)
diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py
index 0f9eff266..80ee5d7aa 100644
--- a/utils/check_api_reference.py
+++ b/utils/check_api_reference.py
@@ -148,6 +148,7 @@
     .difference(expr)
     .difference(
         {
+            "to_dummies",
             "to_pandas",
             "to_list",
             "to_numpy",
diff --git a/utils/check_backend_completeness.py b/utils/check_backend_completeness.py
index ad486f4b2..1fe997494 100644
--- a/utils/check_backend_completeness.py
+++ b/utils/check_backend_completeness.py
@@ -19,8 +19,6 @@
     "Series.quantile",
     "Series.round",
     "Series.shift",
-    "Series.sort",
-    "Series.unique",
 ]
 
 

From 53ebffc36969dc03f0af5c34dd27b513f327ae63 Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli <marcogorelli@protonmail.com>
Date: Mon, 22 Jul 2024 19:41:42 +0100
Subject: [PATCH 3/4] Bump version to 1.1.4 (#579)

---
 docs/installation.md | 2 +-
 narwhals/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/installation.md b/docs/installation.md
index d412f5b07..6131e00f5 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following:
 ```python
 >>> import narwhals
 >>> narwhals.__version__
-'1.1.3'
+'1.1.4'
 ```
 then installation worked correctly!
diff --git a/narwhals/__init__.py b/narwhals/__init__.py
index 47be57220..b9175d192 100644
--- a/narwhals/__init__.py
+++ b/narwhals/__init__.py
@@ -46,7 +46,7 @@
 from narwhals.utils import maybe_convert_dtypes
 from narwhals.utils import maybe_set_index
 
-__version__ = "1.1.3"
+__version__ = "1.1.4"
 
 __all__ = [
     "selectors",
diff --git a/pyproject.toml b/pyproject.toml
index d00b10e89..ebbea35b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "narwhals"
-version = "1.1.3"
+version = "1.1.4"
 authors = [
   { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" },
 ]

From 372f3c2d12a320cdefb34948811adf90a7278176 Mon Sep 17 00:00:00 2001
From: Aidos Kanapyanov <65722512+aidoskanapyanov@users.noreply.github.com>
Date: Tue, 23 Jul 2024 17:27:07 +0500
Subject: [PATCH 4/4] dep: add polars specifier to ibis-framework dependency
 (#591)

---
 requirements-dev.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index a9d6f04d8..e54a3edeb 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,5 +1,5 @@
 covdefaults
-ibis-framework
+ibis-framework[polars]
 pandas
 polars[timezones]
 pre-commit