Merge remote-tracking branch 'upstream/main' into add-where-expression

aivanoved · Jul 23, 2024 · 504c4ea
2 parents add7b89 + 372f3c2
commit 504c4ea
Show file tree

Hide file tree

Showing 18 changed files with 285 additions and 36 deletions.
diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md
@@ -44,6 +44,7 @@
         - std
         - sum
         - tail
+        - to_dummies
         - to_frame
         - to_list
         - to_numpy

diff --git a/docs/installation.md b/docs/installation.md
@@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following:
 ```python
 >>> import narwhals
 >>> narwhals.__version__
-'1.1.3'
+'1.1.4'
 ```
 then installation worked correctly!
diff --git a/narwhals/__init__.py b/narwhals/__init__.py
@@ -46,7 +46,7 @@
 from narwhals.utils import maybe_convert_dtypes
 from narwhals.utils import maybe_set_index
 
-__version__ = "1.1.3"
+__version__ = "1.1.4"
 
 __all__ = [
     "selectors",

diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
@@ -248,6 +248,12 @@ def is_first_distinct(self: Self) -> Self:
     def is_last_distinct(self: Self) -> Self:
         return reuse_series_implementation(self, "is_last_distinct")
 
+    def unique(self: Self) -> Self:
+        return reuse_series_implementation(self, "unique")
+
+    def sort(self: Self, *, descending: bool = False) -> Self:
+        return reuse_series_implementation(self, "sort", descending=descending)
+
     @property
     def dt(self: Self) -> ArrowExprDateTimeNamespace:
         return ArrowExprDateTimeNamespace(self)

diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
@@ -344,6 +344,30 @@ def item(self: Self, index: int | None = None) -> Any:
             return self._native_series[0].as_py()
         return self._native_series[index].as_py()
 
+    def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> Any:  # noqa: ARG002
+        """Count unique values. `parallel` is unused; it exists only for API compatibility."""
+        from narwhals._arrow.dataframe import ArrowDataFrame
+
+        pc = get_pyarrow_compute()
+        pa = get_pyarrow()
+
+        name_ = (
+            "index" if self._native_series._name is None else self._native_series._name
+        )
+
+        val_count = pc.value_counts(self._native_series)
+        val_count = pa.Table.from_arrays(
+            [val_count.field("values"), val_count.field("counts")], names=[name_, "count"]
+        )
+
+        if sort:
+            val_count = val_count.sort_by([("count", "descending")])
+
+        return ArrowDataFrame(
+            val_count,
+            backend_version=self._backend_version,
+        )
+
     def zip_with(self: Self, mask: Self, other: Self) -> Self:
         pc = get_pyarrow_compute()
 
@@ -444,6 +468,39 @@ def is_sorted(self: Self, *, descending: bool = False) -> bool:
         else:
             return pc.all(pc.less_equal(ser[:-1], ser[1:])).as_py()  # type: ignore[no-any-return]
 
+    def unique(self: Self) -> ArrowSeries:
+        pc = get_pyarrow_compute()
+        return self._from_native_series(pc.unique(self._native_series))
+
+    def sort(self: Self, *, descending: bool = False) -> ArrowSeries:
+        pc = get_pyarrow_compute()
+        series = self._native_series
+        order = "descending" if descending else "ascending"
+        sorted_indices = pc.array_sort_indices(
+            series, order=order, null_placement="at_start"
+        )
+
+        return self._from_native_series(pc.take(series, sorted_indices))
+
+    def to_dummies(
+        self: Self, *, separator: str = "_", drop_first: bool = False
+    ) -> ArrowDataFrame:
+        from narwhals._arrow.dataframe import ArrowDataFrame
+
+        pa = get_pyarrow()
+        pc = get_pyarrow_compute()
+        series = self._native_series
+        unique_values = self.unique().sort()._native_series
+        columns = [pc.cast(pc.equal(series, v), pa.uint8()) for v in unique_values][
+            int(drop_first) :
+        ]
+        names = [f"{self._name}{separator}{v}" for v in unique_values][int(drop_first) :]
+
+        return ArrowDataFrame(
+            pa.Table.from_arrays(columns, names=names),
+            backend_version=self._backend_version,
+        )
+
     @property
     def shape(self) -> tuple[int]:
         return (len(self._native_series),)

diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py
@@ -23,6 +23,7 @@
 if TYPE_CHECKING:
     from typing_extensions import Self
 
+    from narwhals._pandas_like.dataframe import PandasLikeDataFrame
     from narwhals._pandas_like.namespace import PandasLikeNamespace
     from narwhals.dtypes import DType
 
@@ -567,7 +568,7 @@ def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> A
         ).reset_index()
         val_count.columns = [name_, "count"]
         if sort:
-            val_count = val_count.sort_values(name_)
+            val_count = val_count.sort_values("count", ascending=False)
 
         return PandasLikeDataFrame(
             val_count,
@@ -596,6 +597,25 @@ def tail(self: Self, n: int) -> Self:
     def round(self: Self, decimals: int) -> Self:
         return self._from_native_series(self._native_series.round(decimals=decimals))
 
+    def to_dummies(
+        self: Self, *, separator: str = "_", drop_first: bool = False
+    ) -> PandasLikeDataFrame:
+        from narwhals._pandas_like.dataframe import PandasLikeDataFrame
+
+        plx = self.__native_namespace__()
+        series = self._native_series
+        name = str(self._name) if self._name else ""
+        return PandasLikeDataFrame(
+            plx.get_dummies(
+                series,
+                prefix=name,
+                prefix_sep=separator,
+                drop_first=drop_first,
+            ).astype(int),
+            implementation=self._implementation,
+            backend_version=self._backend_version,
+        )
+
     @property
     def str(self) -> PandasLikeSeriesStringNamespace:
         return PandasLikeSeriesStringNamespace(self)

diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py
@@ -13,6 +13,7 @@
     import numpy as np
     from typing_extensions import Self
 
+    from narwhals._polars.dataframe import PolarsDataFrame
     from narwhals.dtypes import DType
 
 from narwhals._polars.namespace import PolarsNamespace
@@ -168,6 +169,21 @@ def __rpow__(self, other: PolarsSeries | Any) -> Self:
     def __invert__(self) -> Self:
         return self._from_native_series(self._native_series.__invert__())
 
+    def to_dummies(
+        self: Self, *, separator: str = "_", drop_first: bool = False
+    ) -> PolarsDataFrame:
+        from narwhals._polars.dataframe import PolarsDataFrame
+
+        if self._backend_version < (0, 20, 15):  # pragma: no cover
+            result = self._native_series.to_dummies(separator=separator)
+            result = result.select(result.columns[int(drop_first) :])
+        else:
+            result = self._native_series.to_dummies(
+                separator=separator, drop_first=drop_first
+            )
+
+        return PolarsDataFrame(result, backend_version=self._backend_version)
+
     @property
     def dt(self) -> PolarsSeriesDateTimeNamespace:
         return PolarsSeriesDateTimeNamespace(self)

diff --git a/narwhals/series.py b/narwhals/series.py
@@ -1963,6 +1963,79 @@ def round(self: Self, decimals: int = 0) -> Self:
         """
         return self._from_compliant_series(self._compliant_series.round(decimals))
 
+    def to_dummies(
+        self: Self, *, separator: str = "_", drop_first: bool = False
+    ) -> DataFrame[Any]:
+        r"""
+        Get dummy/indicator variables.
+
+        Arguments:
+            separator: Separator/delimiter used when generating column names.
+            drop_first: Remove the first category from the variable being encoded.
+
+        Notes:
+            pandas and Polars handle null values differently. Polars distinguishes
+            between NaN and Null, whereas pandas doesn't.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> data = [1, 2, 3]
+            >>> s_pd = pd.Series(data, name="a")
+            >>> s_pl = pl.Series("a", data)
+
+            Let's define a dataframe-agnostic function that computes dummy/indicator variables:
+
+            >>> @nw.narwhalify
+            ... def func(s_any, drop_first: bool = False):
+            ...     return s_any.to_dummies(drop_first=drop_first)
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(s_pd)
+               a_1  a_2  a_3
+            0    1    0    0
+            1    0    1    0
+            2    0    0    1
+
+            >>> func(s_pd, drop_first=True)
+               a_2  a_3
+            0    0    0
+            1    1    0
+            2    0    1
+
+            >>> func(s_pl)
+            shape: (3, 3)
+            ┌─────┬─────┬─────┐
+            │ a_1 ┆ a_2 ┆ a_3 │
+            │ --- ┆ --- ┆ --- │
+            │ u8  ┆ u8  ┆ u8  │
+            ╞═════╪═════╪═════╡
+            │ 1   ┆ 0   ┆ 0   │
+            │ 0   ┆ 1   ┆ 0   │
+            │ 0   ┆ 0   ┆ 1   │
+            └─────┴─────┴─────┘
+            >>> func(s_pl, drop_first=True)
+            shape: (3, 2)
+            ┌─────┬─────┐
+            │ a_2 ┆ a_3 │
+            │ --- ┆ --- │
+            │ u8  ┆ u8  │
+            ╞═════╪═════╡
+            │ 0   ┆ 0   │
+            │ 1   ┆ 0   │
+            │ 0   ┆ 1   │
+            └─────┴─────┘
+        """
+
+        from narwhals.dataframe import DataFrame
+
+        return DataFrame(
+            self._compliant_series.to_dummies(separator=separator, drop_first=drop_first),
+            level=self._level,
+        )
+
     @property
     def str(self) -> SeriesStringNamespace:
         return SeriesStringNamespace(self)

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "narwhals"
-version = "1.1.3"
+version = "1.1.4"
 authors = [
   { name="Marco Gorelli", email="[email protected]" },
 ]

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,5 +1,5 @@
 covdefaults
-ibis-framework
+ibis-framework[polars]
 pandas
 polars[timezones]
 pre-commit

diff --git a/tests/expr_and_series/sort_test.py b/tests/expr_and_series/sort_test.py
@@ -0,0 +1,31 @@
+from typing import Any
+
+import numpy as np
+import pytest
+
+import narwhals.stable.v1 as nw
+from tests.utils import compare_dicts
+
+data = {"a": [1, 3, 2], "b": [0, 2, -1]}
+
+
+@pytest.mark.parametrize(
+    ("descending", "expected"),
+    [
+        (True, {"a": [3, 2, 1], "b": [0, 2, -1]}),
+        (False, {"a": [1, 2, 3], "b": [0, 2, -1]}),
+    ],
+)
+def test_sort_expr(constructor: Any, descending: Any, expected: Any) -> None:
+    df = nw.from_native(constructor(data), eager_only=True)
+    result = df.select(nw.col("a").sort(descending=descending), "b")
+    compare_dicts(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("descending", "expected"), [(True, [3, 2, 1]), (False, [1, 2, 3])]
+)
+def test_sort_series(constructor_series: Any, descending: Any, expected: Any) -> None:
+    series = nw.from_native(constructor_series(data["a"]), series_only=True)
+    result = series.sort(descending=descending)
+    assert (result.to_numpy() == np.array(expected)).all()
diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py
@@ -1,14 +1,10 @@
 from typing import Any
 
-import pytest
-
 import narwhals as nw
 from tests.utils import compare_dicts
 
 
-def test_unary(request: Any, constructor_with_lazy: Any) -> None:
-    if "pyarrow_table" in str(constructor_with_lazy):
-        request.applymarker(pytest.mark.xfail)
+def test_unary(constructor_with_lazy: Any) -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
     result = (
         nw.from_native(constructor_with_lazy(data))

diff --git a/tests/expr_and_series/unique_test.py b/tests/expr_and_series/unique_test.py
@@ -0,0 +1,22 @@
+from typing import Any
+
+import numpy as np
+
+import narwhals.stable.v1 as nw
+from tests.utils import compare_dicts
+
+data = {"a": [1, 1, 2]}
+
+
+def test_unique_expr(constructor: Any) -> None:
+    df = nw.from_native(constructor(data), eager_only=True)
+    result = df.select(nw.col("a").unique())
+    expected = {"a": [1, 2]}
+    compare_dicts(result, expected)
+
+
+def test_unique_series(constructor_series: Any) -> None:
+    series = nw.from_native(constructor_series(data["a"]), series_only=True)
+    result = series.unique()
+    expected = np.array([1, 2])
+    assert (result.to_numpy() == expected).all()
diff --git a/tests/series_only/test_common.py b/tests/series_only/test_common.py
@@ -114,29 +114,6 @@ def test_to_numpy() -> None:
     assert nw_series.shape == (3,)
 
 
-def test_value_counts(request: Any, constructor_series: Any) -> None:
-    if "pyarrow_series" in str(constructor_series):
-        request.applymarker(pytest.mark.xfail)
-
-    if "pandas_series_nullable" in str(constructor_series):  # fails for py3.8
-        pytest.skip()
-
-    series = nw.from_native(constructor_series(data_dups).rename("b"), series_only=True)
-
-    sorted_result = series.value_counts(sort=True)
-    assert sorted_result.columns == ["b", "count"]
-
-    expected = np.array([[4, 2], [6, 1]])
-    assert (sorted_result.to_numpy() == expected).all()
-
-    unsorted_result = series.value_counts(sort=False)
-    assert unsorted_result.columns == ["b", "count"]
-
-    a = unsorted_result.to_numpy()
-
-    assert (a[a[:, 0].argsort()] == expected).all()
-
-
 @pytest.mark.parametrize(
     ("interpolation", "expected"),
     [
-Original file line number
+Diff line change
@@ Expand Up / @@ -44,6 +44,7 @@ @@
             - std
             - sum
             - tail
+            - to_dummies
             - to_frame
             - to_list
             - to_numpy
@@ Expand Down @@