Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into add-where-expression
Browse files Browse the repository at this point in the history
  • Loading branch information
aivanoved committed Jul 23, 2024
2 parents add7b89 + 372f3c2 commit 504c4ea
Show file tree
Hide file tree
Showing 18 changed files with 285 additions and 36 deletions.
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
- std
- sum
- tail
- to_dummies
- to_frame
- to_list
- to_numpy
Expand Down
2 changes: 1 addition & 1 deletion docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following:
```python
>>> import narwhals
>>> narwhals.__version__
'1.1.3'
'1.1.4'
```
then installation worked correctly!
2 changes: 1 addition & 1 deletion narwhals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
from narwhals.utils import maybe_convert_dtypes
from narwhals.utils import maybe_set_index

__version__ = "1.1.3"
__version__ = "1.1.4"

__all__ = [
"selectors",
Expand Down
6 changes: 6 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,12 @@ def is_first_distinct(self: Self) -> Self:
def is_last_distinct(self: Self) -> Self:
    """Expression counterpart of the series-level `is_last_distinct`.

    The work is delegated to the series method of the same name through
    `reuse_series_implementation`.
    """
    method_name = "is_last_distinct"
    return reuse_series_implementation(self, method_name)

def unique(self: Self) -> Self:
    """Expression counterpart of the series-level `unique`.

    The work is delegated to the series method of the same name through
    `reuse_series_implementation`.
    """
    method_name = "unique"
    return reuse_series_implementation(self, method_name)

def sort(self: Self, *, descending: bool = False) -> Self:
    """Expression counterpart of the series-level `sort`.

    Arguments:
        descending: Forwarded unchanged to the series implementation.
    """
    forwarded = {"descending": descending}
    return reuse_series_implementation(self, "sort", **forwarded)

@property
def dt(self: Self) -> ArrowExprDateTimeNamespace:
    """Accessor that wraps this expression in an `ArrowExprDateTimeNamespace`."""
    namespace = ArrowExprDateTimeNamespace(self)
    return namespace
Expand Down
57 changes: 57 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,30 @@ def item(self: Self, index: int | None = None) -> Any:
return self._native_series[0].as_py()
return self._native_series[index].as_py()

def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> Any:  # noqa: ARG002
    """Count how often each distinct value occurs.

    Arguments:
        sort: When True, order the rows by the "count" column, descending.
        parallel: Unused, exists for compatibility.

    Returns:
        An `ArrowDataFrame` with two columns: the distinct values (named
        after this series, or "index" when the series has no name) and
        "count".
    """
    from narwhals._arrow.dataframe import ArrowDataFrame

    pc = get_pyarrow_compute()
    pa = get_pyarrow()

    # Use the name tracked by this wrapper (the same attribute `to_dummies`
    # relies on) rather than the private `_name` of the native pyarrow
    # object, which is not kept in sync with the narwhals-level name.
    name_ = "index" if self._name is None else self._name

    counts = pc.value_counts(self._native_series)
    table = pa.Table.from_arrays(
        [counts.field("values"), counts.field("counts")],
        names=[name_, "count"],
    )

    if sort:
        table = table.sort_by([("count", "descending")])

    return ArrowDataFrame(
        table,
        backend_version=self._backend_version,
    )

def zip_with(self: Self, mask: Self, other: Self) -> Self:
pc = get_pyarrow_compute()

Expand Down Expand Up @@ -444,6 +468,39 @@ def is_sorted(self: Self, *, descending: bool = False) -> bool:
else:
return pc.all(pc.less_equal(ser[:-1], ser[1:])).as_py() # type: ignore[no-any-return]

def unique(self: Self) -> ArrowSeries:
    """Return a new series holding the result of the compute module's `unique`."""
    compute = get_pyarrow_compute()
    distinct = compute.unique(self._native_series)
    return self._from_native_series(distinct)

def sort(self: Self, *, descending: bool = False) -> ArrowSeries:
    """Return a sorted copy of the series.

    Arguments:
        descending: Sort in descending order when True, ascending otherwise.

    Nulls are requested at the start (`null_placement="at_start"`) for
    both sort directions.
    """
    compute = get_pyarrow_compute()
    native = self._native_series
    if descending:
        direction = "descending"
    else:
        direction = "ascending"
    # Sort indirectly: compute the permutation first, then gather the values.
    permutation = compute.array_sort_indices(
        native, order=direction, null_placement="at_start"
    )
    reordered = compute.take(native, permutation)
    return self._from_native_series(reordered)

def to_dummies(
    self: Self, *, separator: str = "_", drop_first: bool = False
) -> ArrowDataFrame:
    """Build one uint8 indicator column per distinct value of the series.

    Arguments:
        separator: Text placed between the series name and the value in
            each generated column name.
        drop_first: When True, the first category (in sorted order) is
            left out of the result.

    Returns:
        An `ArrowDataFrame` whose columns are named
        `<series name><separator><value>`.
    """
    from narwhals._arrow.dataframe import ArrowDataFrame

    pa = get_pyarrow()
    pc = get_pyarrow_compute()
    native = self._native_series
    skip = int(drop_first)
    # Categories in sorted order, minus the leading one when `drop_first` is set.
    categories = list(self.unique().sort()._native_series)[skip:]
    indicator_columns = [
        pc.cast(pc.equal(native, category), pa.uint8()) for category in categories
    ]
    column_names = [f"{self._name}{separator}{category}" for category in categories]
    table = pa.Table.from_arrays(indicator_columns, names=column_names)

    return ArrowDataFrame(table, backend_version=self._backend_version)

@property
def shape(self) -> tuple[int]:
    """One-element tuple containing the length of the native series."""
    n_rows = len(self._native_series)
    return (n_rows,)
Expand Down
22 changes: 21 additions & 1 deletion narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
if TYPE_CHECKING:
from typing_extensions import Self

from narwhals._pandas_like.dataframe import PandasLikeDataFrame
from narwhals._pandas_like.namespace import PandasLikeNamespace
from narwhals.dtypes import DType

Expand Down Expand Up @@ -567,7 +568,7 @@ def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> A
).reset_index()
val_count.columns = [name_, "count"]
if sort:
val_count = val_count.sort_values(name_)
val_count = val_count.sort_values("count", ascending=False)

return PandasLikeDataFrame(
val_count,
Expand Down Expand Up @@ -596,6 +597,25 @@ def tail(self: Self, n: int) -> Self:
def round(self: Self, decimals: int) -> Self:
    """Round the native series and wrap the result in a new series.

    Arguments:
        decimals: Passed to the native `round` as its `decimals` keyword.
    """
    rounded = self._native_series.round(decimals=decimals)
    return self._from_native_series(rounded)

def to_dummies(
    self: Self, *, separator: str = "_", drop_first: bool = False
) -> PandasLikeDataFrame:
    """Build integer indicator columns with the native `get_dummies`.

    Arguments:
        separator: Passed to `get_dummies` as `prefix_sep`.
        drop_first: Passed to `get_dummies` as `drop_first`.

    Returns:
        A `PandasLikeDataFrame` wrapping the `get_dummies` result cast to
        `int`.
    """
    from narwhals._pandas_like.dataframe import PandasLikeDataFrame

    native_namespace = self.__native_namespace__()
    native = self._native_series
    # A falsy name (e.g. None or "") yields an empty prefix.
    if self._name:
        prefix = str(self._name)
    else:
        prefix = ""
    dummies = native_namespace.get_dummies(
        native,
        prefix=prefix,
        prefix_sep=separator,
        drop_first=drop_first,
    )
    return PandasLikeDataFrame(
        dummies.astype(int),
        implementation=self._implementation,
        backend_version=self._backend_version,
    )

@property
def str(self) -> PandasLikeSeriesStringNamespace:
    """Accessor that wraps this series in a `PandasLikeSeriesStringNamespace`."""
    namespace = PandasLikeSeriesStringNamespace(self)
    return namespace
Expand Down
16 changes: 16 additions & 0 deletions narwhals/_polars/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import numpy as np
from typing_extensions import Self

from narwhals._polars.dataframe import PolarsDataFrame
from narwhals.dtypes import DType

from narwhals._polars.namespace import PolarsNamespace
Expand Down Expand Up @@ -168,6 +169,21 @@ def __rpow__(self, other: PolarsSeries | Any) -> Self:
def __invert__(self) -> Self:
    """Apply `__invert__` to the native series and wrap the result."""
    inverted = self._native_series.__invert__()
    return self._from_native_series(inverted)

def to_dummies(
    self: Self, *, separator: str = "_", drop_first: bool = False
) -> PolarsDataFrame:
    """Build indicator columns with the native `to_dummies`.

    Arguments:
        separator: Forwarded to the native `to_dummies`.
        drop_first: Whether the first generated column is left out.

    Returns:
        A `PolarsDataFrame` wrapping the native result.
    """
    from narwhals._polars.dataframe import PolarsDataFrame

    native = self._native_series
    if self._backend_version >= (0, 20, 15):
        dummies = native.to_dummies(separator=separator, drop_first=drop_first)
    else:  # pragma: no cover
        # Below 0.20.15 `drop_first` is not passed to the native call
        # (presumably unsupported there); the first column is removed by
        # hand instead.
        dummies = native.to_dummies(separator=separator)
        dummies = dummies.select(dummies.columns[int(drop_first) :])

    return PolarsDataFrame(dummies, backend_version=self._backend_version)

@property
def dt(self) -> PolarsSeriesDateTimeNamespace:
    """Accessor that wraps this series in a `PolarsSeriesDateTimeNamespace`."""
    namespace = PolarsSeriesDateTimeNamespace(self)
    return namespace
Expand Down
73 changes: 73 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1963,6 +1963,79 @@ def round(self: Self, decimals: int = 0) -> Self:
"""
return self._from_compliant_series(self._compliant_series.round(decimals))

def to_dummies(
    self: Self, *, separator: str = "_", drop_first: bool = False
) -> DataFrame[Any]:
    r"""
    Get dummy/indicator variables.

    Arguments:
        separator: Separator/delimiter used when generating column names.
        drop_first: Remove the first category from the variable being encoded.

    Notes:
        pandas and Polars handle null values differently. Polars distinguishes
        between NaN and Null, whereas pandas doesn't.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = [1, 2, 3]
        >>> s_pd = pd.Series(data, name="a")
        >>> s_pl = pl.Series("a", data)

        Let's define a dataframe-agnostic function that builds the
        dummy/indicator columns of a series:

        >>> @nw.narwhalify
        ... def func(s_any, drop_first: bool = False):
        ...     return s_any.to_dummies(drop_first=drop_first)

        We can then pass either pandas or Polars to `func`:

        >>> func(s_pd)
           a_1  a_2  a_3
        0    1    0    0
        1    0    1    0
        2    0    0    1

        >>> func(s_pd, drop_first=True)
           a_2  a_3
        0    0    0
        1    1    0
        2    0    1

        >>> func(s_pl)
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ a_1 ┆ a_2 ┆ a_3 │
        │ --- ┆ --- ┆ --- │
        │ u8  ┆ u8  ┆ u8  │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 0   ┆ 0   │
        │ 0   ┆ 1   ┆ 0   │
        │ 0   ┆ 0   ┆ 1   │
        └─────┴─────┴─────┘

        >>> func(s_pl, drop_first=True)
        shape: (3, 2)
        ┌─────┬─────┐
        │ a_2 ┆ a_3 │
        │ --- ┆ --- │
        │ u8  ┆ u8  │
        ╞═════╪═════╡
        │ 0   ┆ 0   │
        │ 1   ┆ 0   │
        │ 0   ┆ 1   │
        └─────┴─────┘
    """
    from narwhals.dataframe import DataFrame

    # Let the backend-specific series build the frame, then wrap it at the
    # same level as this series.
    compliant_frame = self._compliant_series.to_dummies(
        separator=separator, drop_first=drop_first
    )
    return DataFrame(compliant_frame, level=self._level)

@property
def str(self) -> SeriesStringNamespace:
    """Accessor that wraps this series in a `SeriesStringNamespace`."""
    namespace = SeriesStringNamespace(self)
    return namespace
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "narwhals"
version = "1.1.3"
version = "1.1.4"
authors = [
{ name="Marco Gorelli", email="[email protected]" },
]
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
covdefaults
ibis-framework
ibis-framework[polars]
pandas
polars[timezones]
pre-commit
Expand Down
31 changes: 31 additions & 0 deletions tests/expr_and_series/sort_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from typing import Any

import numpy as np
import pytest

import narwhals.stable.v1 as nw
from tests.utils import compare_dicts

data = {"a": [1, 3, 2], "b": [0, 2, -1]}


@pytest.mark.parametrize(
    ("descending", "expected"),
    [
        (True, {"a": [3, 2, 1], "b": [0, 2, -1]}),
        (False, {"a": [1, 2, 3], "b": [0, 2, -1]}),
    ],
)
def test_sort_expr(constructor: Any, descending: Any, expected: Any) -> None:
    """Sorting "a" via an expression reorders "a" only; "b" keeps its input order."""
    frame = nw.from_native(constructor(data), eager_only=True)
    sorted_a = nw.col("a").sort(descending=descending)
    selected = frame.select(sorted_a, "b")
    compare_dicts(selected, expected)


@pytest.mark.parametrize(
    ("descending", "expected"),
    [
        (True, [3, 2, 1]),
        (False, [1, 2, 3]),
    ],
)
def test_sort_series(constructor_series: Any, descending: Any, expected: Any) -> None:
    """`Series.sort` returns the values of "a" in the requested order."""
    native = constructor_series(data["a"])
    wrapped = nw.from_native(native, series_only=True)
    actual = wrapped.sort(descending=descending).to_numpy()
    assert (actual == np.array(expected)).all()
6 changes: 1 addition & 5 deletions tests/expr_and_series/unary_test.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
from typing import Any

import pytest

import narwhals as nw
from tests.utils import compare_dicts


def test_unary(request: Any, constructor_with_lazy: Any) -> None:
if "pyarrow_table" in str(constructor_with_lazy):
request.applymarker(pytest.mark.xfail)
def test_unary(constructor_with_lazy: Any) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
result = (
nw.from_native(constructor_with_lazy(data))
Expand Down
22 changes: 22 additions & 0 deletions tests/expr_and_series/unique_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from typing import Any

import numpy as np

import narwhals.stable.v1 as nw
from tests.utils import compare_dicts

data = {"a": [1, 1, 2]}


def test_unique_expr(constructor: Any) -> None:
    """Selecting `nw.col("a").unique()` yields the distinct values of "a"."""
    frame = nw.from_native(constructor(data), eager_only=True)
    distinct = frame.select(nw.col("a").unique())
    compare_dicts(distinct, {"a": [1, 2]})


def test_unique_series(constructor_series: Any) -> None:
    """`Series.unique` yields the distinct values of the input series."""
    native = constructor_series(data["a"])
    wrapped = nw.from_native(native, series_only=True)
    actual = wrapped.unique().to_numpy()
    assert (actual == np.array([1, 2])).all()
23 changes: 0 additions & 23 deletions tests/series_only/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,29 +114,6 @@ def test_to_numpy() -> None:
assert nw_series.shape == (3,)


def test_value_counts(request: Any, constructor_series: Any) -> None:
    """Check column names and counts of `value_counts`, sorted and unsorted."""
    constructor_name = str(constructor_series)
    if "pyarrow_series" in constructor_name:
        request.applymarker(pytest.mark.xfail)

    if "pandas_series_nullable" in constructor_name:  # fails for py3.8
        pytest.skip()

    native = constructor_series(data_dups).rename("b")
    series = nw.from_native(native, series_only=True)
    expected_columns = ["b", "count"]
    expected = np.array([[4, 2], [6, 1]])

    with_sort = series.value_counts(sort=True)
    assert with_sort.columns == expected_columns
    assert (with_sort.to_numpy() == expected).all()

    without_sort = series.value_counts(sort=False)
    assert without_sort.columns == expected_columns

    # Row order is unspecified without sorting, so order by value before comparing.
    rows = without_sort.to_numpy()
    row_order = rows[:, 0].argsort()
    assert (rows[row_order] == expected).all()


@pytest.mark.parametrize(
("interpolation", "expected"),
[
Expand Down
Loading

0 comments on commit 504c4ea

Please sign in to comment.