Skip to content

Commit

Permalink
feat: series and expr quantile (#207)
Browse files Browse the repository at this point in the history
* feat: series and expr quantile

* feedback on interpolation default

* mypy

* pin numpy in min version ci

* merge main

* implementation difference note
  • Loading branch information
FBruzzesi authored May 28, 2024
1 parent fcd7292 commit a8c8a4b
Show file tree
Hide file tree
Showing 9 changed files with 167 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/extremes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
- name: install-reqs
run: python -m pip install --upgrade tox virtualenv setuptools pip -r requirements-dev.txt
- name: install-modin
run: python -m pip install pandas==1.1.5 polars==0.20.3
run: python -m pip install pandas==1.1.5 polars==0.20.3 "numpy<=1.21"
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow
- name: Run doctests
Expand Down
3 changes: 2 additions & 1 deletion docs/api-reference/expressions.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@
- null_count
- n_unique
- over
- unique
- quantile
- sample
- shift
- sort
- std
- sum
- unique
show_source: false
show_bases: false
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
- name
- null_count
- n_unique
- quantile
- sample
- shape
- shift
Expand Down
8 changes: 8 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import TYPE_CHECKING
from typing import Any
from typing import Callable
from typing import Literal

from narwhals._pandas_like.series import PandasSeries
from narwhals._pandas_like.utils import reuse_series_implementation
Expand Down Expand Up @@ -278,6 +279,13 @@ def is_first_distinct(self) -> Self:
def is_last_distinct(self) -> Self:
    """Evaluate ``is_last_distinct`` through the shared series-level helper.

    The work is handed to ``reuse_series_implementation`` together with the
    name of the series method to invoke.
    """
    method_name = "is_last_distinct"
    return reuse_series_implementation(self, method_name)

def quantile(
    self,
    quantile: float,
    interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
) -> Self:
    """Compute a quantile by delegating to the series-level ``quantile``.

    Arguments:
        quantile: Quantile to compute; forwarded unchanged to the series method.
        interpolation: Interpolation method; forwarded unchanged to the
            series method.

    Returns:
        Whatever ``reuse_series_implementation`` builds for this expression
        (annotated as ``Self``).
    """
    # Delegate through `reuse_series_implementation`, the helper this module
    # imports and the one the neighbouring expression methods call;
    # `register_expression_call` is not among the names imported here.
    return reuse_series_implementation(self, "quantile", quantile, interpolation)

@property
def str(self) -> PandasExprStringNamespace:
    """Return a ``PandasExprStringNamespace`` wrapping this expression."""
    namespace = PandasExprStringNamespace(self)
    return namespace
Expand Down
8 changes: 8 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from typing import TYPE_CHECKING
from typing import Any
from typing import Literal
from typing import Sequence

from narwhals._pandas_like.utils import item
Expand Down Expand Up @@ -473,6 +474,13 @@ def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> A
implementation=self._implementation,
)

def quantile(
    self: Self,
    quantile: float,
    interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
) -> Any:
    """Return the requested quantile of the wrapped native series.

    Arguments:
        quantile: Quantile to compute; passed to the native series as ``q``.
        interpolation: Interpolation method; passed through unchanged.

    Returns:
        The value produced by the native series' own ``quantile`` method.
    """
    native = self._series
    # The native API names the first argument `q`, not `quantile`.
    return native.quantile(q=quantile, interpolation=interpolation)

def zip_with(self: Self, mask: Any, other: Any) -> PandasSeries:
ser = self._series
res = ser.where(mask._series, other._series)
Expand Down
52 changes: 52 additions & 0 deletions narwhals/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Any
from typing import Callable
from typing import Iterable
from typing import Literal

from narwhals.dependencies import get_polars
from narwhals.dtypes import translate_dtype
Expand Down Expand Up @@ -1417,6 +1418,57 @@ def is_last_distinct(self) -> Expr:
"""
return self.__class__(lambda plx: self._call(plx).is_last_distinct())

def quantile(
    self,
    quantile: float,
    interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
) -> Expr:
    """Get the quantile value.

    Note:
        For a given interpolation method, the pandas and Polars
        implementations may differ, so results are not guaranteed to match.

    Arguments:
        quantile : float
            Quantile to compute, between 0.0 and 1.0.

        interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
            Interpolation method to use.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {'a': list(range(50)), 'b': list(range(50, 100))}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        Let's define a dataframe-agnostic function:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     result = df.select(nw.col('a', 'b').quantile(0.5, interpolation='linear'))
        ...     return nw.to_native(result)

        We can then pass either pandas or Polars to `func`:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
              a     b
        0  24.5  74.5

        >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (1, 2)
        ┌──────┬──────┐
        │ a    ┆ b    │
        │ ---  ┆ ---  │
        │ f64  ┆ f64  │
        ╞══════╪══════╡
        │ 24.5 ┆ 74.5 │
        └──────┴──────┘
    """

    def compute(plx: Any) -> Any:
        # Evaluate this expression first, then ask the result for its quantile.
        return self._call(plx).quantile(quantile, interpolation)

    return self.__class__(compute)

@property
def str(self) -> ExprStringNamespace:
    """Return an ``ExprStringNamespace`` wrapping this expression."""
    namespace = ExprStringNamespace(self)
    return namespace
Expand Down
45 changes: 45 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from typing import TYPE_CHECKING
from typing import Any
from typing import Literal

from narwhals.dtypes import to_narwhals_dtype
from narwhals.dtypes import translate_dtype
Expand Down Expand Up @@ -1482,6 +1483,50 @@ def value_counts(

return DataFrame(self._series.value_counts(sort=sort, parallel=parallel))

def quantile(
    self,
    quantile: float,
    interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
) -> Any:
    """
    Get the quantile value of the series.

    Note:
        For a given interpolation method, the pandas and Polars
        implementations may differ, so results are not guaranteed to match.

    Arguments:
        quantile : float
            Quantile to compute, between 0.0 and 1.0.

        interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
            Interpolation method to use.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = list(range(50))
        >>> s_pd = pd.Series(data)
        >>> s_pl = pl.Series(data)

        Let's define a dataframe-agnostic function:

        >>> def func(s_any):
        ...     series = nw.from_native(s_any, allow_series=True)
        ...     return [
        ...         series.quantile(quantile=q, interpolation='nearest')
        ...         for q in (0.1, 0.25, 0.5, 0.75, 0.9)
        ...     ]

        We can then pass either pandas or Polars to `func`:

        >>> func(s_pd)  # doctest: +NORMALIZE_WHITESPACE
        [5, 12, 24, 37, 44]

        >>> func(s_pl)  # doctest: +NORMALIZE_WHITESPACE
        [5.0, 12.0, 25.0, 37.0, 44.0]
    """
    wrapped = self._series
    return wrapped.quantile(quantile=quantile, interpolation=interpolation)

def zip_with(self, mask: Any, other: Any) -> Self:
"""
Take values from self or other based on the given mask. Where mask evaluates true, take values from self. Where mask evaluates false, take values from other.
Expand Down
26 changes: 26 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import warnings
from typing import Any
from typing import Literal

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -680,3 +681,28 @@ def test_null_count(df_raw: Any) -> None:
result = nw.to_native(df.null_count())
expected = {"a": [1], "b": [0], "z": [1]}
compare_dicts(result, expected)


@pytest.mark.parametrize("df_raw", [df_pandas, df_polars])
@pytest.mark.parametrize(
    ("interpolation", "expected"),
    [
        ("lower", {"a": [1.0], "b": [4.0], "z": [7.0]}),
        ("higher", {"a": [2.0], "b": [4.0], "z": [8.0]}),
        ("midpoint", {"a": [1.5], "b": [4.0], "z": [7.5]}),
        ("linear", {"a": [1.6], "b": [4.0], "z": [7.6]}),
        ("nearest", {"a": [2.0], "b": [4.0], "z": [8.0]}),
    ],
)
def test_quantile(
    df_raw: Any,
    interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
    expected: dict[str, list[float]],
) -> None:
    """Check the 0.3 quantile of every column for each interpolation method."""
    frame = nw.from_native(df_raw)
    quantile_expr = nw.all().quantile(quantile=0.3, interpolation=interpolation)
    selected = frame.select(quantile_expr)
    compare_dicts(nw.to_native(selected), expected)
24 changes: 24 additions & 0 deletions tests/test_series.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from typing import Any
from typing import Literal

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -418,6 +419,29 @@ def test_is_sorted_invalid(df_raw: Any) -> None:
series.is_sorted(descending="invalid_type") # type: ignore[arg-type]


@pytest.mark.parametrize("df_raw", [df_pandas, df_polars])
@pytest.mark.parametrize(
    ("interpolation", "expected"),
    [
        ("lower", 7.0),
        ("higher", 8.0),
        ("midpoint", 7.5),
        ("linear", 7.6),
        ("nearest", 8.0),
    ],
)
def test_quantile(
    df_raw: Any,
    interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
    expected: float,
) -> None:
    """Check the 0.3 quantile of column ``z`` for each interpolation method."""
    column = nw.from_native(df_raw["z"], allow_series=True)
    observed = column.quantile(quantile=0.3, interpolation=interpolation)  # type: ignore[union-attr]
    assert observed == expected


@pytest.mark.parametrize(
("df_raw", "mask", "expected"),
[
Expand Down

0 comments on commit a8c8a4b

Please sign in to comment.