feat: series and expr quantile (#207)

* feat: series and expr quantile
* feedback on interpolation default
* mypy
* pin numpy in min version ci
* merge main
* implementation difference note
narwhals-dev · May 28, 2024 · a8c8a4b · a8c8a4b
1 parent fcd7292
commit a8c8a4b
Show file tree

Hide file tree

Showing 9 changed files with 167 additions and 2 deletions.
diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml
@@ -29,7 +29,7 @@ jobs:
       - name: install-reqs
         run: python -m pip install --upgrade tox virtualenv setuptools pip -r requirements-dev.txt
       - name: install-modin
-        run: python -m pip install pandas==1.1.5 polars==0.20.3
+        run: python -m pip install pandas==1.1.5 polars==0.20.3 "numpy<=1.21"
       - name: Run pytest
         run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow
       - name: Run doctests

diff --git a/docs/api-reference/expressions.md b/docs/api-reference/expressions.md
@@ -26,11 +26,12 @@
         - null_count
         - n_unique
         - over
-        - unique
+        - quantile
         - sample
         - shift
         - sort
         - std
         - sum
+        - unique
       show_source: false
       show_bases: false
diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md
@@ -29,6 +29,7 @@
         - name
         - null_count
         - n_unique
+        - quantile
         - sample
         - shape
         - shift

diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py
@@ -3,6 +3,7 @@
 from typing import TYPE_CHECKING
 from typing import Any
 from typing import Callable
+from typing import Literal
 
 from narwhals._pandas_like.series import PandasSeries
 from narwhals._pandas_like.utils import reuse_series_implementation
@@ -278,6 +279,13 @@ def is_first_distinct(self) -> Self:
     def is_last_distinct(self) -> Self:
         return reuse_series_implementation(self, "is_last_distinct")
 
+    def quantile(  # uses the helper this module imports, like is_last_distinct
+        self,
+        quantile: float,
+        interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
+    ) -> Self:
+        return reuse_series_implementation(self, "quantile", quantile, interpolation)
+
     @property
     def str(self) -> PandasExprStringNamespace:
         return PandasExprStringNamespace(self)

diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py
@@ -2,6 +2,7 @@
 
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import Literal
 from typing import Sequence
 
 from narwhals._pandas_like.utils import item
@@ -473,6 +474,13 @@ def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> A
             implementation=self._implementation,
         )
 
+    def quantile(
+        self: Self,
+        quantile: float,
+        interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
+    ) -> Any:
+        return self._series.quantile(q=quantile, interpolation=interpolation)
+
     def zip_with(self: Self, mask: Any, other: Any) -> PandasSeries:
         ser = self._series
         res = ser.where(mask._series, other._series)

diff --git a/narwhals/expression.py b/narwhals/expression.py
@@ -4,6 +4,7 @@
 from typing import Any
 from typing import Callable
 from typing import Iterable
+from typing import Literal
 
 from narwhals.dependencies import get_polars
 from narwhals.dtypes import translate_dtype
@@ -1417,6 +1418,57 @@ def is_last_distinct(self) -> Expr:
         """
         return self.__class__(lambda plx: self._call(plx).is_last_distinct())
 
+    def quantile(
+        self,
+        quantile: float,
+        interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
+    ) -> Expr:
+        r"""Get quantile value.
+
+        Note:
+            pandas and Polars may have implementation differences for a given interpolation method.
+
+        Arguments:
+            quantile : float
+                Quantile between 0.0 and 1.0.
+            interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
+                Interpolation method.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> data = {'a': list(range(50)), 'b': list(range(50, 100))}
+            >>> df_pd = pd.DataFrame(data)
+            >>> df_pl = pl.DataFrame(data)
+
+            Let's define a dataframe-agnostic function:
+
+            >>> def func(df_any):
+            ...     df = nw.from_native(df_any)
+            ...     result = df.select(nw.col('a', 'b').quantile(0.5, interpolation='linear'))
+            ...     return nw.to_native(result)
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
+                a   b
+            0  24.5  74.5
+
+            >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (1, 2)
+            ┌──────┬──────┐
+            │ a    ┆ b    │
+            │ ---  ┆ ---  │
+            │ f64  ┆ f64  │
+            ╞══════╪══════╡
+            │ 24.5 ┆ 74.5 │
+            └──────┴──────┘
+        """
+        return self.__class__(
+            lambda plx: self._call(plx).quantile(quantile, interpolation)
+        )
+
     @property
     def str(self) -> ExprStringNamespace:
         return ExprStringNamespace(self)

diff --git a/narwhals/series.py b/narwhals/series.py
@@ -2,6 +2,7 @@
 
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import Literal
 
 from narwhals.dtypes import to_narwhals_dtype
 from narwhals.dtypes import translate_dtype
@@ -1482,6 +1483,50 @@ def value_counts(
 
         return DataFrame(self._series.value_counts(sort=sort, parallel=parallel))
 
+    def quantile(
+        self,
+        quantile: float,
+        interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
+    ) -> Any:
+        """
+        Get quantile value of the series.
+
+        Note:
+            pandas and Polars may have implementation differences for a given interpolation method.
+
+        Arguments:
+            quantile : float
+                Quantile between 0.0 and 1.0.
+            interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
+                Interpolation method.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> data = list(range(50))
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+
+            Let's define a dataframe-agnostic function:
+
+            >>> def func(s_any):
+            ...     series = nw.from_native(s_any, allow_series=True)
+            ...     return [
+            ...         series.quantile(quantile=q, interpolation='nearest')
+            ...         for q in (0.1, 0.25, 0.5, 0.75, 0.9)
+            ...         ]
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(s_pd)  # doctest: +NORMALIZE_WHITESPACE
+            [5, 12, 24, 37, 44]
+
+            >>> func(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            [5.0, 12.0, 25.0, 37.0, 44.0]
+        """
+        return self._series.quantile(quantile=quantile, interpolation=interpolation)
+
     def zip_with(self, mask: Any, other: Any) -> Self:
         """
         Take values from self or other based on the given mask. Where mask evaluates true, take values from self. Where mask evaluates false, take values from other.

diff --git a/tests/test_common.py b/tests/test_common.py
@@ -3,6 +3,7 @@
 import os
 import warnings
 from typing import Any
+from typing import Literal
 
 import numpy as np
 import pandas as pd
@@ -680,3 +681,28 @@ def test_null_count(df_raw: Any) -> None:
     result = nw.to_native(df.null_count())
     expected = {"a": [1], "b": [0], "z": [1]}
     compare_dicts(result, expected)
+
+
+@pytest.mark.parametrize("df_raw", [df_pandas, df_polars])
+@pytest.mark.parametrize(
+    ("interpolation", "expected"),
+    [
+        ("lower", {"a": [1.0], "b": [4.0], "z": [7.0]}),
+        ("higher", {"a": [2.0], "b": [4.0], "z": [8.0]}),
+        ("midpoint", {"a": [1.5], "b": [4.0], "z": [7.5]}),
+        ("linear", {"a": [1.6], "b": [4.0], "z": [7.6]}),
+        ("nearest", {"a": [2.0], "b": [4.0], "z": [8.0]}),
+    ],
+)
+def test_quantile(
+    df_raw: Any,
+    interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
+    expected: dict[str, list[float]],
+) -> None:
+    q = 0.3
+
+    df = nw.from_native(df_raw)
+    result = nw.to_native(
+        df.select(nw.all().quantile(quantile=q, interpolation=interpolation))
+    )
+    compare_dicts(result, expected)
diff --git a/tests/test_series.py b/tests/test_series.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from typing import Any
+from typing import Literal
 
 import numpy as np
 import pandas as pd
@@ -418,6 +419,29 @@ def test_is_sorted_invalid(df_raw: Any) -> None:
         series.is_sorted(descending="invalid_type")  # type: ignore[arg-type]
 
 
+@pytest.mark.parametrize("df_raw", [df_pandas, df_polars])
+@pytest.mark.parametrize(
+    ("interpolation", "expected"),
+    [
+        ("lower", 7.0),
+        ("higher", 8.0),
+        ("midpoint", 7.5),
+        ("linear", 7.6),
+        ("nearest", 8.0),
+    ],
+)
+def test_quantile(
+    df_raw: Any,
+    interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
+    expected: float,
+) -> None:
+    q = 0.3
+
+    series = nw.from_native(df_raw["z"], allow_series=True)
+    result = series.quantile(quantile=q, interpolation=interpolation)  # type: ignore[union-attr]
+    assert result == expected
+
+
 @pytest.mark.parametrize(
     ("df_raw", "mask", "expected"),
     [