From b15b43611428965882f1d9b085de212fac180889 Mon Sep 17 00:00:00 2001 From: Magdalena Anopsy <74981211+anopsy@users.noreply.github.com> Date: Wed, 8 May 2024 15:26:18 +0000 Subject: [PATCH 01/38] noop From 52a987e7f2fb76843d3a7ffa8f9c033ea53bce6c Mon Sep 17 00:00:00 2001 From: ugohuche Date: Fri, 10 May 2024 20:28:08 +0100 Subject: [PATCH 02/38] Added millisecond temporal method --- narwhals/_pandas_like/expr.py | 3 +++ narwhals/_pandas_like/series.py | 5 +++++ narwhals/expression.py | 3 +++ narwhals/series.py | 3 +++ 4 files changed, 14 insertions(+) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index cc7b6e8aa..41bd6383a 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -312,6 +312,9 @@ def minute(self) -> PandasExpr: def second(self) -> PandasExpr: return register_namespace_expression_call(self._expr, "dt", "second") + + def millisecond(self) -> PandasExpr: + return register_namespace_expression_call(self._expr, "dt", "millisecond") def ordinal_day(self) -> PandasExpr: return register_namespace_expression_call(self._expr, "dt", "ordinal_day") diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 854ba1614..70be50f7d 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -482,6 +482,11 @@ def second(self) -> PandasSeries: return self._series._from_series( self._series._series.dt.second, ) + + def millisecond(self) -> PandasSeries: + return self._series._from_series( + self._series._series.dt.millisecond, + ) def ordinal_day(self) -> PandasSeries: ser = self._series._series diff --git a/narwhals/expression.py b/narwhals/expression.py index cea122bee..b622aa2a2 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -1065,6 +1065,9 @@ def minute(self) -> Expr: def second(self) -> Expr: return self._expr.__class__(lambda plx: self._expr._call(plx).dt.second()) + + def millisecond(self) -> Expr: + return self._expr.__class__(lambda plx: self._expr._call(plx).dt.millisecond()) def ordinal_day(self) -> Expr: """ diff --git a/narwhals/series.py b/narwhals/series.py index e291cc440..e1f75b68b 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -618,6 +618,9 @@ def minute(self) -> Series: def second(self) -> Series: return self._series.__class__(self._series._series.dt.second()) + + def millisecond(self) -> Series: + return self._series.__class__(self._series._series.dt.millisecond()) def ordinal_day(self) -> Series: """ From f45c8471c40e9a6f85f0146a5ec0d27bd5e11d89 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 May 2024 19:30:05 +0000 Subject: [PATCH 03/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- narwhals/_pandas_like/expr.py | 2 +- narwhals/_pandas_like/series.py | 2 +- narwhals/expression.py | 2 +- narwhals/series.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 41bd6383a..a53bb1e59 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -312,7 +312,7 @@ def minute(self) -> PandasExpr: def second(self) -> PandasExpr: return register_namespace_expression_call(self._expr, "dt", "second") - + def millisecond(self) -> PandasExpr: return register_namespace_expression_call(self._expr, "dt", "millisecond") diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 70be50f7d..13b755268 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -482,7 +482,7 @@ def second(self) -> PandasSeries: return self._series._from_series( self._series._series.dt.second, ) - + def millisecond(self) -> PandasSeries: return self._series._from_series( self._series._series.dt.millisecond, diff --git a/narwhals/expression.py b/narwhals/expression.py index b622aa2a2..3effd9f48 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -1065,7 +1065,7 @@ def minute(self) -> Expr: def second(self) -> Expr: return self._expr.__class__(lambda plx: self._expr._call(plx).dt.second()) - + def millisecond(self) -> Expr: return self._expr.__class__(lambda plx: self._expr._call(plx).dt.millisecond()) diff --git a/narwhals/series.py b/narwhals/series.py index e1f75b68b..d099e617c 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -618,7 +618,7 @@ def minute(self) -> Series: def second(self) -> Series: return self._series.__class__(self._series._series.dt.second()) - + def millisecond(self) -> Series: return self._series.__class__(self._series._series.dt.millisecond()) From 26db35dfdb545625ef6d54903f8a2d3e4d7dc787 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Sun, 19 May 2024 10:56:09 +0200 Subject: [PATCH 04/38] doc: fill in ser.is_in docstrings --- narwhals/series.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/narwhals/series.py b/narwhals/series.py index 9954f8906..0f3c1721d 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -440,6 +440,42 @@ def std(self, *, ddof: int = 1) -> Any: return self._series.std(ddof=ddof) def is_in(self, other: Any) -> Self: + """ + Check if elements of this Series are in the other Series. + + Arguments: + other: Series or sequence of primitive type. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> s_pd = pd.Series([1, 2, 3]) + >>> s_pl = pl.Series([1, 2, 3]) + + We define a library agnostic function: + + >>> def func(s_any): + ... s = nw.from_native(s_any, series_only=True) + ... s = s.is_in(pl.Series([3, 2, 8])) + ... return nw.to_native(s) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + 1 False + 2 True + 2 True + dtype: boolean + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: '' [bool] + [ + false + true + true + ] + """ return self._from_series(self._series.is_in(self._extract_native(other))) def drop_nulls(self) -> Self: From 49ac3cbc2a3fb366df2f11783df86671949cdd68 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Sun, 19 May 2024 11:01:38 +0200 Subject: [PATCH 05/38] fix typo --- narwhals/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/narwhals/series.py b/narwhals/series.py index 0f3c1721d..f42e62e5b 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -463,8 +463,8 @@ def is_in(self, other: Any) -> Self: We can then pass either pandas or Polars to `func`: >>> func(s_pd) - 1 False - 2 True + 0 False + 1 True 2 True dtype: boolean >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE From 291d410c6730ca95dc9cf5013851c1b1b9ee0741 Mon Sep 17 00:00:00 2001 From: georgescutelnicu Date: Sun, 19 May 2024 13:48:47 +0300 Subject: [PATCH 06/38] Add docstring for Series.std() --- narwhals/series.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/narwhals/series.py b/narwhals/series.py index 9954f8906..aa902879d 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -437,6 +437,34 @@ def sum(self) -> Any: return self._series.sum() def std(self, *, ddof: int = 1) -> Any: + """ + Get the standard deviation of this Series. + + Arguments: + ddof: “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> s = [1, 2, 3] + >>> s_pd = pd.Series(s) + >>> s_pl = pl.Series(s) + + We define a library agnostic function: + + >>> def func(s_any): + ... s = nw.from_native(s_any, series_only=True) + ... return s.std() + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + 1.0 + >>> func(s_pl) + 1.0 + """ return self._series.std(ddof=ddof) def is_in(self, other: Any) -> Self: From 2d95633e8ea395f1d41979d881a74449d0c9aa80 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 19 May 2024 17:33:53 +0200 Subject: [PATCH 07/38] wip --- narwhals/_pandas_like/group_by.py | 2 +- narwhals/_pandas_like/namespace.py | 20 +++++++++++++++++++- narwhals/_pandas_like/utils.py | 3 +++ narwhals/dataframe.py | 3 +++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 54c633110..b13f40971 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -150,7 +150,7 @@ def func(df: Any) -> Any: for result_keys in results_keys: out_group.append(item(result_keys._series)) out_names.append(result_keys.name) - return plx.make_native_series(name="", data=out_group, index=out_names) + return plx._make_native_series(name="", data=out_group, index=out_names) if implementation == "pandas": import pandas as pd diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index bc8ef788d..b19015e0a 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -7,6 +7,7 @@ from typing import Iterable from narwhals import dtypes +from narwhals._pandas_like import selectors from narwhals._pandas_like.dataframe import PandasDataFrame from narwhals._pandas_like.expr import PandasExpr from narwhals._pandas_like.series import PandasSeries @@ -36,7 +37,9 @@ class PandasNamespace: String = dtypes.String Datetime = dtypes.Datetime - def make_native_series(self, name: str, data: list[Any], index: Any) -> Any: + selectors = selectors + + def _make_native_series(self, name: str, data: list[Any], index: Any) -> Any: if self._implementation == "pandas": import pandas as pd @@ -55,6 +58,21 @@ def make_native_series(self, name: str, data: list[Any], index: Any) -> Any: def __init__(self, implementation: str) -> None: self._implementation = implementation + def _create_expr_from_type_selector(self, type_selector): + def func(df): + return [ + df[col] for col in df.columns if df.schema[col] in type_selector._dtypes + ] + + return PandasExpr( + func, + depth=0, + function_name="type_selector", + root_names=None, + output_names=None, + implementation=self._implementation, + ) + def _create_expr_from_callable( # noqa: PLR0913 self, func: Callable[[PandasDataFrame], list[PandasSeries]], diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 605b4de09..a0a94d662 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -102,6 +102,7 @@ def parse_into_exprs( def parse_into_expr(implementation: str, into_expr: IntoPandasExpr) -> PandasExpr: from narwhals._pandas_like.expr import PandasExpr from narwhals._pandas_like.namespace import PandasNamespace + from narwhals._pandas_like.selectors import PandasTypeSelector from narwhals._pandas_like.series import PandasSeries plx = PandasNamespace(implementation=implementation) @@ -112,6 +113,8 @@ def parse_into_expr(implementation: str, into_expr: IntoPandasExpr) -> PandasExp return plx._create_expr_from_series(into_expr) if isinstance(into_expr, str): return plx.col(into_expr) + if isinstance(into_expr, PandasTypeSelector): + return plx._create_expr_from_type_selector(into_expr) if (np := get_numpy()) is not None and isinstance(into_expr, np.ndarray): series = create_native_series(into_expr, implementation=implementation) return plx._create_expr_from_series(series) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index c5f911643..7762ad570 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -61,6 +61,7 @@ def _flatten_and_extract(self, *args: Any, **kwargs: Any) -> Any: def _extract_native(self, arg: Any) -> Any: from narwhals.expression import Expr + from narwhals.selectors import Selector from narwhals.series import Series if isinstance(arg, BaseFrame): @@ -69,6 +70,8 @@ def _extract_native(self, arg: Any) -> Any: return arg._series if isinstance(arg, Expr): return arg._call(self.__narwhals_namespace__()) + if isinstance(arg, Selector): + return arg._call(self.__narwhals_namespace__()) if get_polars() is not None and "polars" in str(type(arg)): msg = ( f"Expected Narwhals object, got: {type(arg)}.\n\n" From 46df82b40aed83311d1bfebccb50a91e42388587 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 19 May 2024 18:14:20 +0200 Subject: [PATCH 08/38] narwhals selectors.by_dtype, initial attempt --- narwhals/_pandas_like/namespace.py | 21 ++++----------------- narwhals/_pandas_like/selectors.py | 28 ++++++++++++++++++++++++++++ narwhals/_pandas_like/utils.py | 3 --- narwhals/dataframe.py | 3 --- narwhals/selectors.py | 12 ++++++++++++ 5 files changed, 44 insertions(+), 23 deletions(-) create mode 100644 narwhals/_pandas_like/selectors.py create mode 100644 narwhals/selectors.py diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index b19015e0a..06f5f03fa 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -7,9 +7,9 @@ from typing import Iterable from narwhals import dtypes -from narwhals._pandas_like import selectors from narwhals._pandas_like.dataframe import PandasDataFrame from narwhals._pandas_like.expr import PandasExpr +from narwhals._pandas_like.selectors import PandasSelector from narwhals._pandas_like.series import PandasSeries from narwhals._pandas_like.utils import horizontal_concat from narwhals._pandas_like.utils import parse_into_exprs @@ -37,7 +37,9 @@ class PandasNamespace: String = dtypes.String Datetime = dtypes.Datetime - selectors = selectors + @property + def selectors(self) -> PandasSelector: + return PandasSelector(self._implementation) def _make_native_series(self, name: str, data: list[Any], index: Any) -> Any: if self._implementation == "pandas": @@ -58,21 +60,6 @@ def _make_native_series(self, name: str, data: list[Any], index: Any) -> Any: def __init__(self, implementation: str) -> None: self._implementation = implementation - def _create_expr_from_type_selector(self, type_selector): - def func(df): - return [ - df[col] for col in df.columns if df.schema[col] in type_selector._dtypes - ] - - return PandasExpr( - func, - depth=0, - function_name="type_selector", - root_names=None, - output_names=None, - implementation=self._implementation, - ) - def _create_expr_from_callable( # noqa: PLR0913 self, func: Callable[[PandasDataFrame], list[PandasSeries]], diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py new file mode 100644 index 000000000..df0655175 --- /dev/null +++ b/narwhals/_pandas_like/selectors.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._pandas_like.expr import PandasExpr + +if TYPE_CHECKING: + from narwhals._pandas_like.dataframe import PandasDataFrame + from narwhals._pandas_like.series import PandasSeries + from narwhals.dtypes import DType + + +class PandasSelector: + def __init__(self, implementation: str) -> None: + self._implementation = implementation + + def by_dtype(self, dtypes: list[DType]) -> PandasExpr: + def func(df: PandasDataFrame) -> list[PandasSeries]: + return [df[col] for col in df.columns if df.schema[col] in dtypes] + + return PandasExpr( + func, + depth=0, + function_name="type_selector", + root_names=None, + output_names=None, + implementation=self._implementation, + ) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index a0a94d662..605b4de09 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -102,7 +102,6 @@ def parse_into_exprs( def parse_into_expr(implementation: str, into_expr: IntoPandasExpr) -> PandasExpr: from narwhals._pandas_like.expr import PandasExpr from narwhals._pandas_like.namespace import PandasNamespace - from narwhals._pandas_like.selectors import PandasTypeSelector from narwhals._pandas_like.series import PandasSeries plx = PandasNamespace(implementation=implementation) @@ -113,8 +112,6 @@ def parse_into_expr(implementation: str, into_expr: IntoPandasExpr) -> PandasExp return plx._create_expr_from_series(into_expr) if isinstance(into_expr, str): return plx.col(into_expr) - if isinstance(into_expr, PandasTypeSelector): - return plx._create_expr_from_type_selector(into_expr) if (np := get_numpy()) is not None and isinstance(into_expr, np.ndarray): series = create_native_series(into_expr, implementation=implementation) return plx._create_expr_from_series(series) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 7762ad570..c5f911643 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -61,7 +61,6 @@ def _flatten_and_extract(self, *args: Any, **kwargs: Any) -> Any: def _extract_native(self, arg: Any) -> Any: from narwhals.expression import Expr - from narwhals.selectors import Selector from narwhals.series import Series if isinstance(arg, BaseFrame): @@ -70,8 +69,6 @@ def _extract_native(self, arg: Any) -> Any: return arg._series if isinstance(arg, Expr): return arg._call(self.__narwhals_namespace__()) - if isinstance(arg, Selector): - return arg._call(self.__narwhals_namespace__()) if get_polars() is not None and "polars" in str(type(arg)): msg = ( f"Expected Narwhals object, got: {type(arg)}.\n\n" diff --git a/narwhals/selectors.py b/narwhals/selectors.py new file mode 100644 index 000000000..57a899655 --- /dev/null +++ b/narwhals/selectors.py @@ -0,0 +1,12 @@ +from narwhals.dtypes import DType +from narwhals.dtypes import translate_dtype +from narwhals.expression import Expr +from narwhals.utils import flatten + + +def by_dtype(*dtypes: DType) -> Expr: + return Expr( + lambda plx: plx.selectors.by_dtype( + [translate_dtype(plx, dtype) for dtype in flatten(dtypes)] + ) + ) From 2bd5dd0f19178acd7675734520137248496b8fe9 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 19 May 2024 18:24:26 +0200 Subject: [PATCH 09/38] docs --- narwhals/selectors.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/narwhals/selectors.py b/narwhals/selectors.py index 57a899655..7591213b9 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -5,6 +5,47 @@ def by_dtype(*dtypes: DType) -> Expr: + """ + Select columns based on their dtype. + + Arguments: + dtypes: one or data types to select + + Examples: + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> import pandas as pd + >>> import polars as pl + >>> + >>> data = {'a': [1, 2], 'b': ['x', 'y'], 'c': [4.1, 2.3]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function to select int64 and float64 + dtypes and multiplies each value by 2: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... df = df.select(ncs.by_dtype(nw.Int64, nw.Float64)*2) + ... return nw.to_native(df) + + We can then pass either pandas or Polars dataframes: + + >>> func(df_pd) + a c + 0 2 8.2 + 1 4 4.6 + >>> func(df_pl) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 2 ┆ 8.2 │ + │ 4 ┆ 4.6 │ + └─────┴─────┘ + """ return Expr( lambda plx: plx.selectors.by_dtype( [translate_dtype(plx, dtype) for dtype in flatten(dtypes)] From 15ae3666ec95e68df56c75b6de8c0d10e3d865e4 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 19 May 2024 18:28:15 +0200 Subject: [PATCH 10/38] update --- docs/api-reference/selectors.md | 10 ++++++++++ narwhals/selectors.py | 8 ++++++-- tests/selectors_test.py | 19 +++++++++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 docs/api-reference/selectors.md create mode 100644 tests/selectors_test.py diff --git a/docs/api-reference/selectors.md b/docs/api-reference/selectors.md new file mode 100644 index 000000000..57ebf9330 --- /dev/null +++ b/docs/api-reference/selectors.md @@ -0,0 +1,10 @@ +# `narwhals.selectors` + +::: narwhals.dtypes + handler: python + options: + members: + - by_dtype + show_root_heading: false + show_source: false + show_bases: false diff --git a/narwhals/selectors.py b/narwhals/selectors.py index 7591213b9..eb566af27 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -1,10 +1,14 @@ -from narwhals.dtypes import DType +from __future__ import annotations + +from typing import Any + from narwhals.dtypes import translate_dtype from narwhals.expression import Expr from narwhals.utils import flatten -def by_dtype(*dtypes: DType) -> Expr: +# todo type dtypes +def by_dtype(*dtypes: Any) -> Expr: """ Select columns based on their dtype. diff --git a/tests/selectors_test.py b/tests/selectors_test.py new file mode 100644 index 000000000..055140d16 --- /dev/null +++ b/tests/selectors_test.py @@ -0,0 +1,19 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from narwhals.selectors import by_dtype +from tests.utils import compare_dicts + +data = {"a": [1, 1, 2], "b": ["a", "b", "c"], "c": [4.0, 5.0, 6.0]} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_selecctors(constructor: Any) -> None: + df = nw.from_native(constructor(data)) + result = nw.to_native(df.select(by_dtype([nw.Int64, nw.Float64]) + 1)) + expected = {"a": [2, 2, 3], "c": [5.0, 6.0, 7.0]} + compare_dicts(result, expected) From be47b2c98b5a2705562b344889c7411a8aa6bb2f Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 19 May 2024 18:30:50 +0200 Subject: [PATCH 11/38] include selectors in api --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index ea3e5facc..3a1ccce31 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,6 +27,7 @@ nav: - api-reference/expressions_str.md - api-reference/dtypes.md - api-reference/dependencies.md + - api-reference/selectors.md theme: name: material font: false From 05193ca377fed85aa8c7b8e12020a39cdb956295 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 19 May 2024 18:33:51 +0200 Subject: [PATCH 12/38] docs --- docs/api-reference/selectors.md | 2 +- narwhals/selectors.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/api-reference/selectors.md b/docs/api-reference/selectors.md index 57ebf9330..1292f192d 100644 --- a/docs/api-reference/selectors.md +++ b/docs/api-reference/selectors.md @@ -1,6 +1,6 @@ # `narwhals.selectors` -::: narwhals.dtypes +::: narwhals.selectors handler: python options: members: diff --git a/narwhals/selectors.py b/narwhals/selectors.py index eb566af27..93d1f4383 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -7,7 +7,6 @@ from narwhals.utils import flatten -# todo type dtypes def by_dtype(*dtypes: Any) -> Expr: """ Select columns based on their dtype. From 83a012698e4e607f4c87f237e0628155aa20ad58 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 19 May 2024 19:38:51 +0200 Subject: [PATCH 13/38] add numeric selector --- narwhals/selectors.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/narwhals/selectors.py b/narwhals/selectors.py index 93d1f4383..6fb602e33 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -2,6 +2,7 @@ from typing import Any +from narwhals import dtypes from narwhals.dtypes import translate_dtype from narwhals.expression import Expr from narwhals.utils import flatten @@ -54,3 +55,18 @@ def by_dtype(*dtypes: Any) -> Expr: [translate_dtype(plx, dtype) for dtype in flatten(dtypes)] ) ) + + +def numeric() -> Expr: + return by_dtype( + dtypes.Int64, + dtypes.Int32, + dtypes.Int16, + dtypes.Int8, + dtypes.UInt64, + dtypes.UInt32, + dtypes.UInt16, + dtypes.UInt8, + dtypes.Float64, + dtypes.Float32, + ) From 3a14d6586dcf4d9ada59ebc90844f3f67372f216 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 20 May 2024 09:41:53 +0200 Subject: [PATCH 14/38] add to reference --- docs/api-reference/selectors.md | 1 + narwhals/selectors.py | 38 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/docs/api-reference/selectors.md b/docs/api-reference/selectors.md index 1292f192d..cce2d2c79 100644 --- a/docs/api-reference/selectors.md +++ b/docs/api-reference/selectors.md @@ -5,6 +5,7 @@ options: members: - by_dtype + - numeric show_root_heading: false show_source: false show_bases: false diff --git a/narwhals/selectors.py b/narwhals/selectors.py index 6fb602e33..8e170a910 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -58,6 +58,44 @@ def by_dtype(*dtypes: Any) -> Expr: def numeric() -> Expr: + """ + Select numeric columns. + + Examples: + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> import pandas as pd + >>> import polars as pl + >>> + >>> data = {'a': [1, 2], 'b': ['x', 'y'], 'c': [4.1, 2.3]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function to select numeric + dtypes and multiplies each value by 2: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... df = df.select(ncs.by_dtype(nw.Int64, nw.Float64)*2) + ... return nw.to_native(df) + + We can then pass either pandas or Polars dataframes: + + >>> func(df_pd) + a c + 0 2 8.2 + 1 4 4.6 + >>> func(df_pl) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 2 ┆ 8.2 │ + │ 4 ┆ 4.6 │ + └─────┴─────┘ + """ return by_dtype( dtypes.Int64, dtypes.Int32, From b7e8d86ab65043e51310c460e793290cba524442 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 20 May 2024 09:48:49 +0200 Subject: [PATCH 15/38] coverage --- tests/selectors_test.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/selectors_test.py b/tests/selectors_test.py index 055140d16..6d9fca876 100644 --- a/tests/selectors_test.py +++ b/tests/selectors_test.py @@ -6,6 +6,7 @@ import narwhals as nw from narwhals.selectors import by_dtype +from narwhals.selectors import numeric from tests.utils import compare_dicts data = {"a": [1, 1, 2], "b": ["a", "b", "c"], "c": [4.0, 5.0, 6.0]} @@ -17,3 +18,11 @@ def test_selecctors(constructor: Any) -> None: result = nw.to_native(df.select(by_dtype([nw.Int64, nw.Float64]) + 1)) expected = {"a": [2, 2, 3], "c": [5.0, 6.0, 7.0]} compare_dicts(result, expected) + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_numeric(constructor: Any) -> None: + df = nw.from_native(constructor(data)) + result = nw.to_native(df.select(numeric() + 1)) + expected = {"a": [2, 2, 3], "c": [5.0, 6.0, 7.0]} + compare_dicts(result, expected) From f18c533a0c2ff870ad489a6f93bca8bef2a98eda Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 20 May 2024 16:17:44 +0200 Subject: [PATCH 16/38] feat: Series descriptive methods (#187) * feat: series descriptive * add Expr * value_counts, is_sorted, and expr docstring * pandas value count fix? * un sorted test * explicit name is None check --- docs/api-reference/expressions.md | 5 + docs/api-reference/series.md | 8 + narwhals/_pandas_like/expr.py | 15 ++ narwhals/_pandas_like/series.py | 44 ++++ narwhals/expression.py | 208 ++++++++++++++++++ narwhals/series.py | 303 +++++++++++++++++++++++++++ tests/expr/is_duplicated_test.py | 24 +++ tests/expr/is_first_distinct_test.py | 24 +++ tests/expr/is_last_distinct_test.py | 24 +++ tests/expr/is_unique_test.py | 24 +++ tests/expr/null_count_test.py | 24 +++ tests/test_series.py | 83 ++++++++ utils/check_api_reference.py | 15 +- 13 files changed, 800 insertions(+), 1 deletion(-) create mode 100644 tests/expr/is_duplicated_test.py create mode 100644 tests/expr/is_first_distinct_test.py create mode 100644 tests/expr/is_last_distinct_test.py create mode 100644 tests/expr/is_unique_test.py create mode 100644 tests/expr/null_count_test.py diff --git a/docs/api-reference/expressions.md b/docs/api-reference/expressions.md index aa4ab1e61..a889d5b71 100644 --- a/docs/api-reference/expressions.md +++ b/docs/api-reference/expressions.md @@ -14,11 +14,16 @@ - fill_null - filter - is_between + - is_duplicated + - is_first_distinct - is_in + - is_last_distinct - is_null + - is_unique - max - mean - min + - null_count - n_unique - over - unique diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index e87a30f7c..d65262a5b 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -15,12 +15,19 @@ - fill_null - filter - is_between + - is_duplicated + - is_empty + - is_first_distinct - is_in + - is_last_distinct - is_null + - is_sorted + - is_unique - max - mean - min - name + - null_count - n_unique - sample - shape @@ -32,5 +39,6 @@ - to_numpy - to_pandas - unique + - value_counts show_source: false show_bases: false diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index c3273e1b6..f90208248 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -253,6 +253,21 @@ def func(df: PandasDataFrame) -> list[PandasSeries]: implementation=self._implementation, ) + def is_duplicated(self) -> Self: + return register_expression_call(self, "is_duplicated") + + def is_unique(self) -> Self: + return register_expression_call(self, "is_unique") + + def null_count(self) -> Self: + return register_expression_call(self, "null_count") + + def is_first_distinct(self) -> Self: + return register_expression_call(self, "is_first_distinct") + + def is_last_distinct(self) -> Self: + return register_expression_call(self, "is_last_distinct") + @property def str(self) -> PandasExprStringNamespace: return PandasExprStringNamespace(self) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index a36852afb..819492bfe 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -439,6 +439,50 @@ def to_pandas(self) -> Any: msg = f"Unknown implementation: {self._implementation}" # pragma: no cover raise AssertionError(msg) + # --- descriptive --- + def is_duplicated(self: Self) -> Self: + return self._from_series(self._series.duplicated(keep=False)) + + def is_empty(self: Self) -> bool: + return self._series.empty # type: ignore[no-any-return] + + def is_unique(self: Self) -> Self: + return self._from_series(~self._series.duplicated(keep=False)) + + def null_count(self: Self) -> int: + return self._series.isnull().sum() # type: ignore[no-any-return] + + def is_first_distinct(self: Self) -> Self: + return self._from_series(~self._series.duplicated(keep="first")) + + def is_last_distinct(self: Self) -> Self: + return self._from_series(~self._series.duplicated(keep="last")) + + def is_sorted(self: Self, *, descending: bool = False) -> bool: + if not isinstance(descending, bool): + msg = f"argument 'descending' should be boolean, found {type(descending)}" + raise TypeError(msg) + + if descending: + return self._series.is_monotonic_decreasing # type: ignore[no-any-return] + else: + return self._series.is_monotonic_increasing # type: ignore[no-any-return] + + def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> Any: + """Parallel is unused, exists for compatibility""" + from narwhals._pandas_like.dataframe import PandasDataFrame + + name_ = "index" if self._series.name is None else self._series.name + val_count = self._series.value_counts(dropna=False, sort=False).reset_index() + val_count.columns = [name_, "count"] + if sort: + val_count = val_count.sort_values(name_) + + return PandasDataFrame( + val_count, + implementation=self._implementation, + ) + @property def str(self) -> PandasSeriesStringNamespace: return PandasSeriesStringNamespace(self) diff --git a/narwhals/expression.py b/narwhals/expression.py index ad99e0aee..6d11cd5e4 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -1122,6 +1122,214 @@ def over(self, *keys: str | Iterable[str]) -> Expr: """ return self.__class__(lambda plx: self._call(plx).over(flatten(keys))) + def is_duplicated(self) -> Expr: + r""" + Return a boolean mask indicating duplicated values. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... duplicated = df.select(nw.all().is_duplicated()) + ... return nw.to_native(duplicated) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + a b + 0 True True + 1 False True + 2 False False + 3 True False + >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4, 2) + ┌───────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ true ┆ true │ + │ false ┆ true │ + │ false ┆ false │ + │ true ┆ false │ + └───────┴───────┘ + """ + return self.__class__(lambda plx: self._call(plx).is_duplicated()) + + def is_unique(self) -> Expr: + r""" + Return a boolean mask indicating unique values. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... unique = df.select(nw.all().is_unique()) + ... return nw.to_native(unique) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + a b + 0 False False + 1 True False + 2 True True + 3 False True + >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4, 2) + ┌───────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ true ┆ false │ + │ true ┆ true │ + │ false ┆ true │ + └───────┴───────┘ + """ + + return self.__class__(lambda plx: self._call(plx).is_unique()) + + def null_count(self) -> Expr: + r""" + Count null values. + + Notes: + pandas and Polars handle null values differently. Polars distinguishes + between NaN and Null, whereas pandas doesn't. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2, None, 1], "b": ["a", None, "b", None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... nulls = df.select(nw.all().null_count()) + ... return nw.to_native(nulls) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + a b + 0 1 2 + >>> func(df_pl) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + """ + return self.__class__(lambda plx: self._call(plx).null_count()) + + def is_first_distinct(self) -> Expr: + r""" + Return a boolean mask indicating the first occurrence of each distinct value. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... first_distinct = df.select(nw.all().is_first_distinct()) + ... return nw.to_native(first_distinct) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + a b + 0 True True + 1 True False + 2 True True + 3 False True + >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4, 2) + ┌───────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + │ true ┆ true │ + │ false ┆ true │ + └───────┴───────┘ + """ + return self.__class__(lambda plx: self._call(plx).is_first_distinct()) + + def is_last_distinct(self) -> Expr: + r"""Return a boolean mask indicating the last occurrence of each distinct value. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... last_distinct = df.select(nw.all().is_last_distinct()) + ... return nw.to_native(last_distinct) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + a b + 0 False False + 1 True True + 2 True True + 3 True True + >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4, 2) + ┌───────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ true ┆ true │ + │ true ┆ true │ + │ true ┆ true │ + └───────┴───────┘ + """ + return self.__class__(lambda plx: self._call(plx).is_last_distinct()) + @property def str(self) -> ExprStringNamespace: return ExprStringNamespace(self) diff --git a/narwhals/series.py b/narwhals/series.py index aa902879d..7a8c98094 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1015,6 +1015,309 @@ def __invert__(self) -> Series: def filter(self, other: Any) -> Series: return self._from_series(self._series.filter(self._extract_native(other))) + # --- descriptive --- + def is_duplicated(self: Self) -> Series: + r""" + Get a mask of all duplicated rows in the Series. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, 2, 3, 1]) + >>> s_pl = pl.Series([1, 2, 3, 1]) + + Let's define a dataframe-agnostic function: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... duplicated = series.is_duplicated() + ... return nw.to_native(duplicated) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + 0 True + 1 False + 2 False + 3 True + dtype: bool + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4,) + Series: '' [bool] + [ + true + false + false + true + ] + """ + return Series(self._series.is_duplicated()) + + def is_empty(self: Self) -> bool: + r""" + Check if the series is empty. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + + Let's define a dataframe-agnostic function that filters rows in which "foo" + values are greater than 10, and then checks if the result is empty or not: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... return series.filter(series > 10).is_empty() + + We can then pass either pandas or Polars to `func`: + + >>> s_pd = pd.Series([1, 2, 3]) + >>> s_pl = pl.Series([1, 2, 3]) + >>> func(s_pd), func(s_pl) + (True, True) + + >>> s_pd = pd.Series([100, 2, 3]) + >>> s_pl = pl.Series([100, 2, 3]) + >>> func(s_pd), func(s_pl) + (False, False) + """ + return self._series.is_empty() # type: ignore[no-any-return] + + def is_unique(self: Self) -> Series: + r""" + Get a mask of all unique rows in the Series. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, 2, 3, 1]) + >>> s_pl = pl.Series([1, 2, 3, 1]) + + Let's define a dataframe-agnostic function: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... unique = series.is_unique() + ... return nw.to_native(unique) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + 0 False + 1 True + 2 True + 3 False + dtype: bool + + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4,) + Series: '' [bool] + [ + false + true + true + false + ] + """ + return Series(self._series.is_unique()) + + def null_count(self: Self) -> int: + r""" + Create a new Series that shows the null counts per column. + + Notes: + pandas and Polars handle null values differently. Polars distinguishes + between NaN and Null, whereas pandas doesn't. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, None, 3]) + >>> s_pl = pl.Series([1, None, None]) + + Let's define a dataframe-agnostic function that returns the null count of + the series: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... return series.null_count() + + We can then pass either pandas or Polars to `func`: + >>> func(s_pd) + 1 + >>> func(s_pl) + 2 + """ + + return self._series.null_count() # type: ignore[no-any-return] + + def is_first_distinct(self: Self) -> Series: + r""" + Return a boolean mask indicating the first occurrence of each distinct value. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, 1, 2, 3, 2]) + >>> s_pl = pl.Series([1, 1, 2, 3, 2]) + + Let's define a dataframe-agnostic function: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... first_distinct = series.is_first_distinct() + ... return nw.to_native(first_distinct) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + 0 True + 1 False + 2 True + 3 True + 4 False + dtype: bool + + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + """ + return Series(self._series.is_first_distinct()) + + def is_last_distinct(self: Self) -> Series: + r""" + Return a boolean mask indicating the last occurrence of each distinct value. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, 1, 2, 3, 2]) + >>> s_pl = pl.Series([1, 1, 2, 3, 2]) + + Let's define a dataframe-agnostic function: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... last_distinct = series.is_last_distinct() + ... return nw.to_native(last_distinct) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + 0 False + 1 True + 2 False + 3 True + 4 True + dtype: bool + + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + """ + return Series(self._series.is_last_distinct()) + + def is_sorted(self: Self, *, descending: bool = False) -> bool: + r""" + Check if the Series is sorted. + + Arguments: + descending: Check if the Series is sorted in descending order. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> unsorted_data = [1, 3, 2] + >>> sorted_data = [3, 2, 1] + + Let's define a dataframe-agnostic function: + + >>> def func(s_any, descending=False): + ... series = nw.from_native(s_any, allow_series=True) + ... return series.is_sorted(descending=descending) + + We can then pass either pandas or Polars to `func`: + + >>> func(pl.Series(unsorted_data)) + False + >>> func(pl.Series(sorted_data), descending=True) + True + >>> func(pd.Series(unsorted_data)) + False + >>> func(pd.Series(sorted_data), descending=True) + True + """ + return self._series.is_sorted(descending=descending) # type: ignore[no-any-return] + + def value_counts( + self: Self, *, sort: bool = False, parallel: bool = False + ) -> DataFrame: + r""" + Count the occurrences of unique values. + + Arguments: + sort: Sort the output by count in descending order. If set to False (default), + the order of the output is random. + parallel: Execute the computation in parallel. Unused for pandas-like APIs. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> s_pd = pd.Series([1, 1, 2, 3, 2], name="s") + >>> s_pl = pl.Series(values=[1, 1, 2, 3, 2], name="s") + + Let's define a dataframe-agnostic function: + + >>> def func(s_any): + ... series = nw.from_native(s_any, allow_series=True) + ... val_count = series.value_counts(sort=True) + ... return nw.to_native(val_count) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest: +NORMALIZE_WHITESPACE + s count + 0 1 2 + 1 2 2 + 2 3 1 + + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3, 2) + ┌─────┬───────┐ + │ s ┆ count │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪═══════╡ + │ 1 ┆ 2 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴───────┘ + """ + from narwhals.dataframe import DataFrame + + return DataFrame(self._series.value_counts(sort=sort, parallel=parallel)) + @property def str(self) -> SeriesStringNamespace: return SeriesStringNamespace(self) diff --git a/tests/expr/is_duplicated_test.py b/tests/expr/is_duplicated_test.py new file mode 100644 index 000000000..52e18f08a --- /dev/null +++ b/tests/expr/is_duplicated_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2], + "b": [1, 2, 3], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_duplicated(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_duplicated()) + expected = { + "a": [True, True, False], + "b": [False, False, False], + } + compare_dicts(result, expected) diff --git a/tests/expr/is_first_distinct_test.py b/tests/expr/is_first_distinct_test.py new file mode 100644 index 000000000..22208c402 --- /dev/null +++ b/tests/expr/is_first_distinct_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2, 3, 2], + "b": [1, 2, 3, 2, 1], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_first_distinct(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_first_distinct()) + expected = { + "a": [True, False, True, True, False], + "b": [True, True, True, False, False], + } + compare_dicts(result, expected) diff --git a/tests/expr/is_last_distinct_test.py b/tests/expr/is_last_distinct_test.py new file mode 100644 index 000000000..984e2ee00 --- /dev/null +++ b/tests/expr/is_last_distinct_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2, 3, 2], + "b": [1, 2, 3, 2, 1], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_last_distinct(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_last_distinct()) + expected = { + "a": [False, True, False, True, True], + "b": [False, False, True, True, True], + } + compare_dicts(result, expected) diff --git a/tests/expr/is_unique_test.py b/tests/expr/is_unique_test.py new file mode 100644 index 000000000..7ba842add --- /dev/null +++ b/tests/expr/is_unique_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1, 1, 2], + "b": [1, 2, 3], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_is_unique(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().is_unique()) + expected = { + "a": [False, False, True], + "b": [True, True, True], + } + compare_dicts(result, expected) diff --git a/tests/expr/null_count_test.py b/tests/expr/null_count_test.py new file mode 100644 index 000000000..68b615585 --- /dev/null +++ b/tests/expr/null_count_test.py @@ -0,0 +1,24 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "a": [1.0, None, None, 3.0], + "b": [1.0, None, 4, 5.0], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_null_count(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.all().null_count()) + expected = { + "a": [2], + "b": [1], + } + compare_dicts(result, expected) diff --git a/tests/test_series.py b/tests/test_series.py index d6a33dd01..bcc702c4f 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -321,3 +321,86 @@ def test_to_numpy() -> None: result = nw.Series(s).__array__() assert result.dtype == "float64" assert nw.Series(s).shape == (3,) + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_duplicated(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_duplicated() + expected = np.array([True, True, False]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +@pytest.mark.parametrize(("threshold", "expected"), [(0, False), (10, True)]) +def test_is_empty(df_raw: Any, threshold: Any, expected: Any) -> None: + series = nw.Series(df_raw["a"]) + result = series.filter(series > threshold).is_empty() + assert result == expected + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_unique(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_unique() + expected = np.array([False, False, True]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("s_raw", [pd.Series([1, 2, None]), pl.Series([1, 2, None])]) +def test_null_count(s_raw: Any) -> None: + series = nw.Series(s_raw) + result = series.null_count() + assert result == 1 + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_first_distinct(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_first_distinct() + expected = np.array([True, False, True]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_last_distinct(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + result = series.is_last_distinct() + expected = np.array([False, True, True]) + assert (result.to_numpy() == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_value_counts(df_raw: Any) -> None: + series = nw.Series(df_raw["b"]) + sorted_result = series.value_counts(sort=True) + assert sorted_result.columns == ["b", "count"] + + expected = np.array([[4, 2], [6, 1]]) + assert (sorted_result.to_numpy() == expected).all() + + unsorted_result = series.value_counts(sort=False) + assert unsorted_result.columns == ["b", "count"] + + a = unsorted_result.to_numpy() + + assert (a[a[:, 0].argsort()] == expected).all() + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +@pytest.mark.parametrize( + ("col", "descending", "expected"), + [("a", False, False), ("z", False, True), ("z", True, False)], +) +def test_is_sorted(df_raw: Any, col: str, descending: bool, expected: bool) -> None: # noqa: FBT001 + series = nw.Series(df_raw[col]) + result = series.is_sorted(descending=descending) + assert result == expected + + +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +def test_is_sorted_invalid(df_raw: Any) -> None: + series = nw.Series(df_raw["z"]) + + with pytest.raises(TypeError): + series.is_sorted(descending="invalid_type") # type: ignore[arg-type] diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 9eac56dea..e1f74ac2b 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -126,6 +126,7 @@ series = [ i for i in nw.Series(pl.Series()).__dir__() if not i[0].isupper() and i[0] != "_" ] + if missing := set(expr).difference(series).difference({"over"}): print("In expr but not in series") # noqa: T201 print(missing) # noqa: T201 @@ -133,7 +134,19 @@ if ( extra := set(series) .difference(expr) - .difference({"to_pandas", "to_numpy", "dtype", "name", "shape", "to_frame"}) + .difference( + { + "to_pandas", + "to_numpy", + "dtype", + "name", + "shape", + "to_frame", + "is_empty", + "is_sorted", + "value_counts", + } + ) ): print("in series but not in expr") # noqa: T201 print(extra) # noqa: T201 From 8a368dd1114a76f3de6aa15942d0e04244661215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Mon, 20 May 2024 17:16:04 +0200 Subject: [PATCH 17/38] error message imrovement - is_in --- narwhals/expression.py | 7 ++++++- tests/test_series.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/narwhals/expression.py b/narwhals/expression.py index ad99e0aee..8c5a66cac 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -844,7 +844,12 @@ def is_between( ) def is_in(self, other: Any) -> Expr: - return self.__class__(lambda plx: self._call(plx).is_in(other)) + if isinstance(other, Iterable) and not isinstance(other, (str, bytes)): + return self.__class__(lambda plx: self._call(plx).is_in(other)) + else: + raise NotImplementedError( + "Narwhals `is_in` doesn't accept expressions as an argument, as opposed to Polars. You should provide an iterable instead." + ) def filter(self, other: Any) -> Expr: return self.__class__( diff --git a/tests/test_series.py b/tests/test_series.py index d6a33dd01..0019742e6 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -59,6 +59,18 @@ def test_is_in(df_raw: Any) -> None: assert result[2] +@pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +@pytest.mark.filterwarnings("ignore:np.find_common_type is deprecated:DeprecationWarning") +def test_is_in_other(df_raw: Any) -> None: + with pytest.raises( + NotImplementedError, + match=( + "Narwhals `is_in` doesn't accept expressions as an argument, as opposed to Polars. You should provide an iterable instead." + ), + ): + nw.from_native(df_raw).with_columns(contains=nw.col("c").is_in("sets")) + + @pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) @pytest.mark.filterwarnings("ignore:np.find_common_type is deprecated:DeprecationWarning") def test_filter(df_raw: Any) -> None: From 3f5deeef7f2233521499007ac1709adc6cacbd45 Mon Sep 17 00:00:00 2001 From: Nwabueze Ugoh <126014542+brentomagic@users.noreply.github.com> Date: Mon, 20 May 2024 21:03:54 +0100 Subject: [PATCH 18/38] added series.filter docstring --- narwhals/series.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/narwhals/series.py b/narwhals/series.py index aa902879d..5d28bf899 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1013,6 +1013,40 @@ def __invert__(self) -> Series: return self._from_series(self._series.__invert__()) def filter(self, other: Any) -> Series: + """ + Filter elements in the Series based on a condition. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> s = [4, 10, 15, 34, 50] + >>> s_pd = pd.Series(s) + >>> s_pl = pl.Series(s) + + We define a library agnostic function: + + >>> def func(s_any): + ... s = nw.from_native(s_any, series_only=True) + ... s = s.filter(s > 10) + ... return nw.to_native(s) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + 2 15 + 3 34 + 4 50 + dtype: int64 + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: '' [i64] + [ + 15 + 34 + 50 + ] + """ return self._from_series(self._series.filter(self._extract_native(other))) @property From a2dfd7103fb09552bc347c9c3d8159b250d8df77 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 21 May 2024 08:48:33 +0200 Subject: [PATCH 19/38] correct docstring --- narwhals/series.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/narwhals/series.py b/narwhals/series.py index f42e62e5b..9e12bcae1 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -441,10 +441,10 @@ def std(self, *, ddof: int = 1) -> Any: def is_in(self, other: Any) -> Self: """ - Check if elements of this Series are in the other Series. + Check if elements of this sequence are present in the other Series. Arguments: - other: Series or sequence of primitive type. + other: Sequence of primitive type. Examples: >>> import pandas as pd @@ -457,7 +457,7 @@ def is_in(self, other: Any) -> Self: >>> def func(s_any): ... s = nw.from_native(s_any, series_only=True) - ... s = s.is_in(pl.Series([3, 2, 8])) + ... s = s.is_in([3, 2, 8]) ... return nw.to_native(s) We can then pass either pandas or Polars to `func`: From 000db4c094925dbfd8f4de7e8d55ce6e2859db56 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 21 May 2024 10:15:58 +0200 Subject: [PATCH 20/38] doc: fill in ser.sample docstrings --- narwhals/series.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/narwhals/series.py b/narwhals/series.py index 7a8c98094..ec6d8e047 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -697,6 +697,49 @@ def sample( *, with_replacement: bool = False, ) -> Self: + """ + Sample randomly from this Series. + + Arguments: + n: Number of items to return. Cannot be used with fraction. + + fraction: Fraction of items to return. Cannot be used with n. + + with_replacement: Allow values to be sampled more than once. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + + >>> s_pd = pd.Series([1, 2, 3, 4]) + >>> s_pl = pl.Series([1, 2, 3, 4]) + + We define a library agnostic function: + + >>> def func(s_any): + ... nw.from_native(s_any, series_only=True) + ... s = s.sample(fraction=1.0, with_replacement=True) + ... return nw.to_native(s) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) # doctest:+SKIP + a + 2 3 + 1 2 + 3 4 + 3 4 + >>> func(s_pl) # doctest:+SKIP + shape: (4,) + Series: '' [i64] + [ + 1 + 4 + 3 + 4 + ] + """ return self._from_series( self._series.sample(n=n, fraction=fraction, with_replacement=with_replacement) ) From 4d8e494630cc94486eb8e63f8d955cd02ba6aa94 Mon Sep 17 00:00:00 2001 From: Magdalena Anopsy <74981211+anopsy@users.noreply.github.com> Date: Tue, 21 May 2024 11:47:37 +0000 Subject: [PATCH 21/38] docstring is_in in Expr added --- narwhals/expression.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/narwhals/expression.py b/narwhals/expression.py index 2cf4afdd1..8ffb8f580 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -636,6 +636,48 @@ def is_between( ) def is_in(self, other: Any) -> Expr: + """ + Check if elements of this expression are present in the other list. + + Arguments: + other: List-like objects. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> df_pd = pd.DataFrame({'a': [1, 2, 9, 10]}) + >>> df_pl = pl.DataFrame({'a': [1, 2, 9, 10]}) + + Let's define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... df = df.with_columns(b = nw.col('a').is_in([1, 2])) + ... return nw.to_native(df) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + a b + 0 1 True + 1 2 True + 2 9 False + 3 10 False + + >>> func(df_pl) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 9 ┆ false │ + │ 10 ┆ false │ + └─────┴───────┘ + """ return self.__class__(lambda plx: self._call(plx).is_in(other)) def filter(self, other: Any) -> Expr: From 53810051a95c576825ac66530b72cdf87695d73d Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 21 May 2024 15:48:58 +0200 Subject: [PATCH 22/38] correct the summary --- narwhals/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/series.py b/narwhals/series.py index 9e12bcae1..a51089cf2 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -441,7 +441,7 @@ def std(self, *, ddof: int = 1) -> Any: def is_in(self, other: Any) -> Self: """ - Check if elements of this sequence are present in the other Series. + Check if the elements of this Series are in the other sequence. Arguments: other: Sequence of primitive type. From 3e6059a60ec23c40025d15e9b49ec8eec57e6854 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 21 May 2024 22:17:03 +0200 Subject: [PATCH 23/38] Bump version to 0.8.16 --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index f98931c97..00f40751a 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals -'0.8.15' +'0.8.16' ``` then installation worked correctly! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index bfdc65873..d78e01a5f 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -31,7 +31,7 @@ from narwhals.utils import maybe_align_index from narwhals.utils import maybe_set_index -__version__ = "0.8.15" +__version__ = "0.8.16" __all__ = [ "concat", diff --git a/pyproject.toml b/pyproject.toml index 9781c3a6d..a61661744 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "0.8.15" +version = "0.8.16" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From ef5a3e4ba797c0e2d2d8ddbdec853a6eab0b3012 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 21 May 2024 23:27:19 +0200 Subject: [PATCH 24/38] fixup filter --- narwhals/_pandas_like/expr.py | 8 ++++++-- narwhals/expression.py | 6 ++++-- tests/expr/filter_test.py | 27 +++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 4 deletions(-) create mode 100644 tests/expr/filter_test.py diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index f90208248..1dc4f3064 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -184,8 +184,12 @@ def fill_null(self, value: Any) -> Self: def is_in(self, other: Any) -> Self: return register_expression_call(self, "is_in", other) - def filter(self, other: Any) -> Self: - return register_expression_call(self, "filter", other) + def filter(self, *predicates: Any) -> Self: + from narwhals._pandas_like.namespace import PandasNamespace + + plx = PandasNamespace(self._implementation) + expr = plx.all_horizontal(*predicates) + return register_expression_call(self, "filter", expr) def drop_nulls(self) -> Self: return register_expression_call(self, "drop_nulls") diff --git a/narwhals/expression.py b/narwhals/expression.py index fbddc1e3e..674b59811 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -851,9 +851,11 @@ def is_in(self, other: Any) -> Expr: "Narwhals `is_in` doesn't accept expressions as an argument, as opposed to Polars. You should provide an iterable instead." ) - def filter(self, other: Any) -> Expr: + def filter(self, *predicates: Any) -> Expr: return self.__class__( - lambda plx: self._call(plx).filter(extract_native(plx, other)) + lambda plx: self._call(plx).filter( + *[extract_native(plx, pred) for pred in flatten(predicates)] + ) ) def is_null(self) -> Expr: diff --git a/tests/expr/filter_test.py b/tests/expr/filter_test.py new file mode 100644 index 000000000..ea15a94f1 --- /dev/null +++ b/tests/expr/filter_test.py @@ -0,0 +1,27 @@ +from typing import Any + +import pandas as pd +import polars as pl +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + +data = { + "i": [0, 1, 2, 3, 4], + "a": [0, 1, 2, 3, 4], + "b": [1, 2, 3, 5, 3], + "c": [5, 4, 3, 2, 1], +} + + +@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame]) +def test_filter(constructor: Any) -> None: + df = nw.from_native(constructor(data), eager_only=True) + result = df.select(nw.col("a").filter(nw.col("i") < 2, nw.col("c") == 5)) + expected = { + "a": [0], + } + compare_dicts(result, expected) + result = df.select(df["a"].filter((df["i"] < 2) & (df["c"] == 5))) + compare_dicts(result, expected) From c458ab8fc8b3dab2125299f14c80864e84ad138e Mon Sep 17 00:00:00 2001 From: ugohuche Date: Wed, 22 May 2024 00:17:35 +0100 Subject: [PATCH 25/38] Added millisecond and microsecond temporal methods --- narwhals/_pandas_like/expr.py | 3 + narwhals/_pandas_like/series.py | 7 +- narwhals/expression.py | 103 ++++++++++++++++++++++++ narwhals/series.py | 134 ++++++++++++++++++++++++++------ tests/test_dt.py | 6 +- 5 files changed, 226 insertions(+), 27 deletions(-) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 04fbfb42f..02e473836 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -330,6 +330,9 @@ def second(self) -> PandasExpr: def millisecond(self) -> PandasExpr: return register_namespace_expression_call(self._expr, "dt", "millisecond") + + def microsecond(self) -> PandasExpr: + return register_namespace_expression_call(self._expr, "dt", "microsecond") def ordinal_day(self) -> PandasExpr: return register_namespace_expression_call(self._expr, "dt", "ordinal_day") diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index f96c3d38a..513ef98b8 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -548,7 +548,12 @@ def second(self) -> PandasSeries: def millisecond(self) -> PandasSeries: return self._series._from_series( - self._series._series.dt.millisecond, + self._series._series.dt.microsecond // 1000, + ) + + def microsecond(self) -> PandasSeries: + return self._series._from_series( + self._series._series.dt.microsecond ) def ordinal_day(self) -> PandasSeries: diff --git a/narwhals/expression.py b/narwhals/expression.py index 96ceca030..0d240f2be 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -1765,7 +1765,110 @@ def second(self) -> Expr: return self._expr.__class__(lambda plx: self._expr._call(plx).dt.second()) def millisecond(self) -> Expr: + """ + Extract milliseconds from underlying DateTime representation. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> from datetime import datetime + >>> import narwhals as nw + >>> data = { + ... "datetime": [ + ... datetime(1978, 1, 1, 1, 1, 1, 0), + ... datetime(2024, 10, 13, 5, 30, 14, 505000), + ... datetime(2065, 1, 1, 10, 20, 30, 67000), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + We define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... df = df.with_columns( + ... nw.col("datetime").dt.hour().alias("hour"), + ... nw.col("datetime").dt.minute().alias("minute"), + ... nw.col("datetime").dt.second().alias("second"), + ... nw.col("datetime").dt.millisecond().alias("millisecond") + ... ) + ... return nw.to_native(df) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + datetime hour minute second millisecond + 0 1978-01-01 01:01:01.000 1 1 1 0 + 1 2024-10-13 05:30:14.505 5 30 14 505 + 2 2065-01-01 10:20:30.067 10 20 30 67 + >>> func(df_pl) + shape: (3, 5) + ┌─────────────────────────┬──────┬────────┬────────┬─────────────┐ + │ datetime ┆ hour ┆ minute ┆ second ┆ millisecond │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i8 ┆ i8 ┆ i8 ┆ i32 │ + ╞═════════════════════════╪══════╪════════╪════════╪═════════════╡ + │ 1978-01-01 01:01:01 ┆ 1 ┆ 1 ┆ 1 ┆ 0 │ + │ 2024-10-13 05:30:14.505 ┆ 5 ┆ 30 ┆ 14 ┆ 505 │ + │ 2065-01-01 10:20:30.067 ┆ 10 ┆ 20 ┆ 30 ┆ 67 │ + └─────────────────────────┴──────┴────────┴────────┴─────────────┘ + """ return self._expr.__class__(lambda plx: self._expr._call(plx).dt.millisecond()) + + def microsecond(self) -> Expr: + """ + Extract microseconds from underlying DateTime representation. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> from datetime import datetime + >>> import narwhals as nw + >>> data = { + ... "datetime": [ + ... datetime(1978, 1, 1, 1, 1, 1, 0), + ... datetime(2024, 10, 13, 5, 30, 14, 505000), + ... datetime(2065, 1, 1, 10, 20, 30, 67000), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + We define a dataframe-agnostic function: + + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... df = df.with_columns( + ... nw.col("datetime").dt.hour().alias("hour"), + ... nw.col("datetime").dt.minute().alias("minute"), + ... nw.col("datetime").dt.second().alias("second"), + ... nw.col("datetime").dt.microsecond().alias("microsecond") + ... ) + ... return nw.to_native(df) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + datetime hour minute second microsecond + 0 1978-01-01 01:01:01.000 1 1 1 0 + 1 2024-10-13 05:30:14.505 5 30 14 505000 + 2 2065-01-01 10:20:30.067 10 20 30 67000 + >>> func(df_pl) + shape: (3, 5) + ┌─────────────────────────┬──────┬────────┬────────┬─────────────┐ + │ datetime ┆ hour ┆ minute ┆ second ┆ microsecond │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i8 ┆ i8 ┆ i8 ┆ i32 │ + ╞═════════════════════════╪══════╪════════╪════════╪═════════════╡ + │ 1978-01-01 01:01:01 ┆ 1 ┆ 1 ┆ 1 ┆ 0 │ + │ 2024-10-13 05:30:14.505 ┆ 5 ┆ 30 ┆ 14 ┆ 505000 │ + │ 2065-01-01 10:20:30.067 ┆ 10 ┆ 20 ┆ 30 ┆ 67000 │ + └─────────────────────────┴──────┴────────┴────────┴─────────────┘ + """ + return self._expr.__class__(lambda plx: self._expr._call(plx).dt.microsecond()) + + def ordinal_day(self) -> Expr: """ diff --git a/narwhals/series.py b/narwhals/series.py index 7c52e4686..2b9e29610 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1419,16 +1419,16 @@ def __init__(self, series: Series) -> None: def year(self) -> Series: """ - Get the year in a date series. + Get the year in a datetime series. Examples: >>> import pandas as pd >>> import polars as pl >>> from datetime import datetime >>> import narwhals as nw - >>> data = [datetime(2012, 1, 7), datetime(2023, 3, 10)] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) + >>> dates = [datetime(2012, 1, 7), datetime(2023, 3, 10)] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) We define a library agnostic function: @@ -1455,16 +1455,16 @@ def year(self) -> Series: def month(self) -> Series: """ - Gets the month in a date series. + Gets the month in a datetime series. Examples: >>> import pandas as pd >>> import polars as pl >>> from datetime import datetime >>> import narwhals as nw - >>> data = [datetime(2023, 2, 1), datetime(2023, 8, 3)] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) + >>> dates = [datetime(2023, 2, 1), datetime(2023, 8, 3)] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) We define a library agnostic function: @@ -1491,16 +1491,16 @@ def month(self) -> Series: def day(self) -> Series: """ - Extracts the day in a date series. + Extracts the day in a datetime series. Examples: >>> import pandas as pd >>> import polars as pl >>> from datetime import datetime >>> import narwhals as nw - >>> data = [datetime(2022, 1, 1), datetime(2022, 1, 5)] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) + >>> dates = [datetime(2022, 1, 1), datetime(2022, 1, 5)] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) We define a library agnostic function: @@ -1527,16 +1527,16 @@ def day(self) -> Series: def hour(self) -> Series: """ - Extracts the hour in a date series. + Extracts the hour in a datetime series. Examples: >>> import pandas as pd >>> import polars as pl >>> from datetime import datetime >>> import narwhals as nw - >>> data = [datetime(2022, 1, 1, 5, 3), datetime(2022, 1, 5, 9, 12)] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) + >>> dates = [datetime(2022, 1, 1, 5, 3), datetime(2022, 1, 5, 9, 12)] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) We define a library agnostic function: @@ -1563,16 +1563,16 @@ def hour(self) -> Series: def minute(self) -> Series: """ - Extracts the minute in a date series. + Extracts the minute in a datetime series. Examples: >>> import pandas as pd >>> import polars as pl >>> from datetime import datetime >>> import narwhals as nw - >>> data = [datetime(2022, 1, 1, 5, 3), datetime(2022, 1, 5, 9, 12)] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) + >>> dates = [datetime(2022, 1, 1, 5, 3), datetime(2022, 1, 5, 9, 12)] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) We define a library agnostic function: @@ -1599,16 +1599,16 @@ def minute(self) -> Series: def second(self) -> Series: """ - Extracts the second(s) in a date series. + Extracts the second(s) in a datetime series. Examples: >>> import pandas as pd >>> import polars as pl >>> from datetime import datetime >>> import narwhals as nw - >>> data = [datetime(2022, 1, 1, 5, 3, 10), datetime(2022, 1, 5, 9, 12, 4)] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) + >>> dates = [datetime(2022, 1, 1, 5, 3, 10), datetime(2022, 1, 5, 9, 12, 4)] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) We define a library agnostic function: @@ -1634,7 +1634,93 @@ def second(self) -> Series: return self._series.__class__(self._series._series.dt.second()) def millisecond(self) -> Series: + """ + Extracts the milliseconds in a datetime series. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> from datetime import datetime + >>> import narwhals as nw + >>> dates = [ + ... datetime(2023, 5, 21, 12, 55, 10, 400000), + ... datetime(2023, 5, 21, 12, 55, 10, 600000), + ... datetime(2023, 5, 21, 12, 55, 10, 800000), + ... datetime(2023, 5, 21, 12, 55, 11, 0), + ... datetime(2023, 5, 21, 12, 55, 11, 200000) + ... ] + + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) + + We define a library agnostic function: + + >>> def func(s_any): + ... s = nw.from_native(s_any, series_only=True) + ... s = s.dt.millisecond().alias("datetime") + ... return nw.to_native(s) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + 0 400 + 1 600 + 2 800 + 3 0 + 4 200 + Name: datetime, dtype: int32 + """ return self._series.__class__(self._series._series.dt.millisecond()) + + def microsecond(self) -> Series: + """ + Extracts the microseconds in a datetime series. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> from datetime import datetime + >>> import narwhals as nw + >>> dates = [ + ... datetime(2023, 5, 21, 12, 55, 10, 400000), + ... datetime(2023, 5, 21, 12, 55, 10, 600000), + ... datetime(2023, 5, 21, 12, 55, 10, 800000), + ... datetime(2023, 5, 21, 12, 55, 11, 0), + ... datetime(2023, 5, 21, 12, 55, 11, 200000) + ... ] + + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) + + We define a library agnostic function: + + >>> def func(s_any): + ... s = nw.from_native(s_any, series_only=True) + ... s = s.dt.microsecond().alias("datetime") + ... return nw.to_native(s) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + 0 400000 + 1 600000 + 2 800000 + 3 0 + 4 200000 + Name: datetime, dtype: int32 + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (5,) + Series: 'datetime' [i32] + [ + 400000 + 600000 + 800000 + 0 + 200000 + ] + """ + return self._series.__class__(self._series._series.dt.microsecond()) + def ordinal_day(self) -> Series: """ diff --git a/tests/test_dt.py b/tests/test_dt.py index 5e7276064..03ef87214 100644 --- a/tests/test_dt.py +++ b/tests/test_dt.py @@ -17,8 +17,8 @@ data = { "a": [ - datetime(2021, 3, 1, 12, 34, 56), - datetime(2020, 1, 2, 2, 4, 14), + datetime(2021, 3, 1, 12, 34, 56, 49000), + datetime(2020, 1, 2, 2, 4, 14, 715000), ], } data_timedelta = { @@ -44,6 +44,8 @@ ("hour", [12, 2]), ("minute", [34, 4]), ("second", [56, 14]), + ("millisecond", [49, 715]), + ("microsecond", [49000, 715000]), ("ordinal_day", [60, 2]), ], ) From a10d4cbf122befcdf4f0987642401bc981cb28c4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 23:18:01 +0000 Subject: [PATCH 26/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- narwhals/_pandas_like/expr.py | 2 +- narwhals/_pandas_like/series.py | 6 ++---- narwhals/expression.py | 4 +--- narwhals/series.py | 3 +-- 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 02e473836..c9e021a8f 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -330,7 +330,7 @@ def second(self) -> PandasExpr: def millisecond(self) -> PandasExpr: return register_namespace_expression_call(self._expr, "dt", "millisecond") - + def microsecond(self) -> PandasExpr: return register_namespace_expression_call(self._expr, "dt", "microsecond") diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 513ef98b8..b52dc37c5 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -550,11 +550,9 @@ def millisecond(self) -> PandasSeries: return self._series._from_series( self._series._series.dt.microsecond // 1000, ) - + def microsecond(self) -> PandasSeries: - return self._series._from_series( - self._series._series.dt.microsecond - ) + return self._series._from_series(self._series._series.dt.microsecond) def ordinal_day(self) -> PandasSeries: ser = self._series._series diff --git a/narwhals/expression.py b/narwhals/expression.py index 0d240f2be..3d0aafbd8 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -1815,7 +1815,7 @@ def millisecond(self) -> Expr: └─────────────────────────┴──────┴────────┴────────┴─────────────┘ """ return self._expr.__class__(lambda plx: self._expr._call(plx).dt.millisecond()) - + def microsecond(self) -> Expr: """ Extract microseconds from underlying DateTime representation. @@ -1867,8 +1867,6 @@ def microsecond(self) -> Expr: └─────────────────────────┴──────┴────────┴────────┴─────────────┘ """ return self._expr.__class__(lambda plx: self._expr._call(plx).dt.microsecond()) - - def ordinal_day(self) -> Expr: """ diff --git a/narwhals/series.py b/narwhals/series.py index 2b9e29610..d75a3cca3 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1671,7 +1671,7 @@ def millisecond(self) -> Series: Name: datetime, dtype: int32 """ return self._series.__class__(self._series._series.dt.millisecond()) - + def microsecond(self) -> Series: """ Extracts the microseconds in a datetime series. @@ -1721,7 +1721,6 @@ def microsecond(self) -> Series: """ return self._series.__class__(self._series._series.dt.microsecond()) - def ordinal_day(self) -> Series: """ Get ordinal day. From 1f207668407edac6bc1767993ab64fa4ee935ab1 Mon Sep 17 00:00:00 2001 From: ugohuche Date: Wed, 22 May 2024 00:28:38 +0100 Subject: [PATCH 27/38] Modified addition of millisecond and microsecond methods --- narwhals/series.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/narwhals/series.py b/narwhals/series.py index 2b9e29610..98d7db5f2 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1668,7 +1668,17 @@ def millisecond(self) -> Series: 2 800 3 0 4 200 - Name: datetime, dtype: int32 + Name: datetime, dtype: int... + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (5,) + Series: 'datetime' [i32] + [ + 400 + 600 + 800 + 0 + 200 + ] """ return self._series.__class__(self._series._series.dt.millisecond()) @@ -1707,7 +1717,7 @@ def microsecond(self) -> Series: 2 800000 3 0 4 200000 - Name: datetime, dtype: int32 + Name: datetime, dtype: int... >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (5,) Series: 'datetime' [i32] From 9d9508c0f7c8e3aaa60afc4e653adb46cfdd31be Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 21 May 2024 11:40:42 +0200 Subject: [PATCH 28/38] fixup the example --- narwhals/series.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/narwhals/series.py b/narwhals/series.py index ec6d8e047..676315598 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -707,6 +707,11 @@ def sample( with_replacement: Allow values to be sampled more than once. + Notes: + The `sample` method returns a Series with a specified number of + randomly selected items chosen from this Series. + The results are not consistent across libraries. + Examples: >>> import narwhals as nw >>> import pandas as pd @@ -718,7 +723,7 @@ def sample( We define a library agnostic function: >>> def func(s_any): - ... nw.from_native(s_any, series_only=True) + ... s = nw.from_native(s_any, series_only=True) ... s = s.sample(fraction=1.0, with_replacement=True) ... return nw.to_native(s) From 76478a2510b3c279c8474419b534020e3fb55e21 Mon Sep 17 00:00:00 2001 From: Nwabueze Ugoh <126014542+brentomagic@users.noreply.github.com> Date: Thu, 23 May 2024 00:38:28 +0100 Subject: [PATCH 29/38] made corrections to expr.filter docstring --- narwhals/expression.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/narwhals/expression.py b/narwhals/expression.py index fb10d38cf..6ed06be8a 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -894,6 +894,44 @@ def is_in(self, other: Any) -> Expr: ) def filter(self, *predicates: Any) -> Expr: + """ + Filters elements based on a condition, returning a new expression. + + Examples: + >>> import polars as pl + >>> import pandas as pd + >>> import narwhals as nw + >>> df_pd = pd.DataFrame({'a': [2, 3, 4, 5, 6, 7], 'b': [10, 11, 12, 13, 14, 15]}) + >>> df_pl = pl.DataFrame({'a': [2, 3, 4, 5, 6, 7], 'b': [10, 11, 12, 13, 14, 15]}) + + Let's define a dataframe-agnostic function: + >>> def func(df_any): + ... df = nw.from_native(df_any) + ... df = df.select( + ... nw.col("a").filter(nw.col("a") > 4), + ... nw.col("b").filter(nw.col("b") > 12) + ... ) + ... return nw.to_native(df) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + a b + 3 5 13 + 4 6 14 + 5 7 15 + >>> func(df_pl) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 13 │ + │ 6 ┆ 14 │ + │ 7 ┆ 15 │ + └─────┴─────┘ + """ return self.__class__( lambda plx: self._call(plx).filter( *[extract_native(plx, pred) for pred in flatten(predicates)] From 2d44f258833215ed47e820415602b20439e1abbd Mon Sep 17 00:00:00 2001 From: Nwabueze Ugoh <126014542+brentomagic@users.noreply.github.com> Date: Thu, 23 May 2024 01:39:39 +0100 Subject: [PATCH 30/38] added different operator for 'b' column --- narwhals/expression.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/narwhals/expression.py b/narwhals/expression.py index 6ed06be8a..fe2c6014b 100644 --- a/narwhals/expression.py +++ b/narwhals/expression.py @@ -909,7 +909,7 @@ def filter(self, *predicates: Any) -> Expr: ... df = nw.from_native(df_any) ... df = df.select( ... nw.col("a").filter(nw.col("a") > 4), - ... nw.col("b").filter(nw.col("b") > 12) + ... nw.col("b").filter(nw.col("b") < 13) ... ) ... return nw.to_native(df) @@ -917,9 +917,9 @@ def filter(self, *predicates: Any) -> Expr: >>> func(df_pd) a b - 3 5 13 - 4 6 14 - 5 7 15 + 3 5 10 + 4 6 11 + 5 7 12 >>> func(df_pl) shape: (3, 2) ┌─────┬─────┐ @@ -927,9 +927,9 @@ def filter(self, *predicates: Any) -> Expr: │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═════╡ - │ 5 ┆ 13 │ - │ 6 ┆ 14 │ - │ 7 ┆ 15 │ + │ 5 ┆ 10 │ + │ 6 ┆ 11 │ + │ 7 ┆ 12 │ └─────┴─────┘ """ return self.__class__( From be46b4e1d85c3f10c3574d5934bd3c6807c21067 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 23 May 2024 16:34:56 +0200 Subject: [PATCH 31/38] dont convert_dtypes --- narwhals/_pandas_like/series.py | 10 +--------- narwhals/dependencies.py | 12 ++++++------ narwhals/series.py | 2 +- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index b52dc37c5..8ce050ee8 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -1,6 +1,5 @@ from __future__ import annotations -import warnings from typing import TYPE_CHECKING from typing import Any from typing import Sequence @@ -166,14 +165,7 @@ def is_in(self, other: Any) -> PandasSeries: import pandas as pd ser = self._series - with warnings.catch_warnings(): - # np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types` - warnings.filterwarnings( - "ignore", - message="np.find_common_type is deprecated.*", - category=DeprecationWarning, - ) - res = ser.isin(other).convert_dtypes() + res = ser.isin(other) res[ser.isna()] = pd.NA return self._from_series(res) diff --git a/narwhals/dependencies.py b/narwhals/dependencies.py index 035463f17..757069eb1 100644 --- a/narwhals/dependencies.py +++ b/narwhals/dependencies.py @@ -7,17 +7,17 @@ def get_polars() -> Any: - """Import Polars (if available - else return None).""" + """Get Polars module (if already imported - else return None).""" return sys.modules.get("polars", None) def get_pandas() -> Any: - """Import pandas (if available - else return None).""" + """Get pandas module (if already imported - else return None).""" return sys.modules.get("pandas", None) def get_modin() -> Any: # pragma: no cover - """Import modin (if available - else return None).""" + """Get modin.pandas module (if already imported - else return None).""" modin = sys.modules.get("modin", None) if modin is not None: return modin.pandas @@ -25,15 +25,15 @@ def get_modin() -> Any: # pragma: no cover def get_cudf() -> Any: - """Import cudf (if available - else return None).""" + """Get cudf module (if already imported - else return None).""" return sys.modules.get("cudf", None) def get_pyarrow() -> Any: - """Import pyarrow (if available - else return None).""" + """Get pyarrow module (if already imported - else return None).""" return sys.modules.get("pyarrow", None) def get_numpy() -> Any: - """Import numpy (if available - else return None).""" + """Get numpy module (if already imported - else return None).""" return sys.modules.get("numpy", None) diff --git a/narwhals/series.py b/narwhals/series.py index f67f143db..69bd033c0 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -494,7 +494,7 @@ def is_in(self, other: Any) -> Self: 0 False 1 True 2 True - dtype: boolean + dtype: bool >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [bool] From 269c9af3aa51f58d786881f59a3cd4fccda598f2 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 23 May 2024 16:51:23 +0200 Subject: [PATCH 32/38] Bump version to 0.8.17 --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 00f40751a..89b7f5d14 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals -'0.8.16' +'0.8.17' ``` then installation worked correctly! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index d78e01a5f..5c41293dc 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -31,7 +31,7 @@ from narwhals.utils import maybe_align_index from narwhals.utils import maybe_set_index -__version__ = "0.8.16" +__version__ = "0.8.17" __all__ = [ "concat", diff --git a/pyproject.toml b/pyproject.toml index a61661744..307d9374d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "0.8.16" +version = "0.8.17" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From bf5814d8ce8e5633b444539aefd1a77baac1a401 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 24 May 2024 11:52:48 +0200 Subject: [PATCH 33/38] ban 3rd party imports --- .pre-commit-config.yaml | 6 +++++- narwhals/_pandas_like/group_by.py | 14 +++++++------- narwhals/_pandas_like/namespace.py | 15 --------------- narwhals/_pandas_like/series.py | 6 ++---- narwhals/_pandas_like/utils.py | 22 +++++++++++----------- narwhals/dtypes.py | 3 ++- narwhals/series.py | 4 +--- 7 files changed, 28 insertions(+), 42 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9b0eaac7e..48c1079c8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,4 +28,8 @@ repos: entry: python -m utils.check_api_reference language: python additional_dependencies: [polars] - + - id: imports-are-banned + name: import are banned (use `get_pandas` instead of `import pandas`) + entry: (?>> )import (pandas|polars|modin|cudf) + language: pygrep + files: ^narwhals/ diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index b13f40971..4b7c24438 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -12,6 +12,8 @@ from narwhals._pandas_like.utils import is_simple_aggregation from narwhals._pandas_like.utils import item from narwhals._pandas_like.utils import parse_into_exprs +from narwhals._pandas_like.utils import series_from_iterable +from narwhals.dependencies import get_pandas from narwhals.utils import parse_version from narwhals.utils import remove_prefix @@ -88,9 +90,7 @@ def agg_pandas( # noqa: PLR0913 - https://github.com/rapidsai/cudf/issues/15118 - https://github.com/rapidsai/cudf/issues/15084 """ - import pandas as pd - - from narwhals._pandas_like.namespace import PandasNamespace + pd = get_pandas() all_simple_aggs = True for expr in exprs: @@ -140,8 +140,6 @@ def agg_pandas( # noqa: PLR0913 stacklevel=2, ) - plx = PandasNamespace(implementation=implementation) - def func(df: Any) -> Any: out_group = [] out_names = [] @@ -150,10 +148,12 @@ def func(df: Any) -> Any: for result_keys in results_keys: out_group.append(item(result_keys._series)) out_names.append(result_keys.name) - return plx._make_native_series(name="", data=out_group, index=out_names) + return series_from_iterable( + out_group, index=out_names, name="", implementation=implementation + ) if implementation == "pandas": - import pandas as pd + pd = get_pandas() if parse_version(pd.__version__) < parse_version("2.2.0"): # pragma: no cover result_complex = grouped.apply(func) diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 06f5f03fa..1bf12c611 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -41,21 +41,6 @@ class PandasNamespace: def selectors(self) -> PandasSelector: return PandasSelector(self._implementation) - def _make_native_series(self, name: str, data: list[Any], index: Any) -> Any: - if self._implementation == "pandas": - import pandas as pd - - return pd.Series(name=name, data=data, index=index) - if self._implementation == "modin": # pragma: no cover - import modin.pandas as mpd - - return mpd.Series(name=name, data=data, index=index) - if self._implementation == "cudf": # pragma: no cover - import cudf - - return cudf.Series(name=name, data=data, index=index) - raise NotImplementedError # pragma: no cover - # --- not in spec --- def __init__(self, implementation: str) -> None: self._implementation = implementation diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 8ce050ee8..74c641a70 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -9,6 +9,7 @@ from narwhals._pandas_like.utils import to_datetime from narwhals._pandas_like.utils import translate_dtype from narwhals._pandas_like.utils import validate_column_comparand +from narwhals.dependencies import get_pandas from narwhals.utils import parse_version if TYPE_CHECKING: @@ -81,7 +82,7 @@ def __init__( self._implementation = implementation self._use_copy_false = False if self._implementation == "pandas": - import pandas as pd + pd = get_pandas() if parse_version(pd.__version__) < parse_version("3.0.0"): self._use_copy_false = True @@ -162,11 +163,8 @@ def is_between( return self._from_series(res) def is_in(self, other: Any) -> PandasSeries: - import pandas as pd - ser = self._series res = ser.isin(other) - res[ser.isna()] = pd.NA return self._from_series(res) # Binary comparisons diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 605b4de09..1d387fbc8 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -256,17 +256,17 @@ def horizontal_concat(dfs: list[Any], implementation: str) -> Any: Should be in namespace. """ if implementation == "pandas": - import pandas as pd + pd = get_pandas() if parse_version(pd.__version__) < parse_version("3.0.0"): return pd.concat(dfs, axis=1, copy=False) return pd.concat(dfs, axis=1) # pragma: no cover if implementation == "cudf": # pragma: no cover - import cudf + cudf = get_cudf() return cudf.concat(dfs, axis=1) if implementation == "modin": # pragma: no cover - import modin.pandas as mpd + mpd = get_modin() return mpd.concat(dfs, axis=1) msg = f"Unknown implementation: {implementation}" # pragma: no cover @@ -289,17 +289,17 @@ def vertical_concat(dfs: list[Any], implementation: str) -> Any: msg = "unable to vstack, column names don't match" raise TypeError(msg) if implementation == "pandas": - import pandas as pd + pd = get_pandas() if parse_version(pd.__version__) < parse_version("3.0.0"): return pd.concat(dfs, axis=0, copy=False) return pd.concat(dfs, axis=0) # pragma: no cover if implementation == "cudf": # pragma: no cover - import cudf + cudf = get_cudf() return cudf.concat(dfs, axis=0) if implementation == "modin": # pragma: no cover - import modin.pandas as mpd + mpd = get_modin() return mpd.concat(dfs, axis=0) msg = f"Unknown implementation: {implementation}" # pragma: no cover @@ -311,15 +311,15 @@ def series_from_iterable( ) -> Any: """Return native series.""" if implementation == "pandas": - import pandas as pd + pd = get_pandas() return pd.Series(data, name=name, index=index, copy=False) if implementation == "cudf": # pragma: no cover - import cudf + cudf = get_cudf() return cudf.Series(data, name=name, index=index) if implementation == "modin": # pragma: no cover - import modin.pandas as mpd + mpd = get_modin() return mpd.Series(data, name=name, index=index) msg = f"Unknown implementation: {implementation}" # pragma: no cover @@ -390,9 +390,9 @@ def reverse_translate_dtype(dtype: DType | type[DType]) -> Any: if isinstance_or_issubclass(dtype, dtypes.UInt8): return "uint8" if isinstance_or_issubclass(dtype, dtypes.String): - import pandas as pd + pd = get_pandas() - if parse_version(pd.__version__) >= parse_version("2.0.0"): + if pd is not None and parse_version(pd.__version__) >= parse_version("2.0.0"): if get_pyarrow() is not None: return "string[pyarrow]" return "string[python]" # pragma: no cover diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index 161a6383f..49bc22395 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING from typing import Any +from narwhals.dependencies import get_polars from narwhals.utils import isinstance_or_issubclass if TYPE_CHECKING: @@ -116,7 +117,7 @@ def translate_dtype(plx: Any, dtype: DType) -> Any: def to_narwhals_dtype(dtype: Any, *, is_polars: bool) -> DType: if not is_polars: return dtype # type: ignore[no-any-return] - import polars as pl + pl = get_polars() if dtype == pl.Float64: return Float64() diff --git a/narwhals/series.py b/narwhals/series.py index 69bd033c0..28506eff0 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -67,9 +67,7 @@ def __getitem__(self, idx: int | slice) -> Any: def __narwhals_namespace__(self) -> Any: if self._is_polars: - import polars as pl - - return pl + return get_polars() return self._series.__narwhals_namespace__() @property From c7f4e2d3fff2e983857587111fae379146f67e3d Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 24 May 2024 12:00:40 +0200 Subject: [PATCH 34/38] Bump version to 0.8.18 --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 89b7f5d14..12fa65d8a 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals -'0.8.17' +'0.8.18' ``` then installation worked correctly! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 5c41293dc..659a8e410 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -31,7 +31,7 @@ from narwhals.utils import maybe_align_index from narwhals.utils import maybe_set_index -__version__ = "0.8.17" +__version__ = "0.8.18" __all__ = [ "concat", diff --git a/pyproject.toml b/pyproject.toml index 307d9374d..11db43421 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "0.8.17" +version = "0.8.18" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From 6244a86f4848625bac01f9924f6086586e973444 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Fri, 24 May 2024 13:51:47 +0200 Subject: [PATCH 35/38] doc: fill in ser.is_between docstrings --- narwhals/series.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/narwhals/series.py b/narwhals/series.py index 28506eff0..2e84ca352 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -973,6 +973,54 @@ def fill_null(self, value: Any) -> Self: def is_between( self, lower_bound: Any, upper_bound: Any, closed: str = "both" ) -> Self: + """ + Get a boolean mask of the values that are between the given lower/upper bounds. + + Arguments: + lower_bound: Lower bound value. + + upper_bound: Upper bound value. + + closed: Define which sides of the interval are closed (inclusive). + + Notes: + If the value of the `lower_bound` is greater than that of the `upper_bound`, + then the values will be False, as no value can satisfy the condition. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> s_pd = pd.Series([1, 2, 3, 4, 5]) + >>> s_pl = pl.Series([1, 2, 3, 4, 5]) + + We define a library agnostic function: + + >>> def func(s_any): + ... s = nw.from_native(s_any, series_only=True) + ... s = s.is_between(2, 4, 'right') + ... return nw.to_native(s) + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + 0 False + 1 False + 2 True + 3 True + 4 False + dtype: bool + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (5,) + Series: '' [bool] + [ + false + false + true + true + false + ] + """ return self._from_series( self._series.is_between(lower_bound, upper_bound, closed=closed) ) From bf6589eb958553b1af56a2edf974607163ef5ad3 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Fri, 24 May 2024 15:07:34 +0200 Subject: [PATCH 36/38] add missing spaces --- narwhals/series.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/narwhals/series.py b/narwhals/series.py index 2e84ca352..03f2e5ba9 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1004,11 +1004,11 @@ def is_between( We can then pass either pandas or Polars to `func`: >>> func(s_pd) - 0 False - 1 False - 2 True - 3 True - 4 False + 0 False + 1 False + 2 True + 3 True + 4 False dtype: bool >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (5,) From 766b4642f2e5f10fcb4b9fded10ccf28a9fb579a Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 25 May 2024 13:45:40 +0200 Subject: [PATCH 37/38] return narwhals series in to_dict --- README.md | 5 ++-- docs/index.md | 1 + docs/related.md | 6 ++--- narwhals/dataframe.py | 54 ++++++++----------------------------------- tests/test_common.py | 4 ++-- 5 files changed, 19 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 42d1b8e8f..0172a847a 100644 --- a/README.md +++ b/README.md @@ -22,14 +22,15 @@ Seamlessly support all, without depending on any! - ✅ Use **Expressions** - ✅ 100% branch coverage, tested against pandas and Polars nightly builds! - ✅ Preserve your Index (if present) without it getting in the way! +- ✅ **Zero 3rd party imports**, Narwhals only uses what you already have! ## Used by Join the party! -- [timebasedcv](https://github.com/FBruzzesi/timebasedcv) -- [scikit-lego](https://github.com/koaning/scikit-lego) (work-in-progress, in `narwhals-development` branch) +- [scikit-lego](https://github.com/koaning/scikit-lego) - [scikit-playtime](https://github.com/koaning/scikit-playtime) +- [timebasedcv](https://github.com/FBruzzesi/timebasedcv) ## Installation diff --git a/docs/index.md b/docs/index.md index 6bb427787..eb48084db 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,6 +12,7 @@ Seamlessly support both, without depending on either! - ✅ Use **Expressions** - ✅ Tested against pandas and Polars nightly builds! - ✅ Preserve your Index (if present) without it getting in the way! +- ✅ **Zero 3rd party imports**, Narwhals only uses what you already have! ## Who's this for? diff --git a/docs/related.md b/docs/related.md index 124611dcd..c4a5e6cb9 100644 --- a/docs/related.md +++ b/docs/related.md @@ -23,11 +23,11 @@ The projects are not in competition and have different goals. [Presents itself as a dataframe standard](https://voltrondata.com/resources/open-source-standards), and dispatches to 20+ backends. Some differences with Narwhals are: -- Narwhals is ~1000 times lighter +- Narwhals is ~1000 times lighter and is aimed at library maintainers as opposed to end users - Narwhals only supports 4 backends, Ibis more than 20 -- Narwhals is limited to fundamental dataframe operations, Ibis includes more advanced and niche ones. +- Narwhals is focused on fundamental dataframe operations, Ibis on SQL backends -Again, the projects are not in competition and have different goals. +The projects are not in competition and have different goals. ## Array API diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index c5f911643..a0fe445bf 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -393,8 +393,8 @@ def to_dict(self, *, as_series: bool = True) -> dict[str, Any]: Convert DataFrame to a dictionary mapping column name to values. Arguments: - as_series: If set to true ``True`` values are Series, otherwise - values are Any. + as_series: If set to true ``True``, then the values are Narwhals Series, + otherwise the values are Any. Examples: >>> import polars as pl @@ -429,49 +429,15 @@ def to_dict(self, *, as_series: bool = True) -> dict[str, Any]: └─────┴────────┴─────┴────────┴──────────┘ >>> df.to_dict(as_series=False) {'A': [1, 2, 3, 4, 5], 'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'], 'B': [5, 4, 3, 2, 1], 'cars': ['beetle', 'audi', 'beetle', 'beetle', 'beetle'], 'optional': [28, 300, None, 2, -30]} - >>> df.to_dict(as_series=True) # doctest: +SKIP - {'A': shape: (5,) - Series: 'A' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ], 'fruits': shape: (5,) - Series: 'fruits' [str] - [ - "banana" - "banana" - "apple" - "apple" - "banana" - ], 'B': shape: (5,) - Series: 'B' [i64] - [ - 5 - 4 - 3 - 2 - 1 - ], 'cars': shape: (5,) - Series: 'cars' [str] - [ - "beetle" - "audi" - "beetle" - "beetle" - "beetle" - ], 'optional': shape: (5,) - Series: 'optional' [i64] - [ - 28 - 300 - null - 2 - -30 - ]} """ + from narwhals.series import Series + + if as_series: + return { + key: Series(value) + for key, value in self._dataframe.to_dict(as_series=as_series).items() + } + # TODO: overload return type return self._dataframe.to_dict(as_series=as_series) # type: ignore[no-any-return] # inherited diff --git a/tests/test_common.py b/tests/test_common.py index d3b29112c..3f7e6fb8d 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -572,7 +572,7 @@ def test_to_dict() -> None: "z": pd.Series([7.0, 8, 9], name="z"), } for key in expected: - pd_assert_series_equal(result[key], expected[key]) + pd_assert_series_equal(nw.to_native(result[key]), expected[key]) df = nw.DataFrame(df_polars) result = df.to_dict(as_series=True) @@ -582,7 +582,7 @@ def test_to_dict() -> None: "z": pl.Series("z", [7.0, 8, 9]), } for key in expected: - pl_assert_series_equal(result[key], expected[key]) + pl_assert_series_equal(nw.to_native(result[key]), expected[key]) @pytest.mark.parametrize( From fbed74c208e5c88a8a9d303f5e0bcf45b19cc9fa Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 25 May 2024 15:34:40 +0200 Subject: [PATCH 38/38] coverage --- narwhals/_pandas_like/group_by.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 4b7c24438..39c4979d7 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -157,7 +157,7 @@ def func(df: Any) -> Any: if parse_version(pd.__version__) < parse_version("2.2.0"): # pragma: no cover result_complex = grouped.apply(func) - else: # pragma: no cover + else: result_complex = grouped.apply(func, include_groups=False) else: # pragma: no cover result_complex = grouped.apply(func)