From 676731ccf11145b8b47091aae72f6353b48e4e55 Mon Sep 17 00:00:00 2001 From: Luciano <66913960+lucianosrp@users.noreply.github.com> Date: Mon, 19 Aug 2024 20:41:48 +0200 Subject: [PATCH] feat: add `dt.to_string` to `DaskExpr` (#796) * first test refactoring * tests: complete expr/series test separation * feat: add first implementation * some more refactoring * Apply suggestions from code review Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> * fix: add '.f' handling * tests: add modin catch * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tests: add modin catch --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- narwhals/_dask/expr.py | 12 +- tests/expr_and_series/dt/to_string_test.py | 140 +++++++++++++++------ 2 files changed, 113 insertions(+), 39 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 2b5d16897..faedb6095 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -475,7 +475,9 @@ def fill_null(self, value: Any) -> DaskExpr: ) def clip( - self: Self, lower_bound: Any | None = None, upper_bound: Any | None = None + self: Self, + lower_bound: Any | None = None, + upper_bound: Any | None = None, ) -> Self: return self._from_call( lambda _input, _lower, _upper: _input.clip(lower=_lower, upper=_upper), @@ -798,6 +800,14 @@ def ordinal_day(self) -> DaskExpr: returns_scalar=False, ) + def to_string(self, format: str) -> DaskExpr: # noqa: A002 + return self._expr._from_call( + lambda _input, _format: _input.dt.strftime(_format), + "strftime", + format.replace("%.f", ".%f"), + returns_scalar=False, + ) + def total_minutes(self) -> DaskExpr: return self._expr._from_call( lambda _input: _input.dt.total_seconds() // 60, diff --git a/tests/expr_and_series/dt/to_string_test.py b/tests/expr_and_series/dt/to_string_test.py index b78f2d1f7..8cd7ae4c5 100644 --- a/tests/expr_and_series/dt/to_string_test.py +++ b/tests/expr_and_series/dt/to_string_test.py @@ -6,6 +6,7 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import compare_dicts from tests.utils import is_windows data = { @@ -17,31 +18,71 @@ @pytest.mark.parametrize( - "fmt", ["%Y-%m-%d", "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S", "%G-W%V-%u", "%G-W%V"] + "fmt", + [ + "%Y-%m-%d", + "%Y-%m-%d %H:%M:%S", + "%Y/%m/%d %H:%M:%S", + "%G-W%V-%u", + "%G-W%V", + ], ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") -def test_dt_to_string(constructor_eager: Any, fmt: str) -> None: +def test_dt_to_string_series(constructor_eager: Any, fmt: str) -> None: input_frame = nw.from_native(constructor_eager(data), eager_only=True) input_series = input_frame["a"] expected_col = [datetime.strftime(d, fmt) for d in data["a"]] - result = input_series.dt.to_string(fmt).to_list() + result = {"a": input_series.dt.to_string(fmt)} + if any( x in str(constructor_eager) for x in ["pandas_pyarrow", "pyarrow_table", "modin"] ): # PyArrow differs from other libraries, in that %S also shows # the fraction of a second. - result = [x[: x.find(".")] if "." in x else x for x in result] - assert result == expected_col - result = input_frame.select(nw.col("a").dt.to_string(fmt))["a"].to_list() - if any( - x in str(constructor_eager) for x in ["pandas_pyarrow", "pyarrow_table", "modin"] - ): + result = {"a": input_series.dt.to_string(fmt).str.replace(r"\.\d+$", "")} + + compare_dicts(result, {"a": expected_col}) + + +@pytest.mark.parametrize( + "fmt", + [ + "%Y-%m-%d", + "%Y-%m-%d %H:%M:%S", + "%Y/%m/%d %H:%M:%S", + "%G-W%V-%u", + "%G-W%V", + ], +) +@pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") +def test_dt_to_string_expr(constructor: Any, fmt: str) -> None: + input_frame = nw.from_native(constructor(data)) + + expected_col = [datetime.strftime(d, fmt) for d in data["a"]] + + result = input_frame.select(nw.col("a").dt.to_string(fmt).alias("b")) + if any(x in str(constructor) for x in ["pandas_pyarrow", "pyarrow_table", "modin"]): # PyArrow differs from other libraries, in that %S also shows # the fraction of a second. - result = [x[: x.find(".")] if "." in x else x for x in result] - assert result == expected_col + result = input_frame.select( + nw.col("a").dt.to_string(fmt).str.replace(r"\.\d+$", "").alias("b") + ) + compare_dicts(result, {"b": expected_col}) + + +def _clean_string(result: str) -> str: + # rstrip '0' to remove trailing zeros, as different libraries handle this differently + # if there's then a trailing `.`, remove that too. + if "." in result: + result = result.rstrip("0").rstrip(".") + return result + + +def _clean_string_expr(e: Any) -> Any: + # Same as `_clean_string` but for Expr + return e.str.replace_all(r"0+$", "").str.replace_all(r"\.$", "") @pytest.mark.parametrize( @@ -50,20 +91,16 @@ def test_dt_to_string(constructor_eager: Any, fmt: str) -> None: (datetime(2020, 1, 9), "2020-01-09T00:00:00.000000"), (datetime(2020, 1, 9, 12, 34, 56), "2020-01-09T12:34:56.000000"), (datetime(2020, 1, 9, 12, 34, 56, 123), "2020-01-09T12:34:56.000123"), - (datetime(2020, 1, 9, 12, 34, 56, 123456), "2020-01-09T12:34:56.123456"), + ( + datetime(2020, 1, 9, 12, 34, 56, 123456), + "2020-01-09T12:34:56.123456", + ), ], ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") -def test_dt_to_string_iso_local_datetime( +def test_dt_to_string_iso_local_datetime_series( constructor_eager: Any, data: datetime, expected: str ) -> None: - def _clean_string(result: str) -> str: - # rstrip '0' to remove trailing zeros, as different libraries handle this differently - # if there's then a trailing `.`, remove that too. - if "." in result: - result = result.rstrip("0").rstrip(".") - return result - df = constructor_eager({"a": [data]}) result = ( nw.from_native(df, eager_only=True)["a"] @@ -72,13 +109,6 @@ def _clean_string(result: str) -> str: ) assert _clean_string(result) == _clean_string(expected) - result = ( - nw.from_native(df, eager_only=True) - .select(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S.%f"))["a"] - .to_list()[0] - ) - assert _clean_string(result) == _clean_string(expected) - result = ( nw.from_native(df, eager_only=True)["a"] .dt.to_string("%Y-%m-%dT%H:%M:%S%.f") @@ -86,12 +116,35 @@ def _clean_string(result: str) -> str: ) assert _clean_string(result) == _clean_string(expected) - result = ( - nw.from_native(df, eager_only=True) - .select(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S%.f"))["a"] - .to_list()[0] + +@pytest.mark.parametrize( + ("data", "expected"), + [ + (datetime(2020, 1, 9, 12, 34, 56), "2020-01-09T12:34:56.000000"), + (datetime(2020, 1, 9, 12, 34, 56, 123), "2020-01-09T12:34:56.000123"), + ( + datetime(2020, 1, 9, 12, 34, 56, 123456), + "2020-01-09T12:34:56.123456", + ), + ], +) +@pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") +def test_dt_to_string_iso_local_datetime_expr( + request: Any, constructor: Any, data: datetime, expected: str +) -> None: + if "modin" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = constructor({"a": [data]}) + + result = nw.from_native(df).with_columns( + _clean_string_expr(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S.%f")).alias("b") ) - assert _clean_string(result) == _clean_string(expected) + compare_dicts(result, {"a": [data], "b": [_clean_string(expected)]}) + + result = nw.from_native(df).with_columns( + _clean_string_expr(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S%.f")).alias("b") + ) + compare_dicts(result, {"a": [data], "b": [_clean_string(expected)]}) @pytest.mark.parametrize( @@ -99,7 +152,7 @@ def _clean_string(result: str) -> str: [(datetime(2020, 1, 9), "2020-01-09")], ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") -def test_dt_to_string_iso_local_date( +def test_dt_to_string_iso_local_date_series( constructor_eager: Any, data: datetime, expected: str ) -> None: df = constructor_eager({"a": [data]}) @@ -108,9 +161,20 @@ def test_dt_to_string_iso_local_date( ) assert result == expected - result = ( - nw.from_native(df, eager_only=True) - .select(b=nw.col("a").dt.to_string("%Y-%m-%d"))["b"] - .to_list()[0] + +@pytest.mark.parametrize( + ("data", "expected"), + [(datetime(2020, 1, 9), "2020-01-09")], +) +@pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") +def test_dt_to_string_iso_local_date_expr( + request: Any, constructor: Any, data: datetime, expected: str +) -> None: + if "modin" in str(constructor): + request.applymarker(pytest.mark.xfail) + + df = constructor({"a": [data]}) + result = nw.from_native(df).with_columns( + nw.col("a").dt.to_string("%Y-%m-%d").alias("b") ) - assert result == expected + compare_dicts(result, {"a": [data], "b": [expected]})