feat: add dt.to_string to DaskExpr (#796)

* first test refactoring * tests: complete expr/series test separation * feat: add first implementation * some more refactoring * Apply suggestions from code review Co-authored-by: Francesco Bruzzesi <[email protected]> * fix: add '.f' handling * tests: add modin catch * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tests: add modin catch --------- Co-authored-by: Francesco Bruzzesi <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
narwhals-dev · Aug 19, 2024 · 676731c · 676731c
1 parent 949ce05
commit 676731c
Show file tree

Hide file tree

Showing 2 changed files with 113 additions and 39 deletions.
diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py
@@ -475,7 +475,9 @@ def fill_null(self, value: Any) -> DaskExpr:
         )
 
     def clip(
-        self: Self, lower_bound: Any | None = None, upper_bound: Any | None = None
+        self: Self,
+        lower_bound: Any | None = None,
+        upper_bound: Any | None = None,
     ) -> Self:
         return self._from_call(
             lambda _input, _lower, _upper: _input.clip(lower=_lower, upper=_upper),
@@ -798,6 +800,14 @@ def ordinal_day(self) -> DaskExpr:
             returns_scalar=False,
         )
 
+    def to_string(self, format: str) -> DaskExpr:  # noqa: A002
+        return self._expr._from_call(
+            lambda _input, _format: _input.dt.strftime(_format),
+            "strftime",
+            format.replace("%.f", ".%f"),
+            returns_scalar=False,
+        )
+
     def total_minutes(self) -> DaskExpr:
         return self._expr._from_call(
             lambda _input: _input.dt.total_seconds() // 60,

diff --git a/tests/expr_and_series/dt/to_string_test.py b/tests/expr_and_series/dt/to_string_test.py
@@ -6,6 +6,7 @@
 import pytest
 
 import narwhals.stable.v1 as nw
+from tests.utils import compare_dicts
 from tests.utils import is_windows
 
 data = {
@@ -17,31 +18,71 @@
 
 
 @pytest.mark.parametrize(
-    "fmt", ["%Y-%m-%d", "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S", "%G-W%V-%u", "%G-W%V"]
+    "fmt",
+    [
+        "%Y-%m-%d",
+        "%Y-%m-%d %H:%M:%S",
+        "%Y/%m/%d %H:%M:%S",
+        "%G-W%V-%u",
+        "%G-W%V",
+    ],
 )
 @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows")
-def test_dt_to_string(constructor_eager: Any, fmt: str) -> None:
+def test_dt_to_string_series(constructor_eager: Any, fmt: str) -> None:
     input_frame = nw.from_native(constructor_eager(data), eager_only=True)
     input_series = input_frame["a"]
 
     expected_col = [datetime.strftime(d, fmt) for d in data["a"]]
 
-    result = input_series.dt.to_string(fmt).to_list()
+    result = {"a": input_series.dt.to_string(fmt)}
+
     if any(
         x in str(constructor_eager) for x in ["pandas_pyarrow", "pyarrow_table", "modin"]
     ):
         # PyArrow differs from other libraries, in that %S also shows
         # the fraction of a second.
-        result = [x[: x.find(".")] if "." in x else x for x in result]
-    assert result == expected_col
-    result = input_frame.select(nw.col("a").dt.to_string(fmt))["a"].to_list()
-    if any(
-        x in str(constructor_eager) for x in ["pandas_pyarrow", "pyarrow_table", "modin"]
-    ):
+        result = {"a": input_series.dt.to_string(fmt).str.replace(r"\.\d+$", "")}
+
+    compare_dicts(result, {"a": expected_col})
+
+
+@pytest.mark.parametrize(
+    "fmt",
+    [
+        "%Y-%m-%d",
+        "%Y-%m-%d %H:%M:%S",
+        "%Y/%m/%d %H:%M:%S",
+        "%G-W%V-%u",
+        "%G-W%V",
+    ],
+)
+@pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows")
+def test_dt_to_string_expr(constructor: Any, fmt: str) -> None:
+    input_frame = nw.from_native(constructor(data))
+
+    expected_col = [datetime.strftime(d, fmt) for d in data["a"]]
+
+    result = input_frame.select(nw.col("a").dt.to_string(fmt).alias("b"))
+    if any(x in str(constructor) for x in ["pandas_pyarrow", "pyarrow_table", "modin"]):
         # PyArrow differs from other libraries, in that %S also shows
         # the fraction of a second.
-        result = [x[: x.find(".")] if "." in x else x for x in result]
-    assert result == expected_col
+        result = input_frame.select(
+            nw.col("a").dt.to_string(fmt).str.replace(r"\.\d+$", "").alias("b")
+        )
+    compare_dicts(result, {"b": expected_col})
+
+
+def _clean_string(result: str) -> str:
+    # rstrip '0' to remove trailing zeros, as different libraries handle this differently
+    # if there's then a trailing `.`, remove that too.
+    if "." in result:
+        result = result.rstrip("0").rstrip(".")
+    return result
+
+
+def _clean_string_expr(e: Any) -> Any:
+    # Same as `_clean_string` but for Expr
+    return e.str.replace_all(r"0+$", "").str.replace_all(r"\.$", "")
 
 
 @pytest.mark.parametrize(
@@ -50,20 +91,16 @@ def test_dt_to_string(constructor_eager: Any, fmt: str) -> None:
         (datetime(2020, 1, 9), "2020-01-09T00:00:00.000000"),
         (datetime(2020, 1, 9, 12, 34, 56), "2020-01-09T12:34:56.000000"),
         (datetime(2020, 1, 9, 12, 34, 56, 123), "2020-01-09T12:34:56.000123"),
-        (datetime(2020, 1, 9, 12, 34, 56, 123456), "2020-01-09T12:34:56.123456"),
+        (
+            datetime(2020, 1, 9, 12, 34, 56, 123456),
+            "2020-01-09T12:34:56.123456",
+        ),
     ],
 )
 @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows")
-def test_dt_to_string_iso_local_datetime(
+def test_dt_to_string_iso_local_datetime_series(
     constructor_eager: Any, data: datetime, expected: str
 ) -> None:
-    def _clean_string(result: str) -> str:
-        # rstrip '0' to remove trailing zeros, as different libraries handle this differently
-        # if there's then a trailing `.`, remove that too.
-        if "." in result:
-            result = result.rstrip("0").rstrip(".")
-        return result
-
     df = constructor_eager({"a": [data]})
     result = (
         nw.from_native(df, eager_only=True)["a"]
@@ -72,34 +109,50 @@ def _clean_string(result: str) -> str:
     )
     assert _clean_string(result) == _clean_string(expected)
 
-    result = (
-        nw.from_native(df, eager_only=True)
-        .select(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S.%f"))["a"]
-        .to_list()[0]
-    )
-    assert _clean_string(result) == _clean_string(expected)
-
     result = (
         nw.from_native(df, eager_only=True)["a"]
         .dt.to_string("%Y-%m-%dT%H:%M:%S%.f")
         .to_list()[0]
     )
     assert _clean_string(result) == _clean_string(expected)
 
-    result = (
-        nw.from_native(df, eager_only=True)
-        .select(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S%.f"))["a"]
-        .to_list()[0]
+
+@pytest.mark.parametrize(
+    ("data", "expected"),
+    [
+        (datetime(2020, 1, 9, 12, 34, 56), "2020-01-09T12:34:56.000000"),
+        (datetime(2020, 1, 9, 12, 34, 56, 123), "2020-01-09T12:34:56.000123"),
+        (
+            datetime(2020, 1, 9, 12, 34, 56, 123456),
+            "2020-01-09T12:34:56.123456",
+        ),
+    ],
+)
+@pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows")
+def test_dt_to_string_iso_local_datetime_expr(
+    request: Any, constructor: Any, data: datetime, expected: str
+) -> None:
+    if "modin" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+    df = constructor({"a": [data]})
+
+    result = nw.from_native(df).with_columns(
+        _clean_string_expr(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S.%f")).alias("b")
     )
-    assert _clean_string(result) == _clean_string(expected)
+    compare_dicts(result, {"a": [data], "b": [_clean_string(expected)]})
+
+    result = nw.from_native(df).with_columns(
+        _clean_string_expr(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S%.f")).alias("b")
+    )
+    compare_dicts(result, {"a": [data], "b": [_clean_string(expected)]})
 
 
 @pytest.mark.parametrize(
     ("data", "expected"),
     [(datetime(2020, 1, 9), "2020-01-09")],
 )
 @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows")
-def test_dt_to_string_iso_local_date(
+def test_dt_to_string_iso_local_date_series(
     constructor_eager: Any, data: datetime, expected: str
 ) -> None:
     df = constructor_eager({"a": [data]})
@@ -108,9 +161,20 @@ def test_dt_to_string_iso_local_date(
     )
     assert result == expected
 
-    result = (
-        nw.from_native(df, eager_only=True)
-        .select(b=nw.col("a").dt.to_string("%Y-%m-%d"))["b"]
-        .to_list()[0]
+
+@pytest.mark.parametrize(
+    ("data", "expected"),
+    [(datetime(2020, 1, 9), "2020-01-09")],
+)
+@pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows")
+def test_dt_to_string_iso_local_date_expr(
+    request: Any, constructor: Any, data: datetime, expected: str
+) -> None:
+    if "modin" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
+    df = constructor({"a": [data]})
+    result = nw.from_native(df).with_columns(
+        nw.col("a").dt.to_string("%Y-%m-%d").alias("b")
     )
-    assert result == expected
+    compare_dicts(result, {"a": [data], "b": [expected]})