From e90afe45438fa431ea5c8a06314d7abfbdd77ac7 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Thu, 4 Jul 2024 22:24:08 +0200 Subject: [PATCH] feat: add `clone` for dataframes (#406) * add clone * add to lazy frame too --- docs/api-reference/dataframe.md | 1 + docs/api-reference/lazyframe.md | 1 + narwhals/_arrow/dataframe.py | 3 ++ narwhals/_pandas_like/dataframe.py | 3 ++ narwhals/dataframe.py | 74 ++++++++++++++++++++++++++++++ tests/conftest.py | 9 ++++ tests/frame/clone_test.py | 13 ++++++ 7 files changed, 104 insertions(+) create mode 100644 tests/frame/clone_test.py diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md index 67520527a..a00037291 100644 --- a/docs/api-reference/dataframe.md +++ b/docs/api-reference/dataframe.md @@ -4,6 +4,7 @@ handler: python options: members: + - clone - columns - drop - drop_nulls diff --git a/docs/api-reference/lazyframe.md b/docs/api-reference/lazyframe.md index 96e1a3ffe..8869e5aaf 100644 --- a/docs/api-reference/lazyframe.md +++ b/docs/api-reference/lazyframe.md @@ -4,6 +4,7 @@ handler: python options: members: + - clone - collect - columns - drop diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 6c9c390d2..e8dd7dab2 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -151,3 +151,6 @@ def lazy(self) -> Self: def collect(self) -> ArrowDataFrame: return ArrowDataFrame(self._dataframe) + + def clone(self) -> Self: + raise NotImplementedError("clone is not yet supported on PyArrow tables") diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 443e2add0..a38f1bd4b 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -452,3 +452,6 @@ def item(self: Self, row: int | None = None, column: int | str | None = None) -> _col = self.columns.index(column) if isinstance(column, str) else column return self._dataframe.iat[row, _col] + + 
def clone(self: Self) -> Self: + return self._from_dataframe(self._dataframe.copy()) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 239fcd2c4..399b61cf0 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -198,6 +198,9 @@ def join( ) ) + def clone(self) -> Self: + return self._from_dataframe(self._dataframe.clone()) + class DataFrame(BaseFrame): """ @@ -1732,6 +1735,42 @@ def item(self: Self, row: int | None = None, column: int | str | None = None) -> """ return self._dataframe.item(row=row, column=column) + def clone(self) -> Self: + r""" + Create a copy of this DataFrame. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function in which we clone the DataFrame: + + >>> @nw.narwhalify + ... def func(df): + ... return df.clone() + + >>> func(df_pd) + a b + 0 1 3 + 1 2 4 + + >>> func(df_pl) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + """ + return super().clone() + class LazyFrame(BaseFrame): """ @@ -2889,3 +2928,38 @@ def join( └─────┴─────┴─────┴───────┘ """ return super().join(other, how=how, left_on=left_on, right_on=right_on) + + def clone(self) -> Self: + r""" + Create a copy of this LazyFrame. + + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.LazyFrame(data) + + Let's define a dataframe-agnostic function in which we copy the DataFrame: + + >>> @nw.narwhalify + ... def func(df): + ... 
return df.clone() + + >>> func(df_pd) + a b + 0 1 3 + 1 2 4 + + >>> func(df_pl).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + """ + return super().clone() diff --git a/tests/conftest.py b/tests/conftest.py index 9b7fc942f..5b71cdc0e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -52,6 +52,10 @@ def polars_constructor(obj: Any) -> IntoDataFrame: return pl.DataFrame(obj) +def polars_lazy_constructor(obj: Any) -> pl.LazyFrame: + return pl.LazyFrame(obj) + + if parse_version(pd.__version__) >= parse_version("2.0.0"): params = [pandas_constructor, pandas_nullable_constructor, pandas_pyarrow_constructor] else: # pragma: no cover @@ -66,6 +70,11 @@ def constructor(request: Any) -> Callable[[Any], IntoDataFrame]: return request.param # type: ignore[no-any-return] +@pytest.fixture(params=[*params, polars_lazy_constructor]) +def constructor_with_lazy(request: Any) -> Callable[[Any], Any]: + return request.param # type: ignore[no-any-return] + + # TODO: once pyarrow has complete coverage, we can remove this one, # and just put `pa.table` into `constructor` @pytest.fixture(params=[*params, pa.table]) diff --git a/tests/frame/clone_test.py b/tests/frame/clone_test.py new file mode 100644 index 000000000..9d9835063 --- /dev/null +++ b/tests/frame/clone_test.py @@ -0,0 +1,13 @@ +from typing import Any + +import narwhals as nw +from tests.utils import compare_dicts + + +def test_clone(constructor_with_lazy: Any) -> None: + expected = {"a": [1, 2], "b": [3, 4]} + df = nw.from_native(constructor_with_lazy(expected)) + df_clone = df.clone() + assert df is not df_clone + assert df._dataframe is not df_clone._dataframe + compare_dicts(df_clone, expected)