diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 3c2849024..20c93c896 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -777,6 +777,7 @@ def unique( def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyFrame: from narwhals.utils import parse_version + pandas_df = self.to_pandas() if backend is None: return self elif backend is Implementation.DUCKDB: @@ -784,9 +785,8 @@ def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyF from narwhals._duckdb.dataframe import DuckDBLazyFrame - df = self._native_frame # noqa: F841 return DuckDBLazyFrame( - df=duckdb.table("df"), + df=duckdb.table("pandas_df"), backend_version=parse_version(duckdb.__version__), version=self._version, validate_column_names=False, @@ -797,7 +797,7 @@ def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyF from narwhals._polars.dataframe import PolarsLazyFrame return PolarsLazyFrame( - df=pl.from_pandas(self._native_frame).lazy(), + df=pl.from_pandas(pandas_df).lazy(), backend_version=parse_version(pl.__version__), version=self._version, ) @@ -808,7 +808,7 @@ def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyF from narwhals._dask.dataframe import DaskLazyFrame return DaskLazyFrame( - native_dataframe=dd.from_pandas(self._native_frame), + native_dataframe=dd.from_pandas(pandas_df), backend_version=parse_version(dask.__version__), version=self._version, validate_column_names=False, diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 3ac9ef834..371a78b19 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -507,7 +507,6 @@ def __arrow_c_stream__(self: Self, requested_schema: object | None = None) -> ob def lazy( self: Self, - *, backend: ModuleType | Implementation | str | None = None, ) -> LazyFrame[Any]: """Restrict available API methods to lazy-only ones. 
diff --git a/narwhals/functions.py b/narwhals/functions.py index b13bf51f0..74f0c84e5 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -28,6 +28,7 @@ from narwhals.utils import flatten from narwhals.utils import parse_version from narwhals.utils import validate_laziness +from narwhals.utils import validate_native_namespace_and_backend # Missing type parameters for generic type "DataFrame" # However, trying to provide one results in mypy still complaining... @@ -374,6 +375,7 @@ def from_dict( data: dict[str, Any], schema: dict[str, DType] | Schema | None = None, *, + backend: ModuleType | Implementation | str | None = None, native_namespace: ModuleType | None = None, ) -> DataFrame[Any]: """Instantiate DataFrame from dictionary. @@ -388,9 +390,22 @@ def from_dict( Arguments: data: Dictionary to create DataFrame from. schema: The DataFrame schema as Schema or dict of {name: type}. - native_namespace: The native library to use for DataFrame creation. Only + backend: specifies which eager backend to instantiate. Only necessary if inputs are not Narwhals Series. + `backend` can be specified in various ways: + + - As `Implementation.BACKEND` with `BACKEND` being `PANDAS`, `PYARROW`, + `POLARS`, `MODIN` or `CUDF`. + - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`. + - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`. + native_namespace: The native library to use for DataFrame creation. + + **Deprecated** (v1.26.0): + Please use `backend` instead. Note that `native_namespace` is still available + (and won't emit a deprecation warning) if you use `narwhals.stable.v1`, + see [perfect backwards compatibility policy](../backcompat.md/). + Returns: A new DataFrame.
@@ -400,24 +415,20 @@ def from_dict( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} - Let's create a new dataframe of the same class as the dataframe we started with, from a dict of new data: + Let's create a new dataframe and specify the backend argument. - >>> def agnostic_from_dict(df_native: IntoFrameT) -> IntoFrameT: - ... new_data = {"c": [5, 2], "d": [1, 4]} - ... native_namespace = nw.get_native_namespace(df_native) - ... return nw.from_dict( - ... new_data, native_namespace=native_namespace - ... ).to_native() + >>> def agnostic_from_dict(backend: str) -> IntoFrameT: + ... data = {"c": [5, 2], "d": [1, 4]} + ... return nw.from_dict(data, backend=backend).to_native() Let's see what happens when passing pandas, Polars or PyArrow input: - >>> agnostic_from_dict(pd.DataFrame(data)) + >>> agnostic_from_dict(backend="pandas") c d 0 5 1 1 2 4 - >>> agnostic_from_dict(pl.DataFrame(data)) + >>> agnostic_from_dict(backend="polars") shape: (2, 2) ┌─────┬─────┐ │ c ┆ d │ @@ -427,7 +438,7 @@ def from_dict( │ 5 ┆ 1 │ │ 2 ┆ 4 │ └─────┴─────┘ - >>> agnostic_from_dict(pa.table(data)) + >>> agnostic_from_dict(backend="pyarrow") pyarrow.Table c: int64 d: int64 @@ -435,19 +446,22 @@ def from_dict( c: [[5,2]] d: [[1,4]] """ + backend = validate_native_namespace_and_backend( + backend, native_namespace, emit_deprecation_warning=True + ) return _from_dict_impl( data, schema, - native_namespace=native_namespace, + backend=backend, version=Version.MAIN, ) -def _from_dict_impl( +def _from_dict_impl( # noqa: PLR0915 data: dict[str, Any], schema: dict[str, DType] | Schema | None = None, *, - native_namespace: ModuleType | None = None, + backend: ModuleType | Implementation | str | None = None, version: Version, ) -> DataFrame[Any]: from narwhals.series import Series @@ -456,18 +470,31 @@ def _from_dict_impl( if not data: msg = "from_dict cannot be called with empty dictionary" raise 
ValueError(msg) - if native_namespace is None: + if backend is None: for val in data.values(): if isinstance(val, Series): native_namespace = val.__native_namespace__() break else: - msg = "Calling `from_dict` without `native_namespace` is only supported if all input values are already Narwhals Series" + msg = "Calling `from_dict` without `backend` is only supported if all input values are already Narwhals Series" raise TypeError(msg) data = {key: to_native(value, pass_through=True) for key, value in data.items()} - implementation = Implementation.from_native_namespace(native_namespace) + eager_backend = Implementation.from_native_namespace(native_namespace) + else: + eager_backend = Implementation.from_backend(backend) + native_namespace = eager_backend.to_native_namespace() - if implementation is Implementation.POLARS: + supported_eager_backends = ( + Implementation.POLARS, + Implementation.PANDAS, + Implementation.PYARROW, + Implementation.MODIN, + Implementation.CUDF, + ) + if eager_backend is not None and eager_backend not in supported_eager_backends: + msg = f"Unsupported `backend` value.\nExpected one of {supported_eager_backends} or None, got: {eager_backend}." 
+ raise ValueError(msg) + if eager_backend is Implementation.POLARS: if schema: from narwhals._polars.utils import ( narwhals_to_native_dtype as polars_narwhals_to_native_dtype, @@ -481,11 +508,11 @@ def _from_dict_impl( schema_pl = None native_frame = native_namespace.from_dict(data, schema=schema_pl) - elif implementation in { + elif eager_backend in ( Implementation.PANDAS, Implementation.MODIN, Implementation.CUDF, - }: + ): aligned_data = {} left_most_series = None for key, native_series in data.items(): @@ -515,8 +542,8 @@ def _from_dict_impl( schema = { name: pandas_like_narwhals_to_native_dtype( dtype=schema[name], - dtype_backend=get_dtype_backend(native_type, implementation), - implementation=implementation, + dtype_backend=get_dtype_backend(native_type, eager_backend), + implementation=eager_backend, backend_version=backend_version, version=version, ) @@ -524,7 +551,7 @@ def _from_dict_impl( } native_frame = native_frame.astype(schema) - elif implementation is Implementation.PYARROW: + elif eager_backend is Implementation.PYARROW: if schema: from narwhals._arrow.utils import ( narwhals_to_native_dtype as arrow_narwhals_to_native_dtype, diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index b018c90a0..b7a218fcc 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -78,6 +78,7 @@ from narwhals.utils import maybe_get_index from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index +from narwhals.utils import validate_native_namespace_and_backend from narwhals.utils import validate_strict_and_pass_though if TYPE_CHECKING: @@ -169,7 +170,6 @@ def __getitem__(self: Self, item: Any) -> Any: def lazy( self: Self, - *, backend: ModuleType | Implementation | str | None = None, ) -> LazyFrame[Any]: """Restrict available API methods to lazy-only ones. 
@@ -2162,6 +2162,7 @@ def from_dict( data: dict[str, Any], schema: dict[str, DType] | Schema | None = None, *, + backend: ModuleType | Implementation | str | None = None, native_namespace: ModuleType | None = None, ) -> DataFrame[Any]: """Instantiate DataFrame from dictionary. @@ -2176,17 +2177,33 @@ def from_dict( Arguments: data: Dictionary to create DataFrame from. schema: The DataFrame schema as Schema or dict of {name: type}. - native_namespace: The native library to use for DataFrame creation. Only + backend: specifies which eager backend to instantiate. Only necessary if inputs are not Narwhals Series. + `backend` can be specified in various ways: + + - As `Implementation.BACKEND` with `BACKEND` being `PANDAS`, `PYARROW`, + `POLARS`, `MODIN` or `CUDF`. + - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`. + - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`. + native_namespace: The native library to use for DataFrame creation. + + **Deprecated** (v1.26.0): + Please use `backend` instead. Note that `native_namespace` is still available + (and won't emit a deprecation warning) if you use `narwhals.stable.v1`, + see [perfect backwards compatibility policy](../backcompat.md/). + Returns: A new DataFrame. """ + backend = validate_native_namespace_and_backend( + backend, native_namespace, emit_deprecation_warning=False + ) return _stableify( # type: ignore[no-any-return] _from_dict_impl( data, schema, - native_namespace=native_namespace, + backend=backend, version=Version.V1, ) ) diff --git a/narwhals/utils.py b/narwhals/utils.py index c99e23062..aa3e00d91 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -162,16 +162,41 @@ def to_native_namespace(self: Self) -> ModuleType: Returns: Native module.
""" - mapping = { - Implementation.PANDAS: get_pandas(), - Implementation.MODIN: get_modin(), - Implementation.CUDF: get_cudf(), - Implementation.PYARROW: get_pyarrow(), - Implementation.PYSPARK: get_pyspark_sql(), - Implementation.POLARS: get_polars(), - Implementation.DASK: get_dask_dataframe(), - } - return mapping[self] # type: ignore[no-any-return] + if self is Implementation.PANDAS: + import pandas as pd # ignore-banned-import + + return pd # type: ignore[no-any-return] + if self is Implementation.MODIN: + import modin.pandas + + return modin.pandas # type: ignore[no-any-return] + if self is Implementation.CUDF: # pragma: no cover + import cudf # ignore-banned-import + + return cudf # type: ignore[no-any-return] + if self is Implementation.PYARROW: + import pyarrow as pa # ignore-banned-import + + return pa # type: ignore[no-any-return] + if self is Implementation.PYSPARK: # pragma: no cover + import pyspark.sql + + return pyspark.sql # type: ignore[no-any-return] + if self is Implementation.POLARS: + import polars as pl # ignore-banned-import + + return pl + if self is Implementation.DASK: + import dask.dataframe # ignore-banned-import + + return dask.dataframe # type: ignore[no-any-return] + + if self is Implementation.DUCKDB: + import duckdb # ignore-banned-import + + return duckdb # type: ignore[no-any-return] + msg = "Not supported Implementation" # pragma: no cover + raise AssertionError(msg) def is_pandas(self: Self) -> bool: """Return whether implementation is pandas. 
@@ -1042,6 +1067,27 @@ def validate_strict_and_pass_though( return pass_through +def validate_native_namespace_and_backend( + backend: ModuleType | Implementation | str | None = None, + native_namespace: ModuleType | None = None, + *, + emit_deprecation_warning: bool, +) -> ModuleType | Implementation | str | None: + if native_namespace is not None and backend is None: # pragma: no cover + if emit_deprecation_warning: + msg = ( + "`native_namespace` is deprecated, please use `backend` instead.\n\n" + "Note: `native_namespace` will remain available in `narwhals.stable.v1`.\n" + "See https://narwhals-dev.github.io/narwhals/backcompat/ for more information.\n" + ) + issue_deprecation_warning(msg, _version="1.25.1") + backend = native_namespace + elif native_namespace is not None and backend is not None: + msg = "Can't pass both `native_namespace` and `backend`" + raise ValueError(msg) + return backend + + def _validate_rolling_arguments( window_size: int, min_samples: int | None ) -> tuple[int, int]: diff --git a/tests/frame/lazy_test.py b/tests/frame/lazy_test.py index 64eabd95f..12229afba 100644 --- a/tests/frame/lazy_test.py +++ b/tests/frame/lazy_test.py @@ -56,12 +56,9 @@ def test_lazy_to_default(constructor_eager: ConstructorEager) -> None: ], ) def test_lazy_backend( - request: pytest.FixtureRequest, constructor_eager: ConstructorEager, backend: Implementation | str, ) -> None: - if "modin" in str(constructor_eager): - request.applymarker(pytest.mark.xfail) if (backend is Implementation.DASK) or backend == "dask": pytest.importorskip("dask") if (backend is Implementation.DUCKDB) or backend == "duckdb": diff --git a/tests/from_dict_test.py b/tests/from_dict_test.py index 0630cac43..dd62c2533 100644 --- a/tests/from_dict_test.py +++ b/tests/from_dict_test.py @@ -7,68 +7,138 @@ import narwhals as nw import narwhals.stable.v1 as nw_v1 +from narwhals.utils import Implementation from tests.utils import Constructor from tests.utils import assert_equal_data - -def
test_from_dict(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "dask" in str(constructor) or "pyspark" in str(constructor): - request.applymarker(pytest.mark.xfail) - df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) - native_namespace = nw.get_native_namespace(df) - result = nw.from_dict({"c": [1, 2], "d": [5, 6]}, native_namespace=native_namespace) +TEST_EAGER_BACKENDS = [ + Implementation.POLARS, + Implementation.PANDAS, + Implementation.PYARROW, + "polars", + "pandas", + "pyarrow", +] + + +@pytest.mark.parametrize( + "backend", + TEST_EAGER_BACKENDS, +) +def test_from_dict( + backend: Implementation | str, +) -> None: + result = nw.from_dict({"c": [1, 2], "d": [5, 6]}, backend=backend) expected = {"c": [1, 2], "d": [5, 6]} assert_equal_data(result, expected) assert isinstance(result, nw.DataFrame) +@pytest.mark.parametrize( + "backend", + TEST_EAGER_BACKENDS, +) def test_from_dict_schema( - constructor: Constructor, request: pytest.FixtureRequest + backend: Implementation | str, ) -> None: - if "dask" in str(constructor) or "pyspark" in str(constructor): - request.applymarker(pytest.mark.xfail) schema = {"c": nw_v1.Int16(), "d": nw_v1.Float32()} - df = nw_v1.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) - native_namespace = nw_v1.get_native_namespace(df) result = nw_v1.from_dict( {"c": [1, 2], "d": [5, 6]}, - native_namespace=native_namespace, + backend=backend, schema=schema, # type: ignore[arg-type] ) assert result.collect_schema() == schema -def test_from_dict_without_namespace(constructor: Constructor) -> None: - df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy().collect() +@pytest.mark.parametrize( + "backend", + [ + Implementation.POLARS, + "polars", + ], +) +def test_from_dict_without_backend( + constructor: Constructor, backend: Implementation | str +) -> None: + df = ( + nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) + .lazy() + .collect(backend=backend) + ) 
result = nw.from_dict({"c": df["a"], "d": df["b"]}) assert_equal_data(result, {"c": [1, 2, 3], "d": [4, 5, 6]}) -def test_from_dict_without_namespace_invalid( +def test_from_dict_without_backend_invalid( constructor: Constructor, ) -> None: df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy().collect() - with pytest.raises(TypeError, match="namespace"): + with pytest.raises(TypeError, match="backend"): nw.from_dict({"c": nw.to_native(df["a"]), "d": nw.to_native(df["b"])}) -def test_from_dict_one_native_one_narwhals( +def test_from_dict_with_backend_invalid() -> None: + pytest.importorskip("duckdb") + with pytest.raises(ValueError, match="Unsupported `backend` value"): + nw.from_dict({"c": [1, 2], "d": [5, 6]}, backend="duckdb") + + +def test_from_dict_both_backend_and_namespace( constructor: Constructor, ) -> None: - df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy().collect() + df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) + native_namespace = nw.get_native_namespace(df) + with pytest.raises(ValueError, match="Can't pass both"): + nw.from_dict( + {"c": [1, 2], "d": [5, 6]}, + backend="pandas", + native_namespace=native_namespace, + ) + + +def test_from_dict_both_backend_and_namespace_v1( + constructor: Constructor, +) -> None: + df = nw_v1.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) + native_namespace = nw_v1.get_native_namespace(df) + with pytest.raises(ValueError, match="Can't pass both"): + nw_v1.from_dict( + {"c": [1, 2], "d": [5, 6]}, + backend="pandas", + native_namespace=native_namespace, + ) + + +@pytest.mark.parametrize( + "backend", + [ + Implementation.POLARS, + "polars", + ], +) +def test_from_dict_one_native_one_narwhals( + constructor: Constructor, backend: Implementation | str +) -> None: + df = ( + nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) + .lazy() + .collect(backend=backend) + ) result = nw.from_dict({"c": nw.to_native(df["a"]), "d": df["b"]}) 
expected = {"c": [1, 2, 3], "d": [4, 5, 6]} assert_equal_data(result, expected) -def test_from_dict_v1(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "dask" in str(constructor) or "pyspark" in str(constructor): - request.applymarker(pytest.mark.xfail) - df = nw_v1.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) - native_namespace = nw_v1.get_native_namespace(df) +@pytest.mark.parametrize( + "backend", + TEST_EAGER_BACKENDS, +) +def test_from_dict_v1( + backend: Implementation | str, +) -> None: result = nw_v1.from_dict( {"c": [1, 2], "d": [datetime(2020, 1, 1), datetime(2020, 1, 2)]}, - native_namespace=native_namespace, + backend=backend, ) expected = {"c": [1, 2], "d": [datetime(2020, 1, 1), datetime(2020, 1, 2)]} assert_equal_data(result, expected) @@ -85,7 +155,7 @@ def test_alignment() -> None: # https://github.com/narwhals-dev/narwhals/issues/1474 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) result = nw.from_dict( - {"a": df["a"], "b": df["a"].sort_values(ascending=False)}, native_namespace=pd + {"a": df["a"], "b": df["a"].sort_values(ascending=False)}, backend=pd ).to_native() expected = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]}) pd.testing.assert_frame_equal(result, expected)