From 046bf408b0ddd5e1b5a7b0c709190495d73bf46a Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 5 Aug 2023 09:27:49 +0200 Subject: [PATCH] Move from_dataframe --- py-polars/polars/__init__.py | 2 +- py-polars/polars/convert.py | 80 +---------------- py-polars/polars/dataframe/frame.py | 24 ++--- py-polars/polars/interchange/__init__.py | 7 ++ .../polars/interchange/from_dataframe.py | 89 +++++++++++++++++++ 5 files changed, 103 insertions(+), 99 deletions(-) create mode 100644 py-polars/polars/interchange/from_dataframe.py diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index f45956ea02fee..ab9f40a026fc9 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -16,7 +16,6 @@ from polars.config import Config from polars.convert import ( from_arrow, - from_dataframe, from_dict, from_dicts, from_numpy, @@ -148,6 +147,7 @@ when, zeros, ) +from polars.interchange import from_dataframe from polars.io import ( read_avro, read_csv, diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py index 2c72dc94a7e1f..601e2a595ec80 100644 --- a/py-polars/polars/convert.py +++ b/py-polars/polars/convert.py @@ -15,12 +15,11 @@ Struct, Utf8, ) -from polars.dependencies import _PYARROW_AVAILABLE from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa from polars.exceptions import NoDataError from polars.io import read_csv -from polars.utils.various import _cast_repr_strings_with_schema, parse_version +from polars.utils.various import _cast_repr_strings_with_schema if TYPE_CHECKING: from polars import DataFrame, Series @@ -726,80 +725,3 @@ def from_pandas( ) else: raise ValueError(f"Expected pandas DataFrame or Series, got {type(data)}.") - - -def from_dataframe(df: Any, *, allow_copy: bool = True) -> DataFrame: - """ - Build a Polars DataFrame from any dataframe supporting the interchange protocol. - - Parameters - ---------- - df - Object supporting the dataframe interchange protocol, i.e. must have implemented - the ``__dataframe__`` method. - allow_copy - Allow memory to be copied to perform the conversion. If set to False, causes - conversions that are not zero-copy to fail. - - Notes - ----- - Details on the dataframe interchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - - Using a dedicated function like :func:`from_pandas` or :func:`from_arrow` is a more - efficient method of conversion. - - Polars currently relies on pyarrow's implementation of the dataframe interchange - protocol. Therefore, pyarrow>=11.0.0 is required for this function to work. - - Because Polars can not currently guarantee zero-copy conversion from Arrow for - categorical columns, ``allow_copy=False`` will not work if the dataframe contains - categorical data. - - """ - if isinstance(df, pl.DataFrame): - return df - if not hasattr(df, "__dataframe__"): - raise TypeError( - f"`df` of type {type(df)} does not support the dataframe interchange protocol." - ) - - pa_table = _df_to_pyarrow_table(df, allow_copy=allow_copy) - return from_arrow(pa_table, rechunk=allow_copy) # type: ignore[return-value] - - -def _df_to_pyarrow_table(df: Any, *, allow_copy: bool = False) -> pa.Table: - if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < parse_version("11"): - raise ImportError( - "pyarrow>=11.0.0 is required for converting a dataframe interchange object" - " to a Polars dataframe." - ) - - import pyarrow.interchange # noqa: F401 - - if not allow_copy: - return _df_to_pyarrow_table_zero_copy(df) - - return pa.interchange.from_dataframe(df, allow_copy=True) - - -def _df_to_pyarrow_table_zero_copy(df: Any) -> pa.Table: - dfi = df.__dataframe__(allow_copy=False) - if _dfi_contains_categorical_data(dfi): - raise TypeError( - "Polars can not currently guarantee zero-copy conversion from Arrow for " - " categorical columns. Set `allow_copy=True` or cast categorical columns to" - " string first." - ) - - if isinstance(df, pa.Table): - return df - elif isinstance(df, pa.RecordBatch): - return pa.Table.from_batches([df]) - else: - return pa.interchange.from_dataframe(dfi, allow_copy=False) - - -def _dfi_contains_categorical_data(dfi: Any) -> bool: - CATEGORICAL_DTYPE = 23 - return any(c.dtype[0] == CATEGORICAL_DTYPE for c in dfi.get_columns()) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 1cf4ade8aed50..c811bfb08b3b4 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -47,7 +47,6 @@ Time, Utf8, py_type_to_dtype, - unpack_dtypes, ) from polars.dependencies import ( _PYARROW_AVAILABLE, @@ -120,9 +119,7 @@ from xlsxwriter import Workbook from polars import Expr, LazyFrame, Series - from polars.internals.interchange.dataframe_protocol import ( - DataFrame as DataFrameXchg, - ) + from polars.internals.interchange import PolarsDataFrameXchg from polars.type_aliases import ( AsofJoinStrategy, AvroCompression, @@ -1207,7 +1204,7 @@ def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]: def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True - ) -> DataFrameXchg: + ) -> PolarsDataFrameXchg: """ Convert to a dataframe object implementing the dataframe interchange protocol. @@ -1240,20 +1237,9 @@ def __dataframe__( contains categorical data. """ - if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < parse_version( - "11" - ): - raise ImportError( - "pyarrow>=11.0.0 is required for converting a Polars dataframe to a" - " dataframe interchange object." - ) - if not allow_copy and Categorical in unpack_dtypes(*self.dtypes): - raise TypeError( - "Polars can not currently guarantee zero-copy conversion to Arrow for" - " categorical columns. Set `allow_copy=True` or cast categorical" - " columns to string first." - ) - return self.to_arrow().__dataframe__(nan_as_null, allow_copy) + from polars.internals.interchange.dataframe import PolarsDataFrameXchg + + return PolarsDataFrameXchg(self, nan_as_null, allow_copy) def __dataframe_consortium_standard__( self, *, api_version: str | None = None diff --git a/py-polars/polars/interchange/__init__.py b/py-polars/polars/interchange/__init__.py index e69de29bb2d1d..66eb80f27f9dd 100644 --- a/py-polars/polars/interchange/__init__.py +++ b/py-polars/polars/interchange/__init__.py @@ -0,0 +1,7 @@ +from polars.interchange.dataframe import PolarsDataFrameXchg +from polars.interchange.from_dataframe import from_dataframe + +__all__ = [ + "PolarsDataFrameXchg", + "from_dataframe", +] diff --git a/py-polars/polars/interchange/from_dataframe.py b/py-polars/polars/interchange/from_dataframe.py new file mode 100644 index 0000000000000..c8787d7f55734 --- /dev/null +++ b/py-polars/polars/interchange/from_dataframe.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import polars._reexport as pl +from polars.convert import from_arrow +from polars.dependencies import _PYARROW_AVAILABLE +from polars.dependencies import pyarrow as pa +from polars.utils.various import parse_version + +if TYPE_CHECKING: + from polars import DataFrame + + +def from_dataframe(df: Any, *, allow_copy: bool = True) -> DataFrame: + """ + Build a Polars DataFrame from any dataframe supporting the interchange protocol. + + Parameters + ---------- + df + Object supporting the dataframe interchange protocol, i.e. must have implemented + the ``__dataframe__`` method. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Using a dedicated function like :func:`from_pandas` or :func:`from_arrow` is a more + efficient method of conversion. + + Polars currently relies on pyarrow's implementation of the dataframe interchange + protocol. Therefore, pyarrow>=11.0.0 is required for this function to work. + + Because Polars can not currently guarantee zero-copy conversion from Arrow for + categorical columns, ``allow_copy=False`` will not work if the dataframe contains + categorical data. + + """ + if isinstance(df, pl.DataFrame): + return df + if not hasattr(df, "__dataframe__"): + raise TypeError( + f"`df` of type {type(df)} does not support the dataframe interchange protocol." + ) + + pa_table = _df_to_pyarrow_table(df, allow_copy=allow_copy) + return from_arrow(pa_table, rechunk=allow_copy) # type: ignore[return-value] + + +def _df_to_pyarrow_table(df: Any, *, allow_copy: bool = False) -> pa.Table: + if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < parse_version("11"): + raise ImportError( + "pyarrow>=11.0.0 is required for converting a dataframe interchange object" + " to a Polars dataframe." + ) + + import pyarrow.interchange # noqa: F401 + + if not allow_copy: + return _df_to_pyarrow_table_zero_copy(df) + + return pa.interchange.from_dataframe(df, allow_copy=True) + + +def _df_to_pyarrow_table_zero_copy(df: Any) -> pa.Table: + dfi = df.__dataframe__(allow_copy=False) + if _dfi_contains_categorical_data(dfi): + raise TypeError( + "Polars can not currently guarantee zero-copy conversion from Arrow for " + " categorical columns. Set `allow_copy=True` or cast categorical columns to" + " string first." + ) + + if isinstance(df, pa.Table): + return df + elif isinstance(df, pa.RecordBatch): + return pa.Table.from_batches([df]) + else: + return pa.interchange.from_dataframe(dfi, allow_copy=False) + + +def _dfi_contains_categorical_data(dfi: Any) -> bool: + CATEGORICAL_DTYPE = 23 + return any(c.dtype[0] == CATEGORICAL_DTYPE for c in dfi.get_columns())