Skip to content

Commit

Permalink
Move from_dataframe
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego committed Aug 5, 2023
1 parent a111410 commit 046bf40
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 99 deletions.
2 changes: 1 addition & 1 deletion py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from polars.config import Config
from polars.convert import (
from_arrow,
from_dataframe,
from_dict,
from_dicts,
from_numpy,
Expand Down Expand Up @@ -148,6 +147,7 @@
when,
zeros,
)
from polars.interchange import from_dataframe
from polars.io import (
read_avro,
read_csv,
Expand Down
80 changes: 1 addition & 79 deletions py-polars/polars/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@
Struct,
Utf8,
)
from polars.dependencies import _PYARROW_AVAILABLE
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
from polars.exceptions import NoDataError
from polars.io import read_csv
from polars.utils.various import _cast_repr_strings_with_schema, parse_version
from polars.utils.various import _cast_repr_strings_with_schema

if TYPE_CHECKING:
from polars import DataFrame, Series
Expand Down Expand Up @@ -726,80 +725,3 @@ def from_pandas(
)
else:
raise ValueError(f"Expected pandas DataFrame or Series, got {type(data)}.")


def from_dataframe(df: Any, *, allow_copy: bool = True) -> DataFrame:
    """
    Build a Polars DataFrame from any dataframe supporting the interchange protocol.

    Parameters
    ----------
    df
        Object supporting the dataframe interchange protocol, i.e. must have implemented
        the ``__dataframe__`` method.
    allow_copy
        Allow memory to be copied to perform the conversion. If set to False, causes
        conversions that are not zero-copy to fail.

    Notes
    -----
    Details on the dataframe interchange protocol:
    https://data-apis.org/dataframe-protocol/latest/index.html

    Using a dedicated function like :func:`from_pandas` or :func:`from_arrow` is a more
    efficient method of conversion.

    Polars currently relies on pyarrow's implementation of the dataframe interchange
    protocol. Therefore, pyarrow>=11.0.0 is required for this function to work.

    Because Polars can not currently guarantee zero-copy conversion from Arrow for
    categorical columns, ``allow_copy=False`` will not work if the dataframe contains
    categorical data.
    """
    # A Polars DataFrame already satisfies the target type: return it unchanged.
    if isinstance(df, pl.DataFrame):
        return df
    if not hasattr(df, "__dataframe__"):
        raise TypeError(
            f"`df` of type {type(df)} does not support the dataframe interchange protocol."
        )

    # Round-trip through pyarrow.
    # NOTE(review): rechunk is tied to allow_copy — presumably because rechunking
    # may itself copy buffers; confirm before changing.
    pa_table = _df_to_pyarrow_table(df, allow_copy=allow_copy)
    return from_arrow(pa_table, rechunk=allow_copy)  # type: ignore[return-value]


def _df_to_pyarrow_table(df: Any, *, allow_copy: bool = False) -> pa.Table:
    """Convert a dataframe interchange object to a pyarrow Table.

    Raises ImportError when pyarrow is missing or older than 11.0.0.
    With ``allow_copy=False``, delegates to the strict zero-copy path.
    """
    if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < parse_version("11"):
        raise ImportError(
            "pyarrow>=11.0.0 is required for converting a dataframe interchange object"
            " to a Polars dataframe."
        )

    # Importing the submodule makes `pa.interchange` resolvable; the module
    # name itself is intentionally unused.
    import pyarrow.interchange  # noqa: F401

    if not allow_copy:
        # Strict path: raises rather than silently copying.
        return _df_to_pyarrow_table_zero_copy(df)

    return pa.interchange.from_dataframe(df, allow_copy=True)


def _df_to_pyarrow_table_zero_copy(df: Any) -> pa.Table:
    """Convert a dataframe interchange object to a pyarrow Table without copying.

    Raises TypeError when the dataframe contains categorical columns, because
    zero-copy conversion cannot be guaranteed for them (see `from_dataframe`).
    """
    dfi = df.__dataframe__(allow_copy=False)
    if _dfi_contains_categorical_data(dfi):
        # Fixed: the original message concatenated "...for " + " categorical",
        # producing a double space in the rendered error text.
        raise TypeError(
            "Polars can not currently guarantee zero-copy conversion from Arrow for"
            " categorical columns. Set `allow_copy=True` or cast categorical columns to"
            " string first."
        )

    if isinstance(df, pa.Table):
        # Already a pyarrow Table — nothing to do.
        return df
    elif isinstance(df, pa.RecordBatch):
        # Wrapping an existing batch in a Table does not copy buffers.
        return pa.Table.from_batches([df])
    else:
        return pa.interchange.from_dataframe(dfi, allow_copy=False)


def _dfi_contains_categorical_data(dfi: Any) -> bool:
CATEGORICAL_DTYPE = 23
return any(c.dtype[0] == CATEGORICAL_DTYPE for c in dfi.get_columns())
24 changes: 5 additions & 19 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
Time,
Utf8,
py_type_to_dtype,
unpack_dtypes,
)
from polars.dependencies import (
_PYARROW_AVAILABLE,
Expand Down Expand Up @@ -120,9 +119,7 @@
from xlsxwriter import Workbook

from polars import Expr, LazyFrame, Series
from polars.internals.interchange.dataframe_protocol import (
DataFrame as DataFrameXchg,
)
from polars.internals.interchange import PolarsDataFrameXchg
from polars.type_aliases import (
AsofJoinStrategy,
AvroCompression,
Expand Down Expand Up @@ -1207,7 +1204,7 @@ def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]:

def __dataframe__(
self, nan_as_null: bool = False, allow_copy: bool = True
) -> DataFrameXchg:
) -> PolarsDataFrameXchg:
"""
Convert to a dataframe object implementing the dataframe interchange protocol.
Expand Down Expand Up @@ -1240,20 +1237,9 @@ def __dataframe__(
contains categorical data.
"""
if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < parse_version(
"11"
):
raise ImportError(
"pyarrow>=11.0.0 is required for converting a Polars dataframe to a"
" dataframe interchange object."
)
if not allow_copy and Categorical in unpack_dtypes(*self.dtypes):
raise TypeError(
"Polars can not currently guarantee zero-copy conversion to Arrow for"
" categorical columns. Set `allow_copy=True` or cast categorical"
" columns to string first."
)
return self.to_arrow().__dataframe__(nan_as_null, allow_copy)
from polars.internals.interchange.dataframe import PolarsDataFrameXchg

return PolarsDataFrameXchg(self, nan_as_null, allow_copy)

def __dataframe_consortium_standard__(
self, *, api_version: str | None = None
Expand Down
7 changes: 7 additions & 0 deletions py-polars/polars/interchange/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from polars.interchange.dataframe import PolarsDataFrameXchg
from polars.interchange.from_dataframe import from_dataframe

__all__ = [
"PolarsDataFrameXchg",
"from_dataframe",
]
89 changes: 89 additions & 0 deletions py-polars/polars/interchange/from_dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

import polars._reexport as pl
from polars.convert import from_arrow
from polars.dependencies import _PYARROW_AVAILABLE
from polars.dependencies import pyarrow as pa
from polars.utils.various import parse_version

if TYPE_CHECKING:
from polars import DataFrame


def from_dataframe(df: Any, *, allow_copy: bool = True) -> DataFrame:
    """
    Build a Polars DataFrame from any dataframe supporting the interchange protocol.

    Parameters
    ----------
    df
        Object supporting the dataframe interchange protocol, i.e. must have implemented
        the ``__dataframe__`` method.
    allow_copy
        Allow memory to be copied to perform the conversion. If set to False, causes
        conversions that are not zero-copy to fail.

    Notes
    -----
    Details on the dataframe interchange protocol:
    https://data-apis.org/dataframe-protocol/latest/index.html

    Using a dedicated function like :func:`from_pandas` or :func:`from_arrow` is a more
    efficient method of conversion.

    Polars currently relies on pyarrow's implementation of the dataframe interchange
    protocol. Therefore, pyarrow>=11.0.0 is required for this function to work.

    Because Polars can not currently guarantee zero-copy conversion from Arrow for
    categorical columns, ``allow_copy=False`` will not work if the dataframe contains
    categorical data.
    """
    # Short-circuit: a Polars DataFrame needs no conversion at all.
    if isinstance(df, pl.DataFrame):
        return df

    # Anything else must expose the interchange protocol entry point.
    if not hasattr(df, "__dataframe__"):
        raise TypeError(
            f"`df` of type {type(df)} does not support the dataframe interchange protocol."
        )

    arrow_table = _df_to_pyarrow_table(df, allow_copy=allow_copy)
    return from_arrow(arrow_table, rechunk=allow_copy)  # type: ignore[return-value]


def _df_to_pyarrow_table(df: Any, *, allow_copy: bool = False) -> pa.Table:
    """Convert a dataframe interchange object to a pyarrow Table.

    Raises ImportError when pyarrow is missing or older than 11.0.0.
    With ``allow_copy=False``, delegates to the strict zero-copy path.
    """
    # `pa.interchange` requires pyarrow>=11.0.0; `_PYARROW_AVAILABLE` is
    # checked first so `pa.__version__` is never touched when pyarrow is absent.
    pyarrow_ok = _PYARROW_AVAILABLE and parse_version(pa.__version__) >= parse_version(
        "11"
    )
    if not pyarrow_ok:
        raise ImportError(
            "pyarrow>=11.0.0 is required for converting a dataframe interchange object"
            " to a Polars dataframe."
        )

    # Importing the submodule makes `pa.interchange` resolvable; the imported
    # name itself is intentionally unused.
    import pyarrow.interchange  # noqa: F401

    if allow_copy:
        return pa.interchange.from_dataframe(df, allow_copy=True)
    return _df_to_pyarrow_table_zero_copy(df)


def _df_to_pyarrow_table_zero_copy(df: Any) -> pa.Table:
    """Convert a dataframe interchange object to a pyarrow Table without copying.

    Raises TypeError when the dataframe contains categorical columns, because
    zero-copy conversion cannot be guaranteed for them (see `from_dataframe`).
    """
    dfi = df.__dataframe__(allow_copy=False)
    if _dfi_contains_categorical_data(dfi):
        # Fixed: the original message concatenated "...for " + " categorical",
        # producing a double space in the rendered error text.
        raise TypeError(
            "Polars can not currently guarantee zero-copy conversion from Arrow for"
            " categorical columns. Set `allow_copy=True` or cast categorical columns to"
            " string first."
        )

    if isinstance(df, pa.Table):
        # Already a pyarrow Table — nothing to do.
        return df
    elif isinstance(df, pa.RecordBatch):
        # Wrapping an existing batch in a Table does not copy buffers.
        return pa.Table.from_batches([df])
    else:
        return pa.interchange.from_dataframe(dfi, allow_copy=False)


def _dfi_contains_categorical_data(dfi: Any) -> bool:
CATEGORICAL_DTYPE = 23
return any(c.dtype[0] == CATEGORICAL_DTYPE for c in dfi.get_columns())

0 comments on commit 046bf40

Please sign in to comment.