-
-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
103 additions
and
99 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from polars.interchange.dataframe import PolarsDataFrameXchg | ||
from polars.interchange.from_dataframe import from_dataframe | ||
|
||
# Names exported by ``from polars.interchange import *``.
__all__ = ["PolarsDataFrameXchg", "from_dataframe"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING, Any | ||
|
||
import polars._reexport as pl | ||
from polars.convert import from_arrow | ||
from polars.dependencies import _PYARROW_AVAILABLE | ||
from polars.dependencies import pyarrow as pa | ||
from polars.utils.various import parse_version | ||
|
||
if TYPE_CHECKING: | ||
from polars import DataFrame | ||
|
||
|
||
def from_dataframe(df: Any, *, allow_copy: bool = True) -> DataFrame:
    """
    Convert any object implementing the dataframe interchange protocol to Polars.

    Parameters
    ----------
    df
        Object exposing a ``__dataframe__`` method, i.e. one that supports the
        dataframe interchange protocol. A Polars DataFrame is returned unchanged.
    allow_copy
        Whether memory may be copied during the conversion. When False, a
        conversion that cannot be done zero-copy fails instead.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If ``df`` has no ``__dataframe__`` attribute.

    Notes
    -----
    Details on the dataframe interchange protocol:
    https://data-apis.org/dataframe-protocol/latest/index.html

    A dedicated function such as :func:`from_pandas` or :func:`from_arrow` is a
    more efficient way to convert.

    The conversion goes through pyarrow's implementation of the interchange
    protocol, so pyarrow>=11.0.0 is required.

    Zero-copy conversion from Arrow is currently not guaranteed for categorical
    columns, so ``allow_copy=False`` does not work when the dataframe contains
    categorical data.
    """
    # Already a Polars frame: hand it back untouched.
    if isinstance(df, pl.DataFrame):
        return df

    supports_protocol = hasattr(df, "__dataframe__")
    if not supports_protocol:
        message = (
            f"`df` of type {type(df)} does not support the dataframe interchange protocol."
        )
        raise TypeError(message)

    arrow_table = _df_to_pyarrow_table(df, allow_copy=allow_copy)
    # Rechunking is only requested when copying is permitted.
    return from_arrow(arrow_table, rechunk=allow_copy)  # type: ignore[return-value]
|
||
|
||
def _df_to_pyarrow_table(df: Any, *, allow_copy: bool = False) -> pa.Table:
    """
    Turn an interchange-capable object into a pyarrow Table.

    Parameters
    ----------
    df
        Object to convert; forwarded to pyarrow or to the zero-copy helper.
    allow_copy
        When truthy, delegate to ``pa.interchange.from_dataframe`` with copying
        allowed; otherwise use ``_df_to_pyarrow_table_zero_copy``.

    Raises
    ------
    ImportError
        If pyarrow is not available or its version is lower than 11.
    """
    # Only inspect the version when pyarrow is actually importable.
    if _PYARROW_AVAILABLE:
        unusable = parse_version(pa.__version__) < parse_version("11")
    else:
        unusable = True

    if unusable:
        raise ImportError(
            "pyarrow>=11.0.0 is required for converting a dataframe interchange object"
            " to a Polars dataframe."
        )

    # Imported for its side effect of making ``pa.interchange`` resolvable.
    import pyarrow.interchange  # noqa: F401

    if allow_copy:
        return pa.interchange.from_dataframe(df, allow_copy=True)
    return _df_to_pyarrow_table_zero_copy(df)
|
||
|
||
def _df_to_pyarrow_table_zero_copy(df: Any) -> pa.Table:
    """
    Convert an interchange-capable object to a pyarrow Table without copying.

    Parameters
    ----------
    df
        Object implementing ``__dataframe__``. A ``pa.Table`` is returned as is
        and a ``pa.RecordBatch`` is wrapped with ``pa.Table.from_batches``.

    Returns
    -------
    pyarrow.Table

    Raises
    ------
    TypeError
        If the interchange object reports at least one categorical column.
    """
    # Request the interchange object up front so the categorical check runs for
    # every input, including pyarrow tables and record batches.
    dfi = df.__dataframe__(allow_copy=False)
    if _dfi_contains_categorical_data(dfi):
        # The message pieces are joined with exactly one space between words.
        raise TypeError(
            "Polars can not currently guarantee zero-copy conversion from Arrow for"
            " categorical columns. Set `allow_copy=True` or cast categorical columns to"
            " string first."
        )

    if isinstance(df, pa.Table):
        return df
    if isinstance(df, pa.RecordBatch):
        return pa.Table.from_batches([df])
    # NOTE: relies on ``pyarrow.interchange`` having been imported by the caller.
    return pa.interchange.from_dataframe(dfi, allow_copy=False)
|
||
|
||
def _dfi_contains_categorical_data(dfi: Any) -> bool:
    """
    Report whether any column of an interchange dataframe is categorical.

    A column counts as categorical when the first element of its ``dtype``
    equals 23, the kind code this module treats as categorical.
    """
    categorical_kind = 23
    for column in dfi.get_columns():
        kind = column.dtype[0]
        if kind == categorical_kind:
            return True
    return False