diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py
index ab9f40a026fc9..fdd063c7cf95d 100644
--- a/py-polars/polars/__init__.py
+++ b/py-polars/polars/__init__.py
@@ -147,7 +147,7 @@
     when,
     zeros,
 )
-from polars.interchange import from_dataframe
+from polars.interchange.from_dataframe import from_dataframe
 from polars.io import (
     read_avro,
     read_csv,
diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index c811bfb08b3b4..b56223be0b192 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -119,7 +119,7 @@
     from xlsxwriter import Workbook

     from polars import Expr, LazyFrame, Series
-    from polars.internals.interchange import PolarsDataFrameXchg
+    from polars.interchange.dataframe import PolarsDataFrame
     from polars.type_aliases import (
         AsofJoinStrategy,
         AvroCompression,
@@ -1204,7 +1204,7 @@ def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]:

     def __dataframe__(
         self, nan_as_null: bool = False, allow_copy: bool = True
-    ) -> PolarsDataFrameXchg:
+    ) -> PolarsDataFrame:
         """
         Convert to a dataframe object implementing the dataframe interchange
         protocol.
@@ -1237,9 +1237,9 @@ def __dataframe__(
             contains categorical data.

         """
-        from polars.internals.interchange.dataframe import PolarsDataFrameXchg
+        from polars.interchange.dataframe import PolarsDataFrame

-        return PolarsDataFrameXchg(self, nan_as_null, allow_copy)
+        return PolarsDataFrame(self, nan_as_null, allow_copy)

     def __dataframe_consortium_standard__(
         self, *, api_version: str | None = None
diff --git a/py-polars/polars/interchange/__init__.py b/py-polars/polars/interchange/__init__.py
index 66eb80f27f9dd..e69de29bb2d1d 100644
--- a/py-polars/polars/interchange/__init__.py
+++ b/py-polars/polars/interchange/__init__.py
@@ -1,7 +0,0 @@
-from polars.interchange.dataframe import PolarsDataFrameXchg
-from polars.interchange.from_dataframe import from_dataframe
-
-__all__ = [
-    "PolarsDataFrameXchg",
-    "from_dataframe",
-]
diff --git a/py-polars/polars/interchange/buffer.py b/py-polars/polars/interchange/buffer.py
index bb0688cbf9342..6059bdbc9ad8d 100644
--- a/py-polars/polars/interchange/buffer.py
+++ b/py-polars/polars/interchange/buffer.py
@@ -2,21 +2,23 @@

 from typing import TYPE_CHECKING

-from polars.internals.interchange.dataframe_protocol import (
+from polars.interchange.dataframe_protocol import (
     Buffer,
     DlpackDeviceType,
     DtypeKind,
 )
-from polars.internals.interchange.utils import polars_dtype_to_dtype
+from polars.interchange.utils import polars_dtype_to_dtype

 if TYPE_CHECKING:
-    import polars as pl
+    from typing import NoReturn
+
+    from polars import Series


 class PolarsBuffer(Buffer):
-    """A buffer represented by a Polars Series consisting of a single chunk."""
+    """A buffer backed by a Polars Series consisting of a single chunk."""

-    def __init__(self, data: pl.Series, allow_copy: bool = True) -> None:
+    def __init__(self, data: Series, allow_copy: bool = True):
         if data.n_chunks() > 1:
             if allow_copy:
                 data = data.rechunk()
@@ -34,7 +36,8 @@ def bufsize(self) -> int:
         bytes_per_element = dtype[1] // 8

         if dtype[0] == DtypeKind.STRING:
-            return self._data.str.lengths().sum() * bytes_per_element
+            n_bytes: int = self._data.str.lengths().sum()  # type: ignore[assignment]
+            return n_bytes * bytes_per_element
         else:
             return len(self._data) * bytes_per_element
@@ -43,11 +46,11 @@ def ptr(self) -> int:
         """Pointer to start of the buffer as an integer."""
         return self._data._s.get_ptr()

-    def __dlpack__(self):
+    def __dlpack__(self) -> NoReturn:
         """Represent this structure as DLPack interface."""
         raise NotImplementedError("__dlpack__")

-    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
+    def __dlpack_device__(self) -> tuple[DlpackDeviceType, None]:
         """Device type and device ID for where the data in the buffer resides."""
         return (DlpackDeviceType.CPU, None)
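
[Reviewer note -- not part of the patch] A quick sketch of the chunk handling that `PolarsBuffer.__init__` above implements: a multi-chunk Series is rechunked (copied) when `allow_copy=True` and rejected otherwise (the exact exception type is not visible in this hunk). The import path follows this diff's new module layout:

    import polars as pl
    from polars.interchange.buffer import PolarsBuffer

    # Two chunks: rechunk() copies them into one contiguous buffer.
    data = pl.concat([pl.Series([1, 2]), pl.Series([3, 4])], rechunk=False)
    buffer = PolarsBuffer(data, allow_copy=True)
    assert buffer._data.n_chunks() == 1  # same check as test_init below

    # With allow_copy=False the same input raises, since a single
    # contiguous buffer cannot be produced without copying.
    # PolarsBuffer(data, allow_copy=False)
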
- """ + """ # noqa: D205 buffer = PolarsBuffer(self._col.is_not_null(), self._allow_copy) dtype = polars_dtype_to_dtype(pl.Boolean) return buffer, dtype @@ -150,11 +166,10 @@ def _get_offsets_buffer(self) -> tuple[PolarsBuffer, Dtype]: data (e.g., variable-length strings) and the buffer's associated dtype. Raises NoBufferPresent if the data buffer does not have an associated offsets buffer. - """ + """ # noqa: D205 if self.dtype[0] != DtypeKind.STRING: raise TypeError( - "This column has a fixed-length dtype so " - "it does not have an offsets buffer" + "This column has a fixed-length dtype so it does not have an offsets buffer" ) offsets = ( @@ -162,7 +177,7 @@ def _get_offsets_buffer(self) -> tuple[PolarsBuffer, Dtype]: .fill_null(0) .cumsum() .extend_constant(None, 1) - .shift_and_fill(1, 0) + .shift_and_fill(0) .rechunk() ) diff --git a/py-polars/polars/interchange/dataframe.py b/py-polars/polars/interchange/dataframe.py index 369439f2d83cf..05dcffbfa48bf 100644 --- a/py-polars/polars/interchange/dataframe.py +++ b/py-polars/polars/interchange/dataframe.py @@ -4,21 +4,21 @@ from itertools import accumulate from typing import TYPE_CHECKING -from polars.internals.interchange.column import PolarsColumn -from polars.internals.interchange.dataframe_protocol import DataFrame as DataFrameXchg +from polars.interchange.column import PolarsColumn +from polars.interchange.dataframe_protocol import DataFrame as DataFrameObject if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Iterator from typing import Any - import polars as pl + from polars import DataFrame -class PolarsDataFrameXchg(DataFrameXchg): - """A dataframe represented by a Polars DataFrame.""" +class PolarsDataFrame(DataFrameObject): + """A dataframe object backed by a Polars DataFrame.""" def __init__( - self, df: pl.DataFrame, nan_as_null: bool = False, allow_copy: bool = True + self, df: DataFrame, nan_as_null: bool = False, allow_copy: bool = True ): self._df = df self._nan_as_null = nan_as_null # Has no effect for now @@ -26,18 +26,22 @@ def __init__( def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True - ) -> PolarsDataFrameXchg: - return PolarsDataFrameXchg(self._df, nan_as_null, allow_copy) + ) -> PolarsDataFrame: + """Construct a new exchange object, potentially changing the parameters.""" + return PolarsDataFrame(self._df, nan_as_null=nan_as_null, allow_copy=allow_copy) @property def metadata(self) -> dict[str, Any]: + """The metadata for the dataframe.""" return {} def num_columns(self) -> int: - return self._df.shape[1] + """Return the number of columns in the DataFrame.""" + return self._df.width def num_rows(self) -> int: - return self._df.shape[0] + """Return the number of rows in the DataFrame.""" + return self._df.height def num_chunks(self) -> int: """ @@ -49,50 +53,62 @@ def num_chunks(self) -> int: See Also -------- - polars.internals.dataframe.frame.DataFrame.n_chunks. 
diff --git a/py-polars/polars/interchange/dataframe.py b/py-polars/polars/interchange/dataframe.py
index 369439f2d83cf..05dcffbfa48bf 100644
--- a/py-polars/polars/interchange/dataframe.py
+++ b/py-polars/polars/interchange/dataframe.py
@@ -4,21 +4,21 @@
 from itertools import accumulate
 from typing import TYPE_CHECKING

-from polars.internals.interchange.column import PolarsColumn
-from polars.internals.interchange.dataframe_protocol import DataFrame as DataFrameXchg
+from polars.interchange.column import PolarsColumn
+from polars.interchange.dataframe_protocol import DataFrame as DataFrameObject

 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Iterator
     from typing import Any

-    import polars as pl
+    from polars import DataFrame


-class PolarsDataFrameXchg(DataFrameXchg):
-    """A dataframe represented by a Polars DataFrame."""
+class PolarsDataFrame(DataFrameObject):
+    """A dataframe object backed by a Polars DataFrame."""

     def __init__(
-        self, df: pl.DataFrame, nan_as_null: bool = False, allow_copy: bool = True
+        self, df: DataFrame, nan_as_null: bool = False, allow_copy: bool = True
     ):
         self._df = df
         self._nan_as_null = nan_as_null  # Has no effect for now
@@ -26,18 +26,22 @@ def __init__(

     def __dataframe__(
         self, nan_as_null: bool = False, allow_copy: bool = True
-    ) -> PolarsDataFrameXchg:
-        return PolarsDataFrameXchg(self._df, nan_as_null, allow_copy)
+    ) -> PolarsDataFrame:
+        """Construct a new exchange object, potentially changing the parameters."""
+        return PolarsDataFrame(self._df, nan_as_null=nan_as_null, allow_copy=allow_copy)

     @property
     def metadata(self) -> dict[str, Any]:
+        """The metadata for the dataframe."""
         return {}

     def num_columns(self) -> int:
-        return self._df.shape[1]
+        """Return the number of columns in the DataFrame."""
+        return self._df.width

     def num_rows(self) -> int:
-        return self._df.shape[0]
+        """Return the number of rows in the DataFrame."""
+        return self._df.height

     def num_chunks(self) -> int:
         """
@@ -49,50 +53,62 @@ def num_chunks(self) -> int:

         See Also
         --------
-        polars.internals.dataframe.frame.DataFrame.n_chunks.
+        polars.dataframe.frame.DataFrame.n_chunks
+
         """
-        return self._df.n_chunks()
+        return self._df.n_chunks("first")

     def column_names(self) -> list[str]:
+        """Return the column names."""
         return self._df.columns

     def get_column(self, i: int) -> PolarsColumn:
+        """Return the column at the indicated position."""
         return PolarsColumn(self._df.to_series(i), allow_copy=self._allow_copy)

     def get_column_by_name(self, name: str) -> PolarsColumn:
+        """Return the column whose name is the indicated name."""
         return PolarsColumn(self._df[name], allow_copy=self._allow_copy)

-    def get_columns(self) -> list[PolarsColumn]:
-        return [
-            PolarsColumn(column, allow_copy=self._allow_copy)
-            for column in self._df.get_columns()
-        ]
+    def get_columns(self) -> Iterable[PolarsColumn]:
+        """Return an iterator yielding the columns."""
+        for column in self._df.get_columns():
+            yield PolarsColumn(column, allow_copy=self._allow_copy)

-    def select_columns(self, indices: Sequence[int]) -> PolarsDataFrameXchg:
+    def select_columns(self, indices: Sequence[int]) -> PolarsDataFrame:
+        """Create a new DataFrame by selecting a subset of columns by index."""
         if not isinstance(indices, Sequence):
-            raise ValueError("`indices` is not a sequence")
+            raise TypeError("`indices` is not a sequence")
         if not isinstance(indices, list):
            indices = list(indices)

-        return PolarsDataFrameXchg(
-            self._df[:, indices], self._nan_as_null, self._allow_copy
+        return PolarsDataFrame(
+            self._df[:, indices],
+            nan_as_null=self._nan_as_null,
+            allow_copy=self._allow_copy,
         )

-    def select_columns_by_name(self, names: Sequence[str]) -> PolarsDataFrameXchg:
+    def select_columns_by_name(self, names: Sequence[str]) -> PolarsDataFrame:
+        """Create a new DataFrame by selecting a subset of columns by name."""
         if not isinstance(names, Sequence):
             raise ValueError("`names` is not a sequence")

-        return PolarsDataFrameXchg(
-            self._df.select(names), self._nan_as_null, self._allow_copy
+        return PolarsDataFrame(
+            self._df.select(names),
+            nan_as_null=self._nan_as_null,
+            allow_copy=self._allow_copy,
         )

-    def get_chunks(self, n_chunks: int | None = None) -> Iterable[PolarsDataFrameXchg]:
+    def get_chunks(self, n_chunks: int | None = None) -> Iterator[PolarsDataFrame]:
+        """Return an iterator yielding the chunks."""
         total_n_chunks = self.num_chunks()
         chunks = self._get_chunks_from_col_chunks()

         if (n_chunks is None) or (n_chunks == total_n_chunks):
             for chunk in chunks:
-                yield PolarsDataFrameXchg(chunk, self._allow_copy)
+                yield PolarsDataFrame(
+                    chunk, nan_as_null=self._nan_as_null, allow_copy=self._allow_copy
+                )

         elif (n_chunks <= 0) or (n_chunks % total_n_chunks != 0):
             raise ValueError(
@@ -108,18 +124,21 @@ def get_chunks(self, n_chunks: int | None = None) -> Iterable[PolarsDataFrameXchg]:
             if size % subchunks_per_chunk != 0:
                 step += 1
             for start in range(0, step * subchunks_per_chunk, step):
-                yield PolarsDataFrameXchg(
-                    chunk[start : start + step, :], self._allow_copy
+                yield PolarsDataFrame(
+                    chunk[start : start + step, :],
+                    nan_as_null=self._nan_as_null,
+                    allow_copy=self._allow_copy,
                 )

-    def _get_chunks_from_col_chunks(self) -> Iterable[pl.DataFrame]:
+    def _get_chunks_from_col_chunks(self) -> Iterator[DataFrame]:
         """
         Return chunks of this dataframe according to the chunks of the first column.
+
         If columns are not all chunked identically, they will be rechunked like the
         first column. If copy is not allowed, raises RuntimeError.
""" col_chunks = self.get_column(0).get_chunks() - chunk_sizes = [len(chunk) for chunk in col_chunks] + chunk_sizes = [chunk.size() for chunk in col_chunks] starts = [0] + list(accumulate(chunk_sizes)) for i in range(len(starts) - 1): diff --git a/py-polars/polars/interchange/dataframe_protocol.py b/py-polars/polars/interchange/dataframe_protocol.py index 0606573865843..24e265cce6b3a 100644 --- a/py-polars/polars/interchange/dataframe_protocol.py +++ b/py-polars/polars/interchange/dataframe_protocol.py @@ -2,12 +2,22 @@ """ A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api """ - from __future__ import annotations +from abc import ( + ABC, + abstractmethod, +) import enum -from abc import ABC, abstractmethod -from typing import Any, Iterable, Sequence, Tuple, TypedDict +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypedDict, +) class DlpackDeviceType(enum.IntEnum): @@ -26,6 +36,7 @@ class DlpackDeviceType(enum.IntEnum): class DtypeKind(enum.IntEnum): """ Integer enum for data types. + Attributes ---------- INT : int @@ -59,6 +70,7 @@ class DtypeKind(enum.IntEnum): class ColumnNullType(enum.IntEnum): """ Integer enum for null type representation. + Attributes ---------- NON_NULLABLE : int @@ -83,18 +95,18 @@ class ColumnNullType(enum.IntEnum): class ColumnBuffers(TypedDict): # first element is a buffer containing the column data; # second element is the data buffer's associated dtype - data: tuple[Buffer, Dtype] + data: Tuple["Buffer", Dtype] # first element is a buffer containing mask values indicating missing data; # second element is the mask value buffer's associated dtype. # None if the null representation is not a bit or byte mask - validity: tuple[Buffer, Dtype] | None + validity: Optional[Tuple["Buffer", Dtype]] # first element is a buffer containing the offset values for # variable-size binary data (e.g., variable-length strings); # second element is the offsets buffer's associated dtype. # None if the data buffer does not have an associated offsets buffer - offsets: tuple[Buffer, Dtype] | None + offsets: Optional[Tuple["Buffer", Dtype]] class CategoricalDescription(TypedDict): @@ -104,17 +116,19 @@ class CategoricalDescription(TypedDict): is_dictionary: bool # Python-level only (e.g. ``{int: str}``). # None if not a dictionary-style categorical. - categories: Column | None + categories: Optional[Column] class Buffer(ABC): """ Data in the buffer is guaranteed to be contiguous in memory. + Note that there is no dtype attribute present, a buffer can be thought of as simply a block of memory. However, if the column that the buffer is attached to has a dtype that's supported by DLPack and ``__dlpack__`` is implemented, then that dtype information will be contained in the return value from ``__dlpack__``. + This distinction is useful to support both data exchange via DLPack on a buffer and (b) dtypes like variable-length strings which do not have a fixed number of bytes per element. @@ -140,16 +154,19 @@ def ptr(self) -> int: def __dlpack__(self): # type: ignore[no-untyped-def] """ Produce DLPack capsule (see array API standard). + Raises: + - TypeError : if the buffer contains unsupported dtypes. - NotImplementedError : if DLPack support is not implemented + Useful to have to connect to array libraries. Support optional because it's not completely trivial to implement for a Python-only library. 
""" raise NotImplementedError("__dlpack__") @abstractmethod - def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: """ Device type and device ID for where the data in the buffer resides. Uses device type codes matching DLPack. @@ -162,27 +179,33 @@ class Column(ABC): """ A column object, with only the methods and properties required by the interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three buffers - a data buffer, a mask buffer (depending on null representation), and an offsets buffer (if variable-size binary; e.g., variable-length strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. Instead, it seems to use "children" for both columns with a bit mask, and for nested dtypes. Unclear whether this is elegant or confusing. This design requires checking the null representation explicitly. + The Arrow design requires checking: 1. the ARROW_FLAG_NULLABLE (for sentinel values) 2. if a column has two children, combined with one of those children having a null dtype. + Making the mask concept explicit seems useful. One null dtype would not be enough to cover both bit and byte masks, so that would mean even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as multiple buffers per array (= column here). Semantically it may make sense to have both: chunks were meant for example for lazy evaluation of data which doesn't fit in memory, while multiple buffers per column could also come from doing a selection operation on a single contiguous buffer. + Given these concepts, one would expect chunks to be all of the same size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), while multiple buffers could have data-dependent lengths. Not an issue @@ -191,6 +214,7 @@ class Column(ABC): Are multiple chunks *and* multiple buffers per column necessary for the purposes of this interchange protocol, or must producers either reuse the chunk concept for this or copy the data? + Note: this Column object can only be produced by ``__dataframe__``, so doesn't need its own version or ``__column__`` protocol. """ @@ -199,8 +223,10 @@ class Column(ABC): def size(self) -> int: """ Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; equal to size of this current chunk otherwise. + Is a method rather than a property because it may cause a (potentially expensive) computation for some dataframe implementations. """ @@ -211,6 +237,7 @@ def size(self) -> int: def offset(self) -> int: """ Offset of first element. + May be > 0 if using chunks; for example for a column with N chunks of equal size M (only the last chunk may be shorter), ``offset = n * M``, ``n = 0 .. N-1``. @@ -222,10 +249,12 @@ def offset(self) -> int: def dtype(self) -> Dtype: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Bit-width : the number of bits as an integer Format string : data type description format string in Apache Arrow C Data Interface format. Endianness : current only native endianness (``=``) is supported + Notes: - Kind specifiers are aligned with DLPack where possible (hence the jump to 20, leave enough room for future extension) @@ -254,7 +283,9 @@ def describe_categorical(self) -> CategoricalDescription: If the dtype is categorical, there are two options: - There are only values in the data buffer. 
         - There is a separate non-categorical Column encoding categorical values.
+
         Raises TypeError if the dtype is not categorical
+
         Returns the dictionary with description on how to interpret the data
         buffer:
         - "is_ordered" : bool, whether the ordering of dictionary indices is
                          semantically meaningful.
         - "is_dictionary" : bool, whether a mapping of
                             categorical values to other objects exists
         - "categories" : Column representing the (implicit) mapping of indices
                          to category values (e.g. an array of cat1, cat2, ...).
                          None if not a dictionary-style categorical.
+
         TBD: are there any other in-memory representations that are needed?
         """
         pass

     @property
     @abstractmethod
-    def describe_null(self) -> tuple[ColumnNullType, Any]:
+    def describe_null(self) -> Tuple[ColumnNullType, Any]:
         """
         Return the missing value (or "null") representation the column dtype
         uses, as a tuple ``(kind, value)``.
+
         Value : if kind is "sentinel value", the actual value. If kind is a bit
         mask or a byte mask, the value (0 or 1) indicating a missing value. None
         otherwise.
@@ -281,16 +314,17 @@ def describe_null(self) -> tuple[ColumnNullType, Any]:

     @property
     @abstractmethod
-    def null_count(self) -> int | None:
+    def null_count(self) -> Optional[int]:
         """
         Number of null elements, if known.
+
         Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
         """
         pass

     @property
     @abstractmethod
-    def metadata(self) -> dict[str, Any]:
+    def metadata(self) -> Dict[str, Any]:
         """
         The metadata for the column. See `DataFrame.metadata` for more details.
         """
@@ -304,9 +338,10 @@ def num_chunks(self) -> int:
         pass

     @abstractmethod
-    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
+    def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]:
         """
         Return an iterator yielding the chunks.
+
         See `DataFrame.get_chunks` for details on ``n_chunks``.
         """
         pass
@@ -315,7 +350,9 @@ def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
     def get_buffers(self) -> ColumnBuffers:
         """
         Return a dictionary containing the underlying buffers.
+
         The returned dictionary has the following contents:
+
         - "data": a two-element tuple whose first element is a buffer
           containing the data and whose second element is the data buffer's
           associated dtype.
@@ -346,9 +383,11 @@ class DataFrame(ABC):
     """
     A data frame class, with only the methods required by the interchange
     protocol defined.
+
     A "data frame" represents an ordered collection of named columns.
     A column's "name" must be a unique string.
     Columns may be accessed by name or by position.
+
     This could be a public data frame class, or an object with the methods and
     attributes defined on this DataFrame class could be returned from the
     ``__dataframe__`` method of a public data frame class in a library adhering
@@ -360,9 +399,10 @@ class DataFrame(ABC):
     @abstractmethod
     def __dataframe__(
         self, nan_as_null: bool = False, allow_copy: bool = True
-    ) -> DataFrame:
+    ) -> "DataFrame":
         """
         Construct a new exchange object, potentially changing the parameters.
+
         ``nan_as_null`` is a keyword intended for the consumer to tell the
         producer to overwrite null values in the data with ``NaN``.
         It is intended for cases where the consumer does not support the bit
@@ -376,7 +416,7 @@ def __dataframe__(

     @property
     @abstractmethod
-    def metadata(self) -> dict[str, Any]:
+    def metadata(self) -> Dict[str, Any]:
         """
         The metadata for the data frame, as a dictionary with string keys.
         The contents of `metadata` may be anything, they are meant for a library
@@ -396,7 +436,7 @@ def num_columns(self) -> int:
         pass

     @abstractmethod
-    def num_rows(self) -> int | None:
+    def num_rows(self) -> Optional[int]:
         # TODO: not happy with Optional, but need to flag it may be expensive
         # why include it if it may be None - what do we expect consumers
         # to do here?
@@ -441,27 +481,29 @@ def get_columns(self) -> Iterable[Column]:
         pass

     @abstractmethod
-    def select_columns(self, indices: Sequence[int]) -> DataFrame:
+    def select_columns(self, indices: Sequence[int]) -> "DataFrame":
         """
         Create a new DataFrame by selecting a subset of columns by index.
         """
         pass

     @abstractmethod
-    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
+    def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame":
         """
         Create a new DataFrame by selecting a subset of columns by name.
         """
         pass

     @abstractmethod
-    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
+    def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]:
         """
         Return an iterator yielding the chunks.
+
         By default (None), yields the chunks that the data is stored as by the
         producer. If given, ``n_chunks`` must be a multiple of
         ``self.num_chunks()``, meaning the producer must subdivide each chunk
         before yielding it.
+
         Note that the producer must ensure that all columns are chunked the
         same way.
         """
diff --git a/py-polars/polars/interchange/utils.py b/py-polars/polars/interchange/utils.py
index 18437180280a5..d0e528f7a79d7 100644
--- a/py-polars/polars/interchange/utils.py
+++ b/py-polars/polars/interchange/utils.py
@@ -3,15 +3,17 @@
 from typing import TYPE_CHECKING

 import polars as pl
-from polars.internals.interchange.dataframe_protocol import DtypeKind
+from polars.interchange.dataframe_protocol import DtypeKind

 if TYPE_CHECKING:
-    from polars.internals.interchange.dataframe_protocol import Dtype
+    from polars.interchange.dataframe_protocol import Dtype
+    from polars.type_aliases import PolarsDataType

 NATIVE_ENDIANNESS = "="


-def polars_dtype_to_dtype(dtype: pl.DataType) -> Dtype:
+def polars_dtype_to_dtype(dtype: PolarsDataType) -> Dtype:
+    """Convert Polars data type to interchange protocol data type."""
     if dtype == pl.Int8:
         return DtypeKind.INT, 8, "c", NATIVE_ENDIANNESS
     elif dtype == pl.Int16:
@@ -41,12 +43,12 @@ def polars_dtype_to_dtype(dtype: PolarsDataType) -> Dtype:
     elif dtype == pl.Time:
         return DtypeKind.DATETIME, 64, "ttu", NATIVE_ENDIANNESS
     elif dtype == pl.Datetime:
-        tu = dtype.tu[0] if dtype.tu is not None else "u"
-        tz = dtype.tz if dtype.tz is not None else ""
+        tu = dtype.time_unit[0] if dtype.time_unit is not None else "u"  # type: ignore[union-attr]
+        tz = dtype.time_zone if dtype.time_zone is not None else ""  # type: ignore[union-attr]
         arrow_c_type = f"ts{tu}:{tz}"
         return DtypeKind.DATETIME, 64, arrow_c_type, NATIVE_ENDIANNESS
     elif dtype == pl.Duration:
-        tu = dtype.tu[0] if dtype.tu is not None else "u"
+        tu = dtype.time_unit[0] if dtype.time_unit is not None else "u"  # type: ignore[union-attr]
         arrow_c_type = f"tD{tu}"
         return DtypeKind.DATETIME, 64, arrow_c_type, NATIVE_ENDIANNESS
     elif dtype == pl.Categorical:
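
[Reviewer note -- not part of the patch] With the renamed `time_unit`/`time_zone` attributes, `polars_dtype_to_dtype` produces Arrow C data interface format strings. Two examples implied by the hunks above (the `Int8` case is shown verbatim in the diff; the datetime output follows from the `f"ts{tu}:{tz}"` template):

    import polars as pl
    from polars.interchange.dataframe_protocol import DtypeKind
    from polars.interchange.utils import polars_dtype_to_dtype

    assert polars_dtype_to_dtype(pl.Int8) == (DtypeKind.INT, 8, "c", "=")
    assert polars_dtype_to_dtype(pl.Datetime("ms", "UTC")) == (
        DtypeKind.DATETIME,
        64,
        "tsm:UTC",  # "ts" + first letter of time unit + ":" + time zone
        "=",
    )
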
diff --git a/py-polars/tests/unit/interchange/test_buffer.py b/py-polars/tests/unit/interchange/test_buffer.py
index f65080006b9c3..1fc1919a5b406 100644
--- a/py-polars/tests/unit/interchange/test_buffer.py
+++ b/py-polars/tests/unit/interchange/test_buffer.py
@@ -3,8 +3,8 @@
 import pytest

 import polars as pl
-from polars.internals.interchange.buffer import PolarsBuffer
-from polars.internals.interchange.dataframe_protocol import DlpackDeviceType
+from polars.interchange.buffer import PolarsBuffer
+from polars.interchange.dataframe_protocol import DlpackDeviceType


 @pytest.mark.parametrize(
@@ -15,12 +15,12 @@
         (pl.concat([pl.Series([1, 2]), pl.Series([1, 2])], rechunk=False), True),
     ],
 )
-def test_init(data, allow_copy):
+def test_init(data: pl.Series, allow_copy: bool) -> None:
     buffer = PolarsBuffer(data, allow_copy=allow_copy)
     assert buffer._data.n_chunks() == 1


-def test_init_invalid_input():
+def test_init_invalid_input() -> None:
     s = pl.Series([1, 2])
     data = pl.concat([s, s], rechunk=False)
@@ -37,12 +37,12 @@
         (pl.Series([True, False], dtype=pl.Boolean), 2),
     ],
 )
-def test_bufsize(data, expected):
+def test_bufsize(data: pl.Series, expected: int) -> None:
     buffer = PolarsBuffer(data)
     assert buffer.bufsize == expected


-def test_ptr():
+def test_ptr() -> None:
     data = pl.Series([1, 2])
     buffer = PolarsBuffer(data)
     result = buffer.ptr
@@ -51,7 +51,7 @@


 @pytest.mark.skip("Not implemented yet")
-def test_ptr_boolean():
+def test_ptr_boolean() -> None:
     data = pl.Series([True, False])
     buffer = PolarsBuffer(data)
     result = buffer.ptr
@@ -59,20 +59,20 @@
     assert isinstance(result, int)


-def test__dlpack__():
+def test__dlpack__() -> None:
     data = pl.Series([1, 2])
     buffer = PolarsBuffer(data)
     with pytest.raises(NotImplementedError):
         buffer.__dlpack__()


-def test__dlpack_device__():
+def test__dlpack_device__() -> None:
     data = pl.Series([1, 2])
     buffer = PolarsBuffer(data)
     assert buffer.__dlpack_device__() == (DlpackDeviceType.CPU, None)


-def test__repr__():
+def test__repr__() -> None:
     data = pl.Series([True, False, True])
     buffer = PolarsBuffer(data)
     print(buffer.__repr__())
diff --git a/py-polars/tests/unit/interchange/test_column.py b/py-polars/tests/unit/interchange/test_column.py
index 6a1d965f5e8d5..5a41a3777abae 100644
--- a/py-polars/tests/unit/interchange/test_column.py
+++ b/py-polars/tests/unit/interchange/test_column.py
@@ -3,10 +3,10 @@
 import pytest

 import polars as pl
-from polars.internals.interchange.buffer import PolarsBuffer
-from polars.internals.interchange.column import PolarsColumn
-from polars.internals.interchange.dataframe_protocol import DtypeKind
-from polars.internals.interchange.utils import NATIVE_ENDIANNESS
+from polars.interchange.buffer import PolarsBuffer
+from polars.interchange.column import PolarsColumn
+from polars.interchange.dataframe_protocol import DtypeKind
+from polars.interchange.utils import NATIVE_ENDIANNESS


 def test_size() -> None:
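
[Reviewer note -- not part of the patch] What a consumer sees through the renamed `PolarsColumn` wrapper, given the changes above (`null_count` is now a property; the `"l"` format string for Int64 is an assumption based on the Arrow C data interface, since the diff only shows part of the dtype mapping):

    import polars as pl
    from polars.interchange.column import PolarsColumn

    col = PolarsColumn(pl.Series([1, None, 3]))
    col.null_count     # 1 -- property access after this change
    col.describe_null  # (ColumnNullType.USE_BITMASK, 0)
    col.dtype          # (DtypeKind.INT, 64, "l", "=")
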
diff --git a/py-polars/tests/unit/interchange/test_dataframe.py b/py-polars/tests/unit/interchange/test_dataframe.py
index 9d48db4f9f85e..355d87f8dc113 100644
--- a/py-polars/tests/unit/interchange/test_dataframe.py
+++ b/py-polars/tests/unit/interchange/test_dataframe.py
@@ -1 +1,155 @@
 from __future__ import annotations
+
+import pytest
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+
+def test_num_chunks() -> None:
+    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    dfi = df.__dataframe__()
+    assert dfi.num_chunks() == 1
+
+    df2 = pl.concat([df, df], rechunk=False)
+    dfi2 = df2.__dataframe__()
+    assert dfi2.num_chunks() == 2
+
+
+@pytest.mark.parametrize("n_chunks", [None, 2])
+def test_get_chunks(n_chunks: int | None) -> None:
+    df1 = pl.DataFrame({"a": [1, 2], "b": [4, 5]})
+    df2 = pl.DataFrame({"a": [3], "b": [6]})
+    df = pl.concat([df1, df2], rechunk=False)
+
+    dfi = df.__dataframe__()
+    out = dfi.get_chunks(n_chunks)
+
+    expected = dfi._get_chunks_from_col_chunks()
+    for o, e in zip(out, expected):
+        assert_frame_equal(pl.from_dataframe(o), e)
+
+
+def test_get_chunks_invalid_input() -> None:
+    df1 = pl.DataFrame({"a": [1, 2], "b": [4, 5]})
+    df2 = pl.DataFrame({"a": [3], "b": [6]})
+    df = pl.concat([df1, df2], rechunk=False)
+
+    dfi = df.__dataframe__()
+
+    with pytest.raises(ValueError):
+        next(dfi.get_chunks(0))
+
+    with pytest.raises(ValueError):
+        next(dfi.get_chunks(3))
+
+
+def test_get_chunks_subdivided_chunks() -> None:
+    df1 = pl.DataFrame({"a": [1, 2, 3], "b": [6, 7, 8]})
+    df2 = pl.DataFrame({"a": [4, 5], "b": [9, 0]})
+    df = pl.concat([df1, df2], rechunk=False)
+
+    dfi = df.__dataframe__()
+    out = dfi.get_chunks(4)
+
+    chunk1 = next(out)
+    expected1 = pl.DataFrame({"a": [1, 2], "b": [6, 7]})
+    assert_frame_equal(pl.from_dataframe(chunk1), expected1)
+
+    chunk2 = next(out)
+    expected2 = pl.DataFrame({"a": [3], "b": [8]})
+    assert_frame_equal(pl.from_dataframe(chunk2), expected2)
+
+    chunk3 = next(out)
+    expected3 = pl.DataFrame({"a": [4], "b": [9]})
+    assert_frame_equal(pl.from_dataframe(chunk3), expected3)
+
+    chunk4 = next(out)
+    expected4 = pl.DataFrame({"a": [5], "b": [0]})
+    assert_frame_equal(pl.from_dataframe(chunk4), expected4)
+
+
+def test_get_chunks_zero_copy_fail() -> None:
+    col1 = pl.Series([1, 2])
+    col2 = pl.concat([pl.Series([3]), pl.Series([4])], rechunk=False)
+    df = pl.DataFrame({"a": col1, "b": col2})
+
+    dfi = df.__dataframe__(allow_copy=False)
+
+    with pytest.raises(RuntimeError):
+        next(dfi.get_chunks())
+
+
+@pytest.mark.parametrize("allow_copy", [True, False])
+def test_get_chunks_from_col_chunks_single_chunk(allow_copy: bool) -> None:
+    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+
+    dfi = df.__dataframe__(allow_copy=allow_copy)
+    out = dfi._get_chunks_from_col_chunks()
+
+    chunk1 = next(out)
+    assert_frame_equal(chunk1, df)
+
+    with pytest.raises(StopIteration):
+        next(out)
+
+
+@pytest.mark.parametrize("allow_copy", [True, False])
+def test_get_chunks_from_col_chunks_even_chunks(allow_copy: bool) -> None:
+    df1 = pl.DataFrame({"a": [1, 2], "b": [4, 5]})
+    df2 = pl.DataFrame({"a": [3], "b": [6]})
+    df = pl.concat([df1, df2], rechunk=False)
+
+    dfi = df.__dataframe__(allow_copy=allow_copy)
+    out = dfi._get_chunks_from_col_chunks()
+
+    chunk1 = next(out)
+    assert_frame_equal(chunk1, df1)
+
+    chunk2 = next(out)
+    assert_frame_equal(chunk2, df2)
+
+    with pytest.raises(StopIteration):
+        next(out)
+
+
+def test_get_chunks_from_col_chunks_uneven_chunks_allow_copy() -> None:
+    col1 = pl.concat([pl.Series([1, 2]), pl.Series([3, 4, 5])], rechunk=False)
+    col2 = pl.concat(
+        [pl.Series([6, 7]), pl.Series([8]), pl.Series([9, 0])], rechunk=False
+    )
+    df = pl.DataFrame({"a": col1, "b": col2})
+
+    dfi = df.__dataframe__(allow_copy=True)
+    out = dfi._get_chunks_from_col_chunks()
+
+    expected1 = pl.DataFrame({"a": [1, 2], "b": [6, 7]})
+    chunk1 = next(out)
+    assert_frame_equal(chunk1, expected1)
+
+    expected2 = pl.DataFrame({"a": [3, 4, 5], "b": [8, 9, 0]})
+    chunk2 = next(out)
+    assert_frame_equal(chunk2, expected2)
+
+    with pytest.raises(StopIteration):
+        next(out)
+
+
+def test_get_chunks_from_col_chunks_uneven_chunks_zero_copy_fails() -> None:
+    col1 = pl.concat([pl.Series([1, 2]), pl.Series([3, 4, 5])], rechunk=False)
+    col2 = pl.concat(
+        [pl.Series([6, 7]), pl.Series([8]), pl.Series([9, 0])], rechunk=False
+    )
+    df = pl.DataFrame({"a": col1, "b": col2})
+
+    dfi = df.__dataframe__(allow_copy=False)
+    out = dfi._get_chunks_from_col_chunks()
+
+    # First chunk can be yielded zero copy
+    expected1 = pl.DataFrame({"a": [1, 2], "b": [6, 7]})
+    chunk1 = next(out)
+    assert_frame_equal(chunk1, expected1)
+
+    # Second chunk requires a rechunk of the second column
+    with pytest.raises(RuntimeError, match="Columns not chunked the same"):
+        next(out)
diff --git a/py-polars/tests/unit/interchange/test_utils.py b/py-polars/tests/unit/interchange/test_utils.py
index 692f2546c81d8..a9c9508aff621 100644
--- a/py-polars/tests/unit/interchange/test_utils.py
+++ b/py-polars/tests/unit/interchange/test_utils.py
@@ -5,12 +5,12 @@
 import pytest

 import polars as pl
-from polars.internals.interchange.dataframe_protocol import DtypeKind
-from polars.internals.interchange.utils import NATIVE_ENDIANNESS as NE
-from polars.internals.interchange.utils import polars_dtype_to_dtype
+from polars.interchange.dataframe_protocol import DtypeKind
+from polars.interchange.utils import NATIVE_ENDIANNESS as NE
+from polars.interchange.utils import polars_dtype_to_dtype

 if TYPE_CHECKING:
-    from polars.internals.interchange.dataframe_protocol import Dtype
+    from polars.interchange.dataframe_protocol import Dtype


 @pytest.mark.parametrize(
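
[Reviewer note -- not part of the patch] For trying the refactored protocol end to end: any object exposing `__dataframe__` can be consumed by `pl.from_dataframe`, which this PR now imports from `polars.interchange.from_dataframe`. A sketch, assuming a pandas version with interchange support is installed:

    import pandas as pd
    import polars as pl

    pd_df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    pl_df = pl.from_dataframe(pd_df)  # consume via the interchange protocol

    dfi = pl_df.__dataframe__()  # produce an interchange object again
    assert dfi.num_columns() == 2
    assert dfi.column_names() == ["a", "b"]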