-
Notifications
You must be signed in to change notification settings - Fork 89
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add from_arrow (which uses the PyCapsule Interface) #1181
Changes from 5 commits
f5da0cb
88cebde
8fc2099
0c48b89
1acf8ef
2ac16c9
fd8eea1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
from typing import Any | ||
from typing import Iterable | ||
from typing import Literal | ||
from typing import Protocol | ||
from typing import TypeVar | ||
from typing import Union | ||
|
||
|
@@ -21,6 +22,7 @@ | |
# The rest of the annotations seem to work fine with this anyway | ||
FrameT = TypeVar("FrameT", bound=Union[DataFrame, LazyFrame]) # type: ignore[type-arg] | ||
|
||
|
||
if TYPE_CHECKING: | ||
from types import ModuleType | ||
|
||
|
@@ -29,6 +31,9 @@ | |
from narwhals.series import Series | ||
from narwhals.typing import DTypes | ||
|
||
class SupportsPyCapsule(Protocol):
    """Structural type for objects that export an Arrow C stream.

    Any object exposing ``__arrow_c_stream__`` matches this protocol. The
    method signature follows the protocol typehints published in the Arrow
    PyCapsule Interface specification (where the equivalent protocol is named
    ``ArrowStreamExportable``).
    """

    def __arrow_c_stream__(
        self, requested_schema: Union[object, None] = None
    ) -> object:
        """Export the object as an Arrow C stream.

        Arguments:
            requested_schema: Optional schema request, as described by the
                Arrow PyCapsule Interface specification. Defaults to ``None``.

        Returns:
            The exported stream object (a PyCapsule per the specification).
        """
        ...
|
||
|
||
def concat( | ||
items: Iterable[FrameT], | ||
|
@@ -406,6 +411,100 @@ def _from_dict_impl( | |
return from_native(native_frame, eager_only=True) | ||
|
||
|
||
# NOTE(review): the name of this function was debated on the PR. Concerns
# raised there: the risk of confusion with a similarly named Polars function
# (this function only ever builds a DataFrame), and that exposing it as a class
# constructor instead would add something outside the Polars API. A top-level
# function was judged OK for now, to be revisited later if needed.
def from_pycapsule(
    native_frame: SupportsPyCapsule, *, native_namespace: ModuleType
) -> DataFrame[Any]:
    """
    Construct a DataFrame from an object which supports the PyCapsule Interface.

    Arguments:
        native_frame: Object which implements `__arrow_c_stream__`.
        native_namespace: The native library to use for DataFrame creation.

    Returns:
        The frame built with `native_namespace`, wrapped via
        `from_native(..., eager_only=True)`.

    Raises:
        TypeError: If `native_frame` has no `__arrow_c_stream__` attribute.
        ModuleNotFoundError: If the conversion has to go through PyArrow and
            PyArrow is missing or older than 14.0.0.
        AttributeError: If `native_namespace` is not a recognised backend and
            building `native_namespace.DataFrame(native_frame)` raises
            `AttributeError`.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]}

        Let's define a dataframe-agnostic function which creates a PyArrow
        Table.

        >>> @nw.narwhalify
        ... def func(df):
        ...     return nw.from_pycapsule(df, native_namespace=pa)

        Let's see what happens when passing pandas / Polars input:

        >>> func(pd.DataFrame(data))  # doctest: +SKIP
        pyarrow.Table
        a: int64
        b: int64
        ----
        a: [[1,2,3]]
        b: [[4,5,6]]
        >>> func(pl.DataFrame(data))  # doctest: +SKIP
        pyarrow.Table
        a: int64
        b: int64
        ----
        a: [[1,2,3]]
        b: [[4,5,6]]
    """
    if not hasattr(native_frame, "__arrow_c_stream__"):
        msg = f"Given object of type {type(native_frame)} does not support PyCapsule interface"
        raise TypeError(msg)
    implementation = Implementation.from_native_namespace(native_namespace)

    # Kept in a separate name so the parameter is not rebound to an object of
    # a different type.
    native_result: Any
    if implementation is Implementation.POLARS and parse_version(
        native_namespace.__version__
    ) >= (1, 3):
        # Polars at or above this version is handed the object directly.
        native_result = native_namespace.DataFrame(native_frame)
    elif implementation in {
        Implementation.PANDAS,
        Implementation.MODIN,
        Implementation.CUDF,
        Implementation.POLARS,
    }:
        # These don't (yet?) support the PyCapsule Interface for import
        # so we go via PyArrow
        pyarrow_msg = f"PyArrow>=14.0.0 is required for `from_pycapsule` for object of type {native_namespace}"
        try:
            import pyarrow as pa  # ignore-banned-import
        except ModuleNotFoundError as exc:  # pragma: no cover
            raise ModuleNotFoundError(pyarrow_msg) from exc
        if parse_version(pa.__version__) < (14, 0):  # pragma: no cover
            raise ModuleNotFoundError(pyarrow_msg) from None

        tbl = pa.table(native_frame)
        if implementation is Implementation.PANDAS:
            native_result = tbl.to_pandas()
        elif implementation is Implementation.MODIN:  # pragma: no cover
            from modin.pandas.utils import from_arrow

            native_result = from_arrow(tbl)
        elif implementation is Implementation.CUDF:  # pragma: no cover
            native_result = native_namespace.DataFrame.from_arrow(tbl)
        elif implementation is Implementation.POLARS:  # pragma: no cover
            # Only reached for Polars versions below (1, 3).
            native_result = native_namespace.from_arrow(tbl)
        else:  # pragma: no cover
            msg = "congratulations, you entered unreachable code - please report a bug"
            raise AssertionError(msg)
    elif implementation is Implementation.PYARROW:
        native_result = native_namespace.table(native_frame)
    else:  # pragma: no cover
        try:
            # implementation is UNKNOWN, Narwhals extension using this feature should
            # implement PyCapsule support
            native_result = native_namespace.DataFrame(native_frame)
        except AttributeError as e:
            msg = "Unknown namespace is expected to implement `DataFrame` class which accepts object which supports PyCapsule Interface."
            raise AttributeError(msg) from e
    return from_native(native_result, eager_only=True)
|
||
|
||
def _get_sys_info() -> dict[str, str]: | ||
"""System information | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import sys | ||
|
||
import pandas as pd | ||
import polars as pl | ||
import pyarrow as pa | ||
import pytest | ||
|
||
import narwhals.stable.v1 as nw | ||
from narwhals.utils import parse_version | ||
from tests.utils import compare_dicts | ||
|
||
|
||
@pytest.mark.xfail(parse_version(pa.__version__) < (14,), reason="too old")
def test_from_pycapsule_to_arrow() -> None:
    """A Narwhals-wrapped Polars frame is converted into a PyArrow table."""
    polars_native = pl.DataFrame({"ab": [1, 2, 3], "ba": [4, 5, 6]})
    wrapped = nw.from_native(polars_native, eager_only=True)

    converted = nw.from_pycapsule(wrapped, native_namespace=pa)

    # Check the backend first, then the contents.
    assert isinstance(converted.to_native(), pa.Table)
    compare_dicts(converted, {"ab": [1, 2, 3], "ba": [4, 5, 6]})
|
||
|
||
@pytest.mark.xfail(parse_version(pa.__version__) < (14,), reason="too old")
def test_from_pycapsule_to_polars(monkeypatch: pytest.MonkeyPatch) -> None:
    """A Narwhals-wrapped PyArrow table is converted into a Polars frame.

    pandas is removed from ``sys.modules`` before the conversion, and the test
    asserts it is still absent afterwards.
    """
    arrow_table = pa.table({"ab": [1, 2, 3], "ba": [4, 5, 6]})
    monkeypatch.delitem(sys.modules, "pandas")
    wrapped = nw.from_native(arrow_table, eager_only=True)

    converted = nw.from_pycapsule(wrapped, native_namespace=pl)

    assert isinstance(converted.to_native(), pl.DataFrame)
    compare_dicts(converted, {"ab": [1, 2, 3], "ba": [4, 5, 6]})
    assert "pandas" not in sys.modules
|
||
|
||
@pytest.mark.xfail(parse_version(pa.__version__) < (14,), reason="too old")
def test_from_pycapsule_to_pandas() -> None:
    """A Narwhals-wrapped PyArrow table is converted into a pandas frame."""
    arrow_table = pa.table({"ab": [1, 2, 3], "ba": [4, 5, 6]})
    wrapped = nw.from_native(arrow_table, eager_only=True)

    converted = nw.from_pycapsule(wrapped, native_namespace=pd)

    assert isinstance(converted.to_native(), pd.DataFrame)
    compare_dicts(converted, {"ab": [1, 2, 3], "ba": [4, 5, 6]})
|
||
|
||
def test_from_pycapsule_invalid() -> None:
    """An object without ``__arrow_c_stream__`` is rejected with ``TypeError``."""
    not_exportable = {"a": [1]}
    with pytest.raises(TypeError, match="PyCapsule"):
        nw.from_pycapsule(not_exportable, native_namespace=pa)  # type: ignore[arg-type]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't the correct type hint: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#protocol-typehints
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I also personally stick with the upstream naming suggestion and call this
ArrowStreamExportable
https://github.com/kylebarron/arro3/blob/45be4a12dd62cee025c5d0ecf8c8c081e13643ea/arro3-core/python/arro3/core/types.py#L52-L75