Skip to content

Commit

Permalink
Make Column.to_pandas return Index instead of Series (rapidsai#15833)
Browse files Browse the repository at this point in the history
`Column.to_pandas` backs `Index.to_pandas`/`Series.to_pandas`/`DataFrame.to_pandas` and previously returned a `pandas.Series`; however, the `index` of that `pandas.Series` was not strictly necessary for `Index.to_pandas` and `DataFrame.to_pandas`.

Additionally, `pandas.Index` is 1D-like, as is `Column`, and provides a better mental model for the `to_pandas` conversion.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: rapidsai#15833
  • Loading branch information
mroeschke authored Jun 4, 2024
1 parent faf3929 commit fe74129
Show file tree
Hide file tree
Showing 12 changed files with 46 additions and 143 deletions.
7 changes: 3 additions & 4 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,12 +789,11 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
) -> pd.Index:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
elif arrow_type:
raise NotImplementedError(f"{arrow_type=} is not implemented.")

Expand Down Expand Up @@ -828,7 +827,7 @@ def to_pandas(
data = pd.Categorical.from_codes(
codes, categories=cats.to_pandas(), ordered=col.ordered
)
return pd.Series(data, index=index)
return pd.Index(data)

def to_arrow(self) -> pa.Array:
"""Convert to PyArrow Array."""
Expand Down
13 changes: 3 additions & 10 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,10 +190,9 @@ def __repr__(self):
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
) -> pd.Index:
"""Convert object to pandas type.
The default implementation falls back to PyArrow for the conversion.
Expand All @@ -208,15 +207,9 @@ def to_pandas(
raise NotImplementedError(f"{nullable=} is not implemented.")
pa_array = self.to_arrow()
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(pa_array), index=index
)
return pd.Index(pd.arrays.ArrowExtensionArray(pa_array))
else:
pd_series = pa_array.to_pandas()

if index is not None:
pd_series.index = index
return pd_series
return pd.Index(pa_array.to_pandas())

@property
def values_host(self) -> "np.ndarray":
Expand Down
20 changes: 4 additions & 16 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,27 +840,15 @@ def __init__(
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
elif arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
) -> pd.Index:
if arrow_type or nullable:
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
else:
series = self._local_time.to_pandas().dt.tz_localize(
return self._local_time.to_pandas().tz_localize(
self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
)
if index is not None:
series.index = index
return series

def to_arrow(self):
return pa.compute.assume_timezone(
Expand Down
15 changes: 3 additions & 12 deletions python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
from typing import Optional

import pandas as pd
import pyarrow as pa

Expand Down Expand Up @@ -109,28 +107,21 @@ def as_interval_column(self, dtype):
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
) -> pd.Index:
# Note: This does not handle null values in the interval column.
# However, this exact sequence (calling __from_arrow__ on the output of
# self.to_arrow) is currently the best known way to convert interval
# types into pandas (trying to convert the underlying numerical columns
# directly is problematic), so we're stuck with this for now.
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
elif arrow_type:
raise NotImplementedError(f"{arrow_type=} is not implemented.")

pd_type = self.dtype.to_pandas()
return pd.Series(
pd_type.__from_arrow__(self.to_arrow()), index=index, dtype=pd_type
)
return pd.Index(pd_type.__from_arrow__(self.to_arrow()), dtype=pd_type)

def element_indexing(self, index: int):
result = super().element_indexing(index)
Expand Down
20 changes: 4 additions & 16 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,25 +292,13 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self:
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
# Can't rely on Column.to_pandas implementation for lists.
# Need to perform `to_pylist` to preserve list types.
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
pa_array = self.to_arrow()
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(pa_array), index=index
)
) -> pd.Index:
if arrow_type or nullable:
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
else:
return pd.Series(pa_array.tolist(), dtype="object", index=index)
return pd.Index(self.to_arrow().tolist(), dtype="object")


class ListMethods(ColumnMethods):
Expand Down
17 changes: 6 additions & 11 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,18 +674,13 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
) -> pd.Index:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
elif arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
elif (
nullable
and (
Expand All @@ -697,11 +692,11 @@ def to_pandas(
):
arrow_array = self.to_arrow()
pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) # type: ignore[attr-defined]
return pd.Series(pandas_array, copy=False, index=index)
return pd.Index(pandas_array, copy=False)
elif self.dtype.kind in set("iuf") and not self.has_nulls():
return pd.Series(self.values_host, copy=False, index=index)
return pd.Index(self.values_host, copy=False)
else:
return super().to_pandas(index=index, nullable=nullable)
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)

def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
col_dtype = self.dtype
Expand Down
17 changes: 4 additions & 13 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5783,23 +5783,14 @@ def values(self) -> cupy.ndarray:
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
elif nullable:
) -> pd.Index:
if nullable and not arrow_type:
pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow())
return pd.Series(pandas_array, copy=False, index=index)
return pd.Index(pandas_array, copy=False)
else:
return super().to_pandas(index=index, nullable=nullable)
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)

def can_cast_safely(self, to_dtype: Dtype) -> bool:
to_dtype = cudf.api.types.dtype(to_dtype)
Expand Down
19 changes: 4 additions & 15 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from __future__ import annotations

from functools import cached_property
from typing import Optional

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -60,25 +59,15 @@ def to_arrow(self):
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
) -> pd.Index:
# We cannot go via Arrow's `to_pandas` because of the following issue:
# https://issues.apache.org/jira/browse/ARROW-12680
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
pa_array = self.to_arrow()
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(pa_array), index=index
)
if arrow_type or nullable:
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
else:
return pd.Series(pa_array.tolist(), dtype="object", index=index)
return pd.Index(self.to_arrow().tolist(), dtype="object")

@cached_property
def memory_usage(self):
Expand Down
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5321,9 +5321,7 @@ def to_pandas(
"""
out_index = self.index.to_pandas()
out_data = {
i: col.to_pandas(
index=out_index, nullable=nullable, arrow_type=arrow_type
)
i: col.to_pandas(nullable=nullable, arrow_type=arrow_type)
for i, col in enumerate(self._data.columns)
}

Expand Down
45 changes: 8 additions & 37 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1568,10 +1568,11 @@ def any(self):
def to_pandas(
self, *, nullable: bool = False, arrow_type: bool = False
) -> pd.Index:
return pd.Index(
self._values.to_pandas(nullable=nullable, arrow_type=arrow_type),
name=self.name,
result = self._column.to_pandas(
nullable=nullable, arrow_type=arrow_type
)
result.name = self.name
return result

def append(self, other):
if is_list_like(other):
Expand Down Expand Up @@ -2191,23 +2192,10 @@ def isocalendar(self):
def to_pandas(
self, *, nullable: bool = False, arrow_type: bool = False
) -> pd.DatetimeIndex:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

result = self._values.to_pandas(arrow_type=arrow_type)
if arrow_type:
return pd.Index(result, name=self.name)
else:
freq = (
self._freq._maybe_as_fast_pandas_offset()
if self._freq is not None
else None
)
return pd.DatetimeIndex(result, name=self.name, freq=freq)
result = super().to_pandas(nullable=nullable, arrow_type=arrow_type)
if not arrow_type and self._freq is not None:
result.freq = self._freq._maybe_as_fast_pandas_offset()
return result

@_cudf_nvtx_annotate
def _get_dt_field(self, field):
Expand Down Expand Up @@ -2527,23 +2515,6 @@ def __getitem__(self, index):
return pd.Timedelta(value)
return value

@_cudf_nvtx_annotate
def to_pandas(
self, *, nullable: bool = False, arrow_type: bool = False
) -> pd.TimedeltaIndex:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

result = self._values.to_pandas(arrow_type=arrow_type)
if arrow_type:
return pd.Index(result, name=self.name)
else:
return pd.TimedeltaIndex(result, name=self.name)

@property # type: ignore
@_cudf_nvtx_annotate
def days(self):
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2022,11 +2022,11 @@ def to_pandas(
index = self.index.to_pandas()
else:
index = None # type: ignore[assignment]
s = self._column.to_pandas(
index=index, nullable=nullable, arrow_type=arrow_type
return pd.Series(
self._column.to_pandas(nullable=nullable, arrow_type=arrow_type),
index=index,
name=self.name,
)
s.name = self.name
return s

@property # type: ignore
@_cudf_nvtx_annotate
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_cuda_array_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,12 @@ def test_column_from_ephemeral_cupy_try_lose_reference():
a = cudf.Series(cupy.asarray([1, 2, 3]))._column
a = cudf.core.column.as_column(a)
b = cupy.asarray([1, 1, 1]) # noqa: F841
assert_eq(pd.Series([1, 2, 3]), a.to_pandas())
assert_eq(pd.Index([1, 2, 3]), a.to_pandas())

a = cudf.Series(cupy.asarray([1, 2, 3]))._column
a.name = "b"
b = cupy.asarray([1, 1, 1]) # noqa: F841
assert_eq(pd.Series([1, 2, 3]), a.to_pandas())
assert_eq(pd.Index([1, 2, 3]), a.to_pandas())


@pytest.mark.xfail(
Expand Down

0 comments on commit fe74129

Please sign in to comment.