Skip to content

Commit

Permalink
Standardize methods used from cudf.core._internals (rapidsai#17765)
Browse files Browse the repository at this point in the history
* Standardizes imports from `cudf.core._internals`
* Makes functions in `cudf.core._internals.unary` into methods on `cudf.core.column.ColumnBase`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: rapidsai#17765
  • Loading branch information
mroeschke authored Jan 23, 2025
1 parent ffc5193 commit 1e8fa70
Show file tree
Hide file tree
Showing 16 changed files with 145 additions and 161 deletions.
15 changes: 6 additions & 9 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,7 @@
import cudf
from cudf.api.extensions import no_default
from cudf.api.types import is_integer, is_list_like, is_scalar
from cudf.core._internals import copying
from cudf.core._internals.stream_compaction import (
apply_boolean_mask,
drop_duplicates,
drop_nulls,
)
from cudf.core._internals import copying, stream_compaction
from cudf.core.abc import Serializable
from cudf.core.column import ColumnBase, column
from cudf.core.copy_types import GatherMap
Expand Down Expand Up @@ -1945,7 +1940,7 @@ def drop_duplicates(
# This utilizes the fact that all `Index` is also a `Frame`.
# Except RangeIndex.
return self._from_columns_like_self(
drop_duplicates(
stream_compaction.drop_duplicates(
list(self._columns),
keep=keep,
nulls_are_equal=nulls_are_equal,
Expand Down Expand Up @@ -2032,7 +2027,7 @@ def dropna(self, how="any"):
data_columns = [col.nans_to_nulls() for col in self._columns]

return self._from_columns_like_self(
drop_nulls(
stream_compaction.drop_nulls(
data_columns,
how=how,
),
Expand Down Expand Up @@ -2103,7 +2098,9 @@ def _apply_boolean_mask(self, boolean_mask):
raise ValueError("boolean_mask is not boolean type.")

return self._from_columns_like_self(
apply_boolean_mask(list(self._columns), boolean_mask),
stream_compaction.apply_boolean_mask(
list(self._columns), boolean_mask
),
column_names=self._column_names,
)

Expand Down
64 changes: 0 additions & 64 deletions python/cudf/cudf/core/_internals/unary.py

This file was deleted.

13 changes: 6 additions & 7 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from typing_extensions import Self

import cudf
from cudf.core._internals import unary
from cudf.core.column import column
from cudf.core.column.methods import ColumnMethods
from cudf.core.dtypes import CategoricalDtype, IntervalDtype
Expand Down Expand Up @@ -126,7 +125,7 @@ def __init__(self, parent: SeriesOrSingleColumnIndex):
super().__init__(parent=parent)

@property
def categories(self) -> "cudf.core.index.Index":
def categories(self) -> cudf.Index:
"""
The categories of this categorical.
"""
Expand Down Expand Up @@ -608,7 +607,7 @@ def children(self) -> tuple[NumericalColumn]:

@property
def categories(self) -> ColumnBase:
return self.dtype.categories._values
return self.dtype.categories._column

@property
def codes(self) -> NumericalColumn:
Expand Down Expand Up @@ -1010,12 +1009,12 @@ def isnull(self) -> ColumnBase:
"""
Identify missing values in a CategoricalColumn.
"""
result = unary.is_null(self)
result = super().isnull()

if self.categories.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of an underlying float column
categories = unary.is_nan(self.categories)
categories = self.categories.isnan()
if categories.any():
code = self._encode(np.nan)
result = result | (self.codes == cudf.Scalar(code))
Expand All @@ -1026,12 +1025,12 @@ def notnull(self) -> ColumnBase:
"""
Identify non-missing values in a CategoricalColumn.
"""
result = unary.is_valid(self)
result = super().is_valid()

if self.categories.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of an underlying float column
categories = unary.is_nan(self.categories)
categories = self.categories.isnan()
if categories.any():
code = self._encode(np.nan)
result = result & (self.codes != cudf.Scalar(code))
Expand Down
93 changes: 72 additions & 21 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,18 @@
_is_non_decimal_numeric_dtype,
_is_pandas_nullable_extension_dtype,
infer_dtype,
is_decimal_dtype,
is_dtype_equal,
is_scalar,
is_string_dtype,
)
from cudf.core._compat import PANDAS_GE_210
from cudf.core._internals import aggregation, copying, sorting, unary
from cudf.core._internals.stream_compaction import (
apply_boolean_mask,
drop_duplicates,
drop_nulls,
from cudf.core._internals import (
aggregation,
copying,
search,
sorting,
stream_compaction,
)
from cudf.core._internals.timezones import get_compatible_timezone
from cudf.core.abc import Serializable
Expand Down Expand Up @@ -294,7 +296,9 @@ def any(self, skipna: bool = True) -> bool:

def dropna(self) -> Self:
if self.has_nulls():
return drop_nulls([self])[0]._with_type_metadata(self.dtype) # type: ignore[return-value]
return stream_compaction.drop_nulls([self])[0]._with_type_metadata(
self.dtype
) # type: ignore[return-value]
else:
return self.copy()

Expand Down Expand Up @@ -706,6 +710,7 @@ def _scatter_by_column(
self,
key: cudf.core.column.NumericalColumn,
value: cudf.core.scalar.Scalar | ColumnBase,
bounds_check: bool = True,
) -> Self:
if key.dtype.kind == "b":
# `key` is boolean mask
Expand Down Expand Up @@ -747,9 +752,9 @@ def _scatter_by_column(
._with_type_metadata(self.dtype)
)
else:
return copying.scatter([value], key, [self])[
0
]._with_type_metadata(self.dtype)
return copying.scatter(
[value], key, [self], bounds_check=bounds_check
)[0]._with_type_metadata(self.dtype)

def _check_scatter_key_length(
self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase
Expand Down Expand Up @@ -827,17 +832,45 @@ def fillna(
result = type(self).from_pylibcudf(plc_column)
return result._with_type_metadata(self.dtype) # type: ignore[return-value]

@acquire_spill_lock()
def is_valid(self) -> ColumnBase:
    """Return a boolean column marking the non-null entries of this column."""
    plc_result = plc.unary.is_valid(self.to_pylibcudf(mode="read"))
    return type(self).from_pylibcudf(plc_result)

def isnan(self) -> ColumnBase:
    """Return a boolean column marking entries that are NaN.

    Only floating-point columns can hold NaN, so for any other dtype
    the result is an all-False column of the same length.
    """
    if self.dtype.kind == "f":
        with acquire_spill_lock():
            return type(self).from_pylibcudf(
                plc.unary.is_nan(self.to_pylibcudf(mode="read"))
            )
    return as_column(False, length=len(self))

def notnan(self) -> ColumnBase:
    """Return a boolean column marking entries that are not NaN.

    Only floating-point columns can hold NaN, so for any other dtype
    the result is an all-True column of the same length.
    """
    if self.dtype.kind == "f":
        with acquire_spill_lock():
            return type(self).from_pylibcudf(
                plc.unary.is_not_nan(self.to_pylibcudf(mode="read"))
            )
    return as_column(True, length=len(self))

def isnull(self) -> ColumnBase:
"""Identify missing values in a Column."""
if not self.has_nulls(include_nan=self.dtype.kind == "f"):
return as_column(False, length=len(self))

result = unary.is_null(self)
with acquire_spill_lock():
result = type(self).from_pylibcudf(
plc.unary.is_null(self.to_pylibcudf(mode="read"))
)

if self.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of a float column
result = result | unary.is_nan(self)
result = result | self.isnan()

return result

Expand All @@ -846,15 +879,22 @@ def notnull(self) -> ColumnBase:
if not self.has_nulls(include_nan=self.dtype.kind == "f"):
return as_column(True, length=len(self))

result = unary.is_valid(self)
with acquire_spill_lock():
result = type(self).from_pylibcudf(
plc.unary.is_valid(self.to_pylibcudf(mode="read"))
)

if self.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of a float column
result = result & unary.is_non_nan(self)
result = result & self.notnan()

return result

@cached_property
def nan_count(self) -> int:
    # Base-class implementation: reports zero NaN entries.
    # NOTE(review): presumably floating-point column subclasses override
    # this with a real count — confirm against the float column class.
    return 0

def indices_of(
self, value: ScalarLike
) -> cudf.core.column.NumericalColumn:
Expand All @@ -875,9 +915,9 @@ def indices_of(
else:
value = as_column(value, dtype=self.dtype, length=1)
mask = value.contains(self)
return apply_boolean_mask( # type: ignore[return-value]
[as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask
)[0]
return as_column(
range(len(self)), dtype=SIZE_TYPE_DTYPE
).apply_boolean_mask(mask) # type: ignore[return-value]

def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]:
indices = self.indices_of(value)
Expand Down Expand Up @@ -1124,6 +1164,17 @@ def distinct_count(self, dropna: bool = True) -> int:
def can_cast_safely(self, to_dtype: Dtype) -> bool:
raise NotImplementedError()

@acquire_spill_lock()
def cast(self, dtype: Dtype) -> ColumnBase:
    """Cast this column to ``dtype`` via the pylibcudf unary ``cast``.

    If the cast produces a decimal column, the precision of the
    requested ``dtype`` is copied onto the result's dtype.
    """
    plc_output = plc.unary.cast(
        self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype)
    )
    result = type(self).from_pylibcudf(plc_output)
    if is_decimal_dtype(result.dtype):
        # Re-apply the requested precision to the decimal result dtype.
        result.dtype.precision = dtype.precision  # type: ignore[union-attr]
    return result

def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
if len(self) == 0:
dtype = cudf.dtype(dtype)
Expand Down Expand Up @@ -1257,9 +1308,9 @@ def apply_boolean_mask(self, mask) -> ColumnBase:
if mask.dtype.kind != "b":
raise ValueError("boolean_mask is not boolean type.")

return apply_boolean_mask([self], mask)[0]._with_type_metadata(
self.dtype
)
return stream_compaction.apply_boolean_mask([self], mask)[
0
]._with_type_metadata(self.dtype)

def argsort(
self,
Expand Down Expand Up @@ -1332,7 +1383,7 @@ def searchsorted(
raise ValueError(
"Column searchsorted expects values to be column of same dtype"
)
return cudf.core._internals.search.search_sorted( # type: ignore[return-value]
return search.search_sorted( # type: ignore[return-value]
[self],
[value],
side=side,
Expand All @@ -1347,7 +1398,7 @@ def unique(self) -> Self:
if self.is_unique:
return self.copy()
else:
return drop_duplicates([self], keep="first")[ # type: ignore[return-value]
return stream_compaction.drop_duplicates([self], keep="first")[ # type: ignore[return-value]
0
]._with_type_metadata(self.dtype)

Expand Down
17 changes: 11 additions & 6 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@
import cudf.core.column.column as column
from cudf import _lib as libcudf
from cudf.core._compat import PANDAS_GE_220
from cudf.core._internals import binaryop, unary
from cudf.core._internals.search import search_sorted
from cudf.core._internals import binaryop
from cudf.core._internals.timezones import (
check_ambiguous_and_nonexistent,
get_compatible_timezone,
Expand Down Expand Up @@ -574,7 +573,7 @@ def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
"Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. "
"Use tz_localize instead."
)
return unary.cast(self, dtype=dtype) # type: ignore[return-value]
return self.cast(dtype=dtype) # type: ignore[return-value]

def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override]
raise TypeError(
Expand Down Expand Up @@ -958,7 +957,7 @@ def tz_localize(
localized.dtype
)
indices = (
search_sorted([transition_times_local], [localized], "right") - 1
transition_times_local.searchsorted(localized, side="right") - 1
)
offsets_to_utc = offsets.take(indices, nullify=True)
gmt_data = localized - offsets_to_utc
Expand Down Expand Up @@ -1043,8 +1042,14 @@ def _utc_time(self):
def _local_time(self):
"""Return the local time as naive timestamps."""
transition_times, offsets = get_tz_data(str(self.dtype.tz))
transition_times = transition_times.astype(_get_base_dtype(self.dtype))
indices = search_sorted([transition_times], [self], "right") - 1
base_dtype = _get_base_dtype(self.dtype)
transition_times = transition_times.astype(base_dtype)
indices = (
transition_times.searchsorted(
self.astype(base_dtype), side="right"
)
- 1
)
offsets_from_utc = offsets.take(indices, nullify=True)
return self + offsets_from_utc

Expand Down
Loading

0 comments on commit 1e8fa70

Please sign in to comment.