Skip to content

Commit

Permalink
Standardize methods used from cudf.core._internals (rapidsai#17765)
Browse files Browse the repository at this point in the history
* Standardizes imports from `cudf.core._internals`
* Makes functions in `cudf.core._internals.unary` into methods on `cudf.core.column.ColumnBase`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: rapidsai#17765
  • Loading branch information
mroeschke authored Jan 23, 2025
1 parent ffc5193 commit 1e8fa70
Show file tree
Hide file tree
Showing 16 changed files with 145 additions and 161 deletions.
15 changes: 6 additions & 9 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,7 @@
import cudf
from cudf.api.extensions import no_default
from cudf.api.types import is_integer, is_list_like, is_scalar
from cudf.core._internals import copying
from cudf.core._internals.stream_compaction import (
apply_boolean_mask,
drop_duplicates,
drop_nulls,
)
from cudf.core._internals import copying, stream_compaction
from cudf.core.abc import Serializable
from cudf.core.column import ColumnBase, column
from cudf.core.copy_types import GatherMap
Expand Down Expand Up @@ -1945,7 +1940,7 @@ def drop_duplicates(
# This utilizes the fact that all `Index` is also a `Frame`.
# Except RangeIndex.
return self._from_columns_like_self(
drop_duplicates(
stream_compaction.drop_duplicates(
list(self._columns),
keep=keep,
nulls_are_equal=nulls_are_equal,
Expand Down Expand Up @@ -2032,7 +2027,7 @@ def dropna(self, how="any"):
data_columns = [col.nans_to_nulls() for col in self._columns]

return self._from_columns_like_self(
drop_nulls(
stream_compaction.drop_nulls(
data_columns,
how=how,
),
Expand Down Expand Up @@ -2103,7 +2098,9 @@ def _apply_boolean_mask(self, boolean_mask):
raise ValueError("boolean_mask is not boolean type.")

return self._from_columns_like_self(
apply_boolean_mask(list(self._columns), boolean_mask),
stream_compaction.apply_boolean_mask(
list(self._columns), boolean_mask
),
column_names=self._column_names,
)

Expand Down
64 changes: 0 additions & 64 deletions python/cudf/cudf/core/_internals/unary.py

This file was deleted.

13 changes: 6 additions & 7 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from typing_extensions import Self

import cudf
from cudf.core._internals import unary
from cudf.core.column import column
from cudf.core.column.methods import ColumnMethods
from cudf.core.dtypes import CategoricalDtype, IntervalDtype
Expand Down Expand Up @@ -126,7 +125,7 @@ def __init__(self, parent: SeriesOrSingleColumnIndex):
super().__init__(parent=parent)

@property
def categories(self) -> "cudf.core.index.Index":
def categories(self) -> cudf.Index:
"""
The categories of this categorical.
"""
Expand Down Expand Up @@ -608,7 +607,7 @@ def children(self) -> tuple[NumericalColumn]:

@property
def categories(self) -> ColumnBase:
return self.dtype.categories._values
return self.dtype.categories._column

@property
def codes(self) -> NumericalColumn:
Expand Down Expand Up @@ -1010,12 +1009,12 @@ def isnull(self) -> ColumnBase:
"""
Identify missing values in a CategoricalColumn.
"""
result = unary.is_null(self)
result = super().isnull()

if self.categories.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of an underlying float column
categories = unary.is_nan(self.categories)
categories = self.categories.isnan()
if categories.any():
code = self._encode(np.nan)
result = result | (self.codes == cudf.Scalar(code))
Expand All @@ -1026,12 +1025,12 @@ def notnull(self) -> ColumnBase:
"""
Identify non-missing values in a CategoricalColumn.
"""
result = unary.is_valid(self)
result = super().is_valid()

if self.categories.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of an underlying float column
categories = unary.is_nan(self.categories)
categories = self.categories.isnan()
if categories.any():
code = self._encode(np.nan)
result = result & (self.codes != cudf.Scalar(code))
Expand Down
93 changes: 72 additions & 21 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,18 @@
_is_non_decimal_numeric_dtype,
_is_pandas_nullable_extension_dtype,
infer_dtype,
is_decimal_dtype,
is_dtype_equal,
is_scalar,
is_string_dtype,
)
from cudf.core._compat import PANDAS_GE_210
from cudf.core._internals import aggregation, copying, sorting, unary
from cudf.core._internals.stream_compaction import (
apply_boolean_mask,
drop_duplicates,
drop_nulls,
from cudf.core._internals import (
aggregation,
copying,
search,
sorting,
stream_compaction,
)
from cudf.core._internals.timezones import get_compatible_timezone
from cudf.core.abc import Serializable
Expand Down Expand Up @@ -294,7 +296,9 @@ def any(self, skipna: bool = True) -> bool:

def dropna(self) -> Self:
if self.has_nulls():
return drop_nulls([self])[0]._with_type_metadata(self.dtype) # type: ignore[return-value]
return stream_compaction.drop_nulls([self])[0]._with_type_metadata(
self.dtype
) # type: ignore[return-value]
else:
return self.copy()

Expand Down Expand Up @@ -706,6 +710,7 @@ def _scatter_by_column(
self,
key: cudf.core.column.NumericalColumn,
value: cudf.core.scalar.Scalar | ColumnBase,
bounds_check: bool = True,
) -> Self:
if key.dtype.kind == "b":
# `key` is boolean mask
Expand Down Expand Up @@ -747,9 +752,9 @@ def _scatter_by_column(
._with_type_metadata(self.dtype)
)
else:
return copying.scatter([value], key, [self])[
0
]._with_type_metadata(self.dtype)
return copying.scatter(
[value], key, [self], bounds_check=bounds_check
)[0]._with_type_metadata(self.dtype)

def _check_scatter_key_length(
self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase
Expand Down Expand Up @@ -827,17 +832,45 @@ def fillna(
result = type(self).from_pylibcudf(plc_column)
return result._with_type_metadata(self.dtype) # type: ignore[return-value]

@acquire_spill_lock()
def is_valid(self) -> ColumnBase:
    """Return a boolean column marking the non-null entries of this column."""
    plc_result = plc.unary.is_valid(self.to_pylibcudf(mode="read"))
    return type(self).from_pylibcudf(plc_result)

def isnan(self) -> ColumnBase:
    """Return a boolean column marking entries that are NaN.

    Only floating-point columns can hold NaN, so for any other dtype
    the result is an all-False column of the same length.
    """
    if self.dtype.kind == "f":
        with acquire_spill_lock():
            return type(self).from_pylibcudf(
                plc.unary.is_nan(self.to_pylibcudf(mode="read"))
            )
    return as_column(False, length=len(self))

def notnan(self) -> ColumnBase:
    """Return a boolean column marking entries that are not NaN.

    Only floating-point columns can hold NaN, so for any other dtype
    the result is an all-True column of the same length.
    """
    if self.dtype.kind == "f":
        with acquire_spill_lock():
            return type(self).from_pylibcudf(
                plc.unary.is_not_nan(self.to_pylibcudf(mode="read"))
            )
    return as_column(True, length=len(self))

def isnull(self) -> ColumnBase:
"""Identify missing values in a Column."""
if not self.has_nulls(include_nan=self.dtype.kind == "f"):
return as_column(False, length=len(self))

result = unary.is_null(self)
with acquire_spill_lock():
result = type(self).from_pylibcudf(
plc.unary.is_null(self.to_pylibcudf(mode="read"))
)

if self.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of a float column
result = result | unary.is_nan(self)
result = result | self.isnan()

return result

Expand All @@ -846,15 +879,22 @@ def notnull(self) -> ColumnBase:
if not self.has_nulls(include_nan=self.dtype.kind == "f"):
return as_column(True, length=len(self))

result = unary.is_valid(self)
with acquire_spill_lock():
result = type(self).from_pylibcudf(
plc.unary.is_valid(self.to_pylibcudf(mode="read"))
)

if self.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of a float column
result = result & unary.is_non_nan(self)
result = result & self.notnan()

return result

@cached_property
def nan_count(self) -> int:
    # Base-class implementation: reports zero NaN entries.
    # NOTE(review): presumably floating-point column subclasses override
    # this with a real count — confirm against the float column class.
    return 0

def indices_of(
self, value: ScalarLike
) -> cudf.core.column.NumericalColumn:
Expand All @@ -875,9 +915,9 @@ def indices_of(
else:
value = as_column(value, dtype=self.dtype, length=1)
mask = value.contains(self)
return apply_boolean_mask( # type: ignore[return-value]
[as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask
)[0]
return as_column(
range(len(self)), dtype=SIZE_TYPE_DTYPE
).apply_boolean_mask(mask) # type: ignore[return-value]

def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]:
indices = self.indices_of(value)
Expand Down Expand Up @@ -1124,6 +1164,17 @@ def distinct_count(self, dropna: bool = True) -> int:
def can_cast_safely(self, to_dtype: Dtype) -> bool:
raise NotImplementedError()

@acquire_spill_lock()
def cast(self, dtype: Dtype) -> ColumnBase:
    """Cast this column to ``dtype`` via the pylibcudf unary ``cast``.

    If the cast produces a decimal column, the precision of the
    requested ``dtype`` is copied onto the result's dtype.
    """
    plc_output = plc.unary.cast(
        self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype)
    )
    result = type(self).from_pylibcudf(plc_output)
    if is_decimal_dtype(result.dtype):
        # Re-apply the requested precision to the decimal result dtype.
        result.dtype.precision = dtype.precision  # type: ignore[union-attr]
    return result

def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
if len(self) == 0:
dtype = cudf.dtype(dtype)
Expand Down Expand Up @@ -1257,9 +1308,9 @@ def apply_boolean_mask(self, mask) -> ColumnBase:
if mask.dtype.kind != "b":
raise ValueError("boolean_mask is not boolean type.")

return apply_boolean_mask([self], mask)[0]._with_type_metadata(
self.dtype
)
return stream_compaction.apply_boolean_mask([self], mask)[
0
]._with_type_metadata(self.dtype)

def argsort(
self,
Expand Down Expand Up @@ -1332,7 +1383,7 @@ def searchsorted(
raise ValueError(
"Column searchsorted expects values to be column of same dtype"
)
return cudf.core._internals.search.search_sorted( # type: ignore[return-value]
return search.search_sorted( # type: ignore[return-value]
[self],
[value],
side=side,
Expand All @@ -1347,7 +1398,7 @@ def unique(self) -> Self:
if self.is_unique:
return self.copy()
else:
return drop_duplicates([self], keep="first")[ # type: ignore[return-value]
return stream_compaction.drop_duplicates([self], keep="first")[ # type: ignore[return-value]
0
]._with_type_metadata(self.dtype)

Expand Down
17 changes: 11 additions & 6 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@
import cudf.core.column.column as column
from cudf import _lib as libcudf
from cudf.core._compat import PANDAS_GE_220
from cudf.core._internals import binaryop, unary
from cudf.core._internals.search import search_sorted
from cudf.core._internals import binaryop
from cudf.core._internals.timezones import (
check_ambiguous_and_nonexistent,
get_compatible_timezone,
Expand Down Expand Up @@ -574,7 +573,7 @@ def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
"Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. "
"Use tz_localize instead."
)
return unary.cast(self, dtype=dtype) # type: ignore[return-value]
return self.cast(dtype=dtype) # type: ignore[return-value]

def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override]
raise TypeError(
Expand Down Expand Up @@ -958,7 +957,7 @@ def tz_localize(
localized.dtype
)
indices = (
search_sorted([transition_times_local], [localized], "right") - 1
transition_times_local.searchsorted(localized, side="right") - 1
)
offsets_to_utc = offsets.take(indices, nullify=True)
gmt_data = localized - offsets_to_utc
Expand Down Expand Up @@ -1043,8 +1042,14 @@ def _utc_time(self):
def _local_time(self):
"""Return the local time as naive timestamps."""
transition_times, offsets = get_tz_data(str(self.dtype.tz))
transition_times = transition_times.astype(_get_base_dtype(self.dtype))
indices = search_sorted([transition_times], [self], "right") - 1
base_dtype = _get_base_dtype(self.dtype)
transition_times = transition_times.astype(base_dtype)
indices = (
transition_times.searchsorted(
self.astype(base_dtype), side="right"
)
- 1
)
offsets_from_utc = offsets.take(indices, nullify=True)
return self + offsets_from_utc

Expand Down
Loading

0 comments on commit 1e8fa70

Please sign in to comment.