Skip to content

Commit

Permalink
ENH: EA.fillna copy=True (pandas-dev#53728)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Jul 26, 2023
1 parent 317290a commit edc0870
Show file tree
Hide file tree
Showing 11 changed files with 192 additions and 35 deletions.
18 changes: 14 additions & 4 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,9 @@ def _fill_mask_inplace(
func(self._ndarray.T, limit=limit, mask=mask.T)

@doc(ExtensionArray.fillna)
def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)
Expand All @@ -313,7 +315,9 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
# TODO: check value is None
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T.copy()
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, mask=mask.T)
npvalues = npvalues.T

Expand All @@ -322,14 +326,20 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
new_values = self._from_backing_data(npvalues)
else:
# fill with value
new_values = self.copy()
if copy:
new_values = self.copy()
else:
new_values = self[:]
new_values[mask] = value
else:
# We validate the fill_value even if there is nothing to fill
if value is not None:
self._validate_setitem_value(value)

new_values = self.copy()
if not copy:
new_values = self[:]
else:
new_values = self.copy()
return new_values

# ------------------------------------------------------------------------
Expand Down
10 changes: 4 additions & 6 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,18 +905,16 @@ def fillna(
value: object | ArrayLike | None = None,
method: FillnaOptions | None = None,
limit: int | None = None,
copy: bool = True,
) -> Self:
value, method = validate_fillna_kwargs(value, method)

if not self._hasna:
# TODO(CoW): Not necessary anymore when CoW is the default
return self.copy()

if limit is not None:
return super().fillna(value=value, method=method, limit=limit)

if method is not None:
return super().fillna(value=value, method=method, limit=limit)
if limit is not None or method is not None:
return super().fillna(value=value, method=method, limit=limit, copy=copy)

if isinstance(value, (np.ndarray, ExtensionArray)):
# Similar to check_value_size, but we do not mask here since we may
Expand Down Expand Up @@ -959,7 +957,7 @@ def convert_fill_value(value, pa_type, dtype):
# a kernel for duration types.
pass

return super().fillna(value=value, method=method, limit=limit)
return super().fillna(value=value, method=method, limit=limit, copy=copy)

def isin(self, values) -> npt.NDArray[np.bool_]:
# short-circuit to return all False array.
Expand Down
19 changes: 17 additions & 2 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -872,6 +872,7 @@ def fillna(
value: object | ArrayLike | None = None,
method: FillnaOptions | None = None,
limit: int | None = None,
copy: bool = True,
) -> Self:
"""
Fill NA/NaN values using the specified method.
Expand All @@ -896,6 +897,14 @@ def fillna(
maximum number of entries along the entire axis where NaNs will be
filled.
copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
For ExtensionArray subclasses that cannot do this, it is at the
author's discretion whether to ignore "copy=False" or to raise.
The base class implementation ignores the keyword in pad/backfill
cases.
Returns
-------
ExtensionArray
Expand Down Expand Up @@ -932,10 +941,16 @@ def fillna(
return self[::-1].take(indexer, allow_fill=True)
else:
# fill with value
new_values = self.copy()
if not copy:
new_values = self[:]
else:
new_values = self.copy()
new_values[mask] = value
else:
new_values = self.copy()
if not copy:
new_values = self[:]
else:
new_values = self.copy()
return new_values

def dropna(self) -> Self:
Expand Down
11 changes: 10 additions & 1 deletion pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,9 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr
indexer = obj.argsort()[-1]
return obj[indexer]

def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
"""
Fill NA/NaN values using the specified method.
Expand All @@ -911,11 +913,18 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled.
copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
For ExtensionArray subclasses that cannot do this, it is at the
author's discretion whether to ignore "copy=False" or to raise.
Returns
-------
filled : IntervalArray with NA/NaN filled
"""
if copy is False:
raise NotImplementedError
if method is not None:
return super().fillna(value=value, method=method, limit=limit)

Expand Down
21 changes: 16 additions & 5 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,9 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any:
return self._simple_new(self._data[item], newmask)

@doc(ExtensionArray.fillna)
def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
value, method = validate_fillna_kwargs(value, method)

mask = self._mask
Expand All @@ -195,16 +197,25 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
if mask.any():
if method is not None:
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._data.copy().T
new_mask = mask.copy().T
npvalues = self._data.T
new_mask = mask.T
if copy:
npvalues = npvalues.copy()
new_mask = new_mask.copy()
func(npvalues, limit=limit, mask=new_mask)
return self._simple_new(npvalues.T, new_mask.T)
else:
# fill with value
new_values = self.copy()
if copy:
new_values = self.copy()
else:
new_values = self[:]
new_values[mask] = value
else:
new_values = self.copy()
if copy:
new_values = self.copy()
else:
new_values = self[:]
return new_values

@classmethod
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,16 +790,18 @@ def searchsorted(
m8arr = self._ndarray.view("M8[ns]")
return m8arr.searchsorted(npvalue, side=side, sorter=sorter)

def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
if method is not None:
# view as dt64 so we get treated as timelike in core.missing,
# similar to dtl._period_dispatch
dta = self.view("M8[ns]")
result = dta.fillna(value=value, method=method, limit=limit)
result = dta.fillna(value=value, method=method, limit=limit, copy=copy)
# error: Incompatible return value type (got "Union[ExtensionArray,
# ndarray[Any, Any]]", expected "PeriodArray")
return result.view(self.dtype) # type: ignore[return-value]
return super().fillna(value=value, method=method, limit=limit)
return super().fillna(value=value, method=method, limit=limit, copy=copy)

# ------------------------------------------------------------------
# Arithmetic Methods
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,7 @@ def fillna(
value=None,
method: FillnaOptions | None = None,
limit: int | None = None,
copy: bool = True,
) -> Self:
"""
Fill missing values with `value`.
Expand All @@ -734,6 +735,9 @@ def fillna(
limit : int, optional
copy: bool, default True
Ignored for SparseArray.
Returns
-------
SparseArray
Expand Down
53 changes: 48 additions & 5 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
cast,
final,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -41,6 +42,7 @@
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.astype import (
Expand Down Expand Up @@ -1895,12 +1897,32 @@ def pad_or_backfill(
using_cow: bool = False,
) -> list[Block]:
values = self.values
copy, refs = self._get_refs_and_copy(using_cow, inplace)

if values.ndim == 2 and axis == 1:
# NDArrayBackedExtensionArray.fillna assumes axis=0
new_values = values.T.fillna(method=method, limit=limit).T
new_values = values.T.fillna(method=method, limit=limit, copy=copy).T
else:
new_values = values.fillna(method=method, limit=limit)
return [self.make_block_same_class(new_values)]
try:
new_values = values.fillna(method=method, limit=limit, copy=copy)
except TypeError:
# 3rd party EA that has not implemented copy keyword yet
refs = None
new_values = values.fillna(method=method, limit=limit)
# issue the warning *after* retrying, in case the TypeError
# was caused by an invalid fill_value
warnings.warn(
# GH#53278
"ExtensionArray.fillna added a 'copy' keyword in pandas "
"2.1.0. In a future version, ExtensionArray subclasses will "
"need to implement this keyword or an exception will be "
"raised. In the interim, the keyword is ignored by "
f"{type(self.values).__name__}.",
FutureWarning,
stacklevel=find_stack_level(),
)

return [self.make_block_same_class(new_values, refs=refs)]


class ExtensionBlock(libinternals.Block, EABackedBlock):
Expand Down Expand Up @@ -1938,8 +1960,29 @@ def fillna(
refs = self.refs
new_values = self.values
else:
refs = None
new_values = self.values.fillna(value=value, method=None, limit=limit)
copy, refs = self._get_refs_and_copy(using_cow, inplace)

try:
new_values = self.values.fillna(
value=value, method=None, limit=limit, copy=copy
)
except TypeError:
# 3rd party EA that has not implemented copy keyword yet
refs = None
new_values = self.values.fillna(value=value, method=None, limit=limit)
# issue the warning *after* retrying, in case the TypeError
# was caused by an invalid fill_value
warnings.warn(
# GH#53278
"ExtensionArray.fillna added a 'copy' keyword in pandas "
"2.1.0. In a future version, ExtensionArray subclasses will "
"need to implement this keyword or an exception will be "
"raised. In the interim, the keyword is ignored by "
f"{type(self.values).__name__}.",
FutureWarning,
stacklevel=find_stack_level(),
)

nb = self.make_block_same_class(new_values, refs=refs)
return nb._maybe_downcast([nb], downcast, using_cow=using_cow)

Expand Down
20 changes: 12 additions & 8 deletions pandas/tests/copy_view/test_interp_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,19 +331,23 @@ def test_fillna_inplace_ea_noop_shares_memory(
view = df[:]
df.fillna(100, inplace=True)

assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
else:
# MaskedArray can actually respect inplace=True
assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))

assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
if using_copy_on_write:
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
assert not df._mgr._has_no_reference(1)
assert not view._mgr._has_no_reference(1)
elif isinstance(df.dtypes.iloc[0], ArrowDtype):
# arrow is immutable, so no-ops do not need to copy underlying array
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
else:
assert not np.shares_memory(get_array(df, "b"), get_array(view, "b"))

df.iloc[0, 1] = 100
tm.assert_frame_equal(df_orig, view)
if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
tm.assert_frame_equal(df_orig, view)
else:
# we actually have a view
tm.assert_frame_equal(df, view)


def test_fillna_chained_assignment(using_copy_on_write):
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/extension/decimal/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,17 @@ def convert_values(param):
def value_counts(self, dropna: bool = True):
return value_counts(self.to_numpy(), dropna=dropna)

# Simulate a 3rd-party EA that has not yet updated to include a "copy"
# keyword in its fillna method.
# error: Signature of "fillna" incompatible with supertype "ExtensionArray"
def fillna( # type: ignore[override]
self,
value=None,
method=None,
limit: int | None = None,
):
return super().fillna(value=value, method=method, limit=limit, copy=True)


def to_decimal(values, context=None):
return DecimalArray([decimal.Decimal(x) for x in values], context=context)
Expand Down
Loading

0 comments on commit edc0870

Please sign in to comment.