diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index d399b4780a938..365f85a908099 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -296,7 +296,9 @@ def _fill_mask_inplace( func(self._ndarray.T, limit=limit, mask=mask.T) @doc(ExtensionArray.fillna) - def fillna(self, value=None, method=None, limit: int | None = None) -> Self: + def fillna( + self, value=None, method=None, limit: int | None = None, copy: bool = True + ) -> Self: value, method = validate_fillna_kwargs( value, method, validate_scalar_dict_value=False ) @@ -313,7 +315,9 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self: # TODO: check value is None # (for now) when self.ndim == 2, we assume axis=0 func = missing.get_fill_func(method, ndim=self.ndim) - npvalues = self._ndarray.T.copy() + npvalues = self._ndarray.T + if copy: + npvalues = npvalues.copy() func(npvalues, limit=limit, mask=mask.T) npvalues = npvalues.T @@ -322,14 +326,20 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self: new_values = self._from_backing_data(npvalues) else: # fill with value - new_values = self.copy() + if copy: + new_values = self.copy() + else: + new_values = self[:] new_values[mask] = value else: # We validate the fill_value even if there is nothing to fill if value is not None: self._validate_setitem_value(value) - new_values = self.copy() + if not copy: + new_values = self[:] + else: + new_values = self.copy() return new_values # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1dcaba34c36f3..7976f97cb49aa 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -905,6 +905,7 @@ def fillna( value: object | ArrayLike | None = None, method: FillnaOptions | None = None, limit: int | None = None, + copy: bool = True, ) -> Self: value, method = validate_fillna_kwargs(value, method) @@ -912,11 +913,8 @@ def fillna( # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() - if limit is not None: - return super().fillna(value=value, method=method, limit=limit) - - if method is not None: - return super().fillna(value=value, method=method, limit=limit) + if limit is not None or method is not None: + return super().fillna(value=value, method=method, limit=limit, copy=copy) if isinstance(value, (np.ndarray, ExtensionArray)): # Similar to check_value_size, but we do not mask here since we may @@ -959,7 +957,7 @@ def convert_fill_value(value, pa_type, dtype): # a kernel for duration types. pass - return super().fillna(value=value, method=method, limit=limit) + return super().fillna(value=value, method=method, limit=limit, copy=copy) def isin(self, values) -> npt.NDArray[np.bool_]: # short-circuit to return all False array. diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5ca9716f5c8d6..265d8aa41f53f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -872,6 +872,7 @@ def fillna( value: object | ArrayLike | None = None, method: FillnaOptions | None = None, limit: int | None = None, + copy: bool = True, ) -> Self: """ Fill NA/NaN values using the specified method. @@ -896,6 +897,14 @@ def fillna( maximum number of entries along the entire axis where NaNs will be filled. + copy : bool, default True + Whether to make a copy of the data before filling. If False, then + the original should be modified and no new memory should be allocated. + For ExtensionArray subclasses that cannot do this, it is at the + author's discretion whether to ignore "copy=False" or to raise. + The base class implementation ignores the keyword in pad/backfill + cases. + Returns ------- ExtensionArray @@ -932,10 +941,16 @@ def fillna( return self[::-1].take(indexer, allow_fill=True) else: # fill with value - new_values = self.copy() + if not copy: + new_values = self[:] + else: + new_values = self.copy() new_values[mask] = value else: - new_values = self.copy() + if not copy: + new_values = self[:] + else: + new_values = self.copy() return new_values def dropna(self) -> Self: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index fad866374a9b1..01f631b54a1d7 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -889,7 +889,9 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr indexer = obj.argsort()[-1] return obj[indexer] - def fillna(self, value=None, method=None, limit: int | None = None) -> Self: + def fillna( + self, value=None, method=None, limit: int | None = None, copy: bool = True + ) -> Self: """ Fill NA/NaN values using the specified method. @@ -911,11 +913,18 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self: be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. + copy : bool, default True + Whether to make a copy of the data before filling. If False, then + the original should be modified and no new memory should be allocated. + For ExtensionArray subclasses that cannot do this, it is at the + author's discretion whether to ignore "copy=False" or to raise. Returns ------- filled : IntervalArray with NA/NaN filled """ + if copy is False: + raise NotImplementedError if method is not None: return super().fillna(value=value, method=method, limit=limit) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 6af30f31270d7..955adfab7f8dc 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -185,7 +185,9 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._simple_new(self._data[item], newmask) @doc(ExtensionArray.fillna) - def fillna(self, value=None, method=None, limit: int | None = None) -> Self: + def fillna( + self, value=None, method=None, limit: int | None = None, copy: bool = True + ) -> Self: value, method = validate_fillna_kwargs(value, method) mask = self._mask @@ -195,16 +197,25 @@ def fillna(self, value=None, method=None, limit: int | None = None) -> Self: if mask.any(): if method is not None: func = missing.get_fill_func(method, ndim=self.ndim) - npvalues = self._data.copy().T - new_mask = mask.copy().T + npvalues = self._data.T + new_mask = mask.T + if copy: + npvalues = npvalues.copy() + new_mask = new_mask.copy() func(npvalues, limit=limit, mask=new_mask) return self._simple_new(npvalues.T, new_mask.T) else: # fill with value - new_values = self.copy() + if copy: + new_values = self.copy() + else: + new_values = self[:] new_values[mask] = value else: - new_values = self.copy() + if copy: + new_values = self.copy() + else: + new_values = self[:] return new_values @classmethod diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index a8bbe607caba5..af6402b9964e5 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -790,16 +790,18 @@ def searchsorted( m8arr = self._ndarray.view("M8[ns]") return m8arr.searchsorted(npvalue, side=side, sorter=sorter) - def fillna(self, value=None, method=None, limit: int | None = None) -> Self: + def fillna( + self, value=None, method=None, limit: int | None = None, copy: bool = True + ) -> Self: if method is not None: # view as dt64 so we get treated as timelike in core.missing, # similar to dtl._period_dispatch dta = self.view("M8[ns]") - result = dta.fillna(value=value, method=method, limit=limit) + result = dta.fillna(value=value, method=method, limit=limit, copy=copy) # error: Incompatible return value type (got "Union[ExtensionArray, # ndarray[Any, Any]]", expected "PeriodArray") return result.view(self.dtype) # type: ignore[return-value] - return super().fillna(value=value, method=method, limit=limit) + return super().fillna(value=value, method=method, limit=limit, copy=copy) # ------------------------------------------------------------------ # Arithmetic Methods diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 40079f8d7ff55..f666f4f8459d7 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -717,6 +717,7 @@ def fillna( value=None, method: FillnaOptions | None = None, limit: int | None = None, + copy: bool = True, ) -> Self: """ Fill missing values with `value`. @@ -734,6 +735,9 @@ def fillna( limit : int, optional + copy: bool, default True + Ignored for SparseArray. + Returns ------- SparseArray diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 64906d8792a8a..e584b40cf2a9b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -10,6 +10,7 @@ cast, final, ) +import warnings import numpy as np @@ -41,6 +42,7 @@ ) from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.astype import ( @@ -1895,12 +1897,32 @@ def pad_or_backfill( using_cow: bool = False, ) -> list[Block]: values = self.values + copy, refs = self._get_refs_and_copy(using_cow, inplace) + if values.ndim == 2 and axis == 1: # NDArrayBackedExtensionArray.fillna assumes axis=0 - new_values = values.T.fillna(method=method, limit=limit).T + new_values = values.T.fillna(method=method, limit=limit, copy=copy).T else: - new_values = values.fillna(method=method, limit=limit) - return [self.make_block_same_class(new_values)] + try: + new_values = values.fillna(method=method, limit=limit, copy=copy) + except TypeError: + # 3rd party EA that has not implemented copy keyword yet + refs = None + new_values = values.fillna(method=method, limit=limit) + # issue the warning *after* retrying, in case the TypeError + # was caused by an invalid fill_value + warnings.warn( + # GH#53278 + "ExtensionArray.fillna added a 'copy' keyword in pandas " + "2.1.0. In a future version, ExtensionArray subclasses will " + "need to implement this keyword or an exception will be " + "raised. In the interim, the keyword is ignored by " + f"{type(self.values).__name__}.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return [self.make_block_same_class(new_values, refs=refs)] class ExtensionBlock(libinternals.Block, EABackedBlock): @@ -1938,8 +1960,29 @@ def fillna( refs = self.refs new_values = self.values else: - refs = None - new_values = self.values.fillna(value=value, method=None, limit=limit) + copy, refs = self._get_refs_and_copy(using_cow, inplace) + + try: + new_values = self.values.fillna( + value=value, method=None, limit=limit, copy=copy + ) + except TypeError: + # 3rd party EA that has not implemented copy keyword yet + refs = None + new_values = self.values.fillna(value=value, method=None, limit=limit) + # issue the warning *after* retrying, in case the TypeError + # was caused by an invalid fill_value + warnings.warn( + # GH#53278 + "ExtensionArray.fillna added a 'copy' keyword in pandas " + "2.1.0. In a future version, ExtensionArray subclasses will " + "need to implement this keyword or an exception will be " + "raised. In the interim, the keyword is ignored by " + f"{type(self.values).__name__}.", + FutureWarning, + stacklevel=find_stack_level(), + ) + nb = self.make_block_same_class(new_values, refs=refs) return nb._maybe_downcast([nb], downcast, using_cow=using_cow) diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index eb10e3b4e716a..cfaae644af39a 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -331,19 +331,23 @@ def test_fillna_inplace_ea_noop_shares_memory( view = df[:] df.fillna(100, inplace=True) - assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + else: + # MaskedArray can actually respect inplace=True + assert np.shares_memory(get_array(df, "a"), get_array(view, "a")) + assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) if using_copy_on_write: - assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) assert not df._mgr._has_no_reference(1) assert not view._mgr._has_no_reference(1) - elif isinstance(df.dtypes.iloc[0], ArrowDtype): - # arrow is immutable, so no-ops do not need to copy underlying array - assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) - else: - assert not np.shares_memory(get_array(df, "b"), get_array(view, "b")) + df.iloc[0, 1] = 100 - tm.assert_frame_equal(df_orig, view) + if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write: + tm.assert_frame_equal(df_orig, view) + else: + # we actually have a view + tm.assert_frame_equal(df, view) def test_fillna_chained_assignment(using_copy_on_write): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index fc579a50fef78..3101bbd171f75 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -281,6 +281,17 @@ def convert_values(param): def value_counts(self, dropna: bool = True): return value_counts(self.to_numpy(), dropna=dropna) + # Simulate a 3rd-party EA that has not yet updated to include a "copy" + # keyword in its fillna method. + # error: Signature of "fillna" incompatible with supertype "ExtensionArray" + def fillna( # type: ignore[override] + self, + value=None, + method=None, + limit: int | None = None, + ): + return super().fillna(value=value, method=method, limit=limit, copy=True) + def to_decimal(values, context=None): return DecimalArray([decimal.Decimal(x) for x in values], context=context) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 3f6b1ec8d20dd..4f0ff427dd900 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -97,7 +97,45 @@ class TestIndex(base.BaseIndexTests): class TestMissing(base.BaseMissingTests): - pass + def test_fillna_frame(self, data_missing): + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_frame(data_missing) + + def test_fillna_limit_pad(self, data_missing): + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_limit_pad(data_missing) + + def test_fillna_limit_backfill(self, data_missing): + msg = "|".join( + [ + "ExtensionArray.fillna added a 'copy' keyword", + "Series.fillna with 'method' is deprecated", + ] + ) + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_limit_backfill(data_missing) + + def test_fillna_series(self, data_missing): + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_series(data_missing) + + def test_fillna_series_method(self, data_missing, fillna_method): + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_series_method(data_missing, fillna_method) class Reduce: @@ -168,6 +206,18 @@ class TestBooleanReduce(Reduce, base.BaseBooleanReduceTests): class TestMethods(base.BaseMethodsTests): + def test_fillna_copy_frame(self, data_missing, using_copy_on_write): + warn = FutureWarning if not using_copy_on_write else None + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + super().test_fillna_copy_frame(data_missing) + + def test_fillna_copy_series(self, data_missing, using_copy_on_write): + warn = FutureWarning if not using_copy_on_write else None + msg = "ExtensionArray.fillna added a 'copy' keyword" + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + super().test_fillna_copy_series(data_missing) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna, request): all_data = all_data[:10]