Merge branch 'main' into ref-from_sequence_of_strings

jbrockmendel · Jan 10, 2024 · 2b1103a · 2b1103a
2 parents a00ff07 + c84f989
commit 2b1103a
Show file tree

Hide file tree

Showing 236 changed files with 2,121 additions and 1,688 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,11 +18,6 @@ ci:
     # manual stage hooks
     skip: [pylint, pyright, mypy]
 repos:
--   repo: https://github.com/hauntsaninja/black-pre-commit-mirror
-    # black compiled with mypyc
-    rev: 23.11.0
-    hooks:
-      - id: black
 -   repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.1.6
     hooks:
@@ -35,6 +30,9 @@ repos:
         files: ^pandas
         exclude: ^pandas/tests
         args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
+    -   id: ruff-format
+        # TODO: "." not needed in ruff 0.1.8
+        args: ["."]
 -   repo: https://github.com/jendrikseipp/vulture
     rev: 'v2.10'
     hooks:
@@ -274,13 +272,6 @@ repos:
         language: python
         types: [rst]
         files: ^doc/source/(development|reference)/
-    -   id: unwanted-patterns-bare-pytest-raises
-        name: Check for use of bare pytest raises
-        language: python
-        entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises"
-        types: [python]
-        files: ^pandas/tests/
-        exclude: ^pandas/tests/extension/
     -   id: unwanted-patterns-private-function-across-module
         name: Check for use of private functions across modules
         language: python

diff --git a/Dockerfile b/Dockerfile
@@ -5,7 +5,8 @@ RUN apt-get update && apt-get -y upgrade
 RUN apt-get install -y build-essential
 
 # hdf5 needed for pytables installation
-RUN apt-get install -y libhdf5-dev
+# libgles2-mesa needed for pytest-qt
+RUN apt-get install -y libhdf5-dev libgles2-mesa-dev
 
 RUN python -m pip install --upgrade pip
 RUN python -m pip install \

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -84,9 +84,7 @@ def time_loc_slice(self, index, index_structure):
 
 class NumericMaskedIndexing:
     monotonic_list = list(range(10**6))
-    non_monotonic_list = (
-        list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))
-    )
+    non_monotonic_list = list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))
 
     params = [
         ("Int64", "UInt64", "Float64"),

diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py
@@ -76,7 +76,8 @@ def _style_format(self):
         # apply a formatting function
         # subset is flexible but hinders vectorised solutions
         self.st = self.df.style.format(
-            "{:,.3f}", subset=IndexSlice["row_1":f"row_{ir}", "float_1":f"float_{ic}"]
+            "{:,.3f}",
+            subset=IndexSlice["row_1" : f"row_{ir}", "float_1" : f"float_{ic}"],
         )
 
     def _style_apply_format_hide(self):

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -16,12 +16,18 @@
 
 set -uo pipefail
 
-[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \
+if [[ -v 1 ]]; then
+    CHECK=$1
+else
+    # script will fail if it uses an unset variable (i.e. $1 is not provided)
+    CHECK=""
+fi
+
+[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \
     { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; }
 
 BASE_DIR="$(dirname $0)/.."
 RET=0
-CHECK=$1
 
 ### CODE ###
 if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then

diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
@@ -38,7 +38,7 @@ Pre-commit
 ----------
 
 Additionally, :ref:`Continuous Integration <contributing.ci>` will run code formatting checks
-like ``black``, ``ruff``,
+like ``ruff``,
 ``isort``, and ``clang-format`` and more using `pre-commit hooks <https://pre-commit.com/>`_.
 Any warnings from these checks will cause the :ref:`Continuous Integration <contributing.ci>` to fail; therefore,
 it is helpful to run the check yourself before submitting code. This

diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
@@ -177,6 +177,7 @@ Reindexing / selection / label manipulation
    :toctree: api/
 
    Series.align
+   Series.case_when
    Series.drop
    Series.droplevel
    Series.drop_duplicates

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -3471,20 +3471,15 @@ saving a ``DataFrame`` to Excel.  Generally the semantics are
 similar to working with :ref:`csv<io.read_csv_table>` data.
 See the :ref:`cookbook<cookbook.excel>` for some advanced strategies.
 
-.. warning::
-
-   The `xlrd <https://xlrd.readthedocs.io/en/latest/>`__ package is now only for reading
-   old-style ``.xls`` files.
+.. note::
 
-   Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel`
-   would result in using the ``xlrd`` engine in many cases, including new
-   Excel 2007+ (``.xlsx``) files. pandas will now default to using the
-   `openpyxl <https://openpyxl.readthedocs.io/en/stable/>`__ engine.
+   When ``engine=None``, the following logic will be used to determine the engine:
 
-   It is strongly encouraged to install ``openpyxl`` to read Excel 2007+
-   (``.xlsx``) files.
-   **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.**
-   This is no longer supported, switch to using ``openpyxl`` instead.
+   - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
+     then `odf <https://pypi.org/project/odfpy/>`_ will be used.
+   - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used.
+   - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used.
+   - Otherwise ``openpyxl`` will be used.
 
 .. _io.excel_reader:
 

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -432,7 +432,7 @@ In a future version, these will raise an error and you should cast to a common d
 
   In [3]: ser[0] = 'not an int64'
   FutureWarning:
-    Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas.
+    Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas.
     Value 'not an int64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
 
   In [4]: ser

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -188,6 +188,26 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv
 Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_
 documentation.
 
+.. _whatsnew_220.enhancements.case_when:
+
+Create a pandas Series based on one or more conditions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :meth:`Series.case_when` function has been added to create a Series object based on one or more conditions. (:issue:`39154`)
+
+.. ipython:: python
+
+   import pandas as pd
+
+   df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
+   default=pd.Series('default', index=df.index)
+   default.case_when(
+        caselist=[
+            (df.a == 1, 'first'),                              # condition, replacement
+            (df.a.gt(1) & df.b.eq(5), 'second'),  # condition, replacement
+        ],
+   )
+
 .. _whatsnew_220.enhancements.to_numpy_ea:
 
 ``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype
@@ -321,7 +341,7 @@ Other enhancements
 - :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
 - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
 - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
-- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`)
+- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`)
 - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
 - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
 - Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`)
@@ -741,6 +761,7 @@ Categorical
 ^^^^^^^^^^^
 - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`)
 - Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`)
+- Bug when casting ``pa.dictionary`` to :class:`CategoricalDtype` using a ``pa.DictionaryArray`` as categories (:issue:`56672`)
 
 Datetimelike
 ^^^^^^^^^^^^
@@ -786,8 +807,11 @@ Timezones
 Numeric
 ^^^^^^^
 - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
+- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`)
 - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`)
 - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`)
+- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`)
+- Bug in :meth:`Series.round` raising for nullable boolean dtype (:issue:`55936`)
 
 Conversion
 ^^^^^^^^^^
@@ -814,6 +838,7 @@ Interval
 - Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown (:issue:`55015`)
 - Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`)
 - Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`)
+- Bug in :meth:`IntervalIndex.from_tuples` raising if subtype is a nullable extension dtype (:issue:`56765`)
 - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
 - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
 - Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`)
@@ -845,6 +870,7 @@ I/O
 - Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`)
 - Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`)
 - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`)
+- Bug in :meth:`DataFrame.to_stata` raising for extension dtypes (:issue:`54671`)
 - Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell contains an annotation (:issue:`55200`)
 - Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`)
 - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`)
@@ -872,6 +898,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`)
 - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`)
 - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`)
+- Bug in :meth:`DataFrame.groupby` for DataFrame subclasses when selecting a subset of columns to apply the function to (:issue:`56761`)
 - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
 - Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`)
 - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
@@ -887,6 +914,7 @@ Reshaping
 - Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`)
 - Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`)
 - Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`)
+- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`)
 - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
 - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`)
 - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)
@@ -900,6 +928,7 @@ Sparse
 
 Other
 ^^^^^
+- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`)
 - Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% (:issue:`55765`)
 - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
 - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)

diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -92,7 +92,8 @@ Other API changes
 
 Deprecations
 ~~~~~~~~~~~~
--
+- Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
+- Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
 -
 
 .. ---------------------------------------------------------------------------
@@ -108,6 +109,8 @@ Performance improvements
 
 Bug fixes
 ~~~~~~~~~
+- Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
+
 
 Categorical
 ^^^^^^^^^^^

diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
@@ -196,15 +196,18 @@ class HashTable:
         *,
         return_inverse: Literal[True],
         mask: None = ...,
-    ) -> tuple[np.ndarray, npt.NDArray[np.intp],]: ...  # np.ndarray[subclass-specific]
+    ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ...  # np.ndarray[subclass-specific]
     @overload
     def unique(
         self,
         values: np.ndarray,  # np.ndarray[subclass-specific]
         *,
         return_inverse: Literal[False] = ...,
         mask: npt.NDArray[np.bool_],
-    ) -> tuple[np.ndarray, npt.NDArray[np.bool_],]: ...  # np.ndarray[subclass-specific]
+    ) -> tuple[
+        np.ndarray,
+        npt.NDArray[np.bool_],
+    ]: ...  # np.ndarray[subclass-specific]
     def factorize(
         self,
         values: np.ndarray,  # np.ndarray[subclass-specific]

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -179,7 +179,8 @@ def indices_fast(
     sorted_labels: list[npt.NDArray[np.int64]],
 ) -> dict[Hashable, npt.NDArray[np.intp]]: ...
 def generate_slices(
-    labels: np.ndarray, ngroups: int  # const intp_t[:]
+    labels: np.ndarray,
+    ngroups: int,  # const intp_t[:]
 ) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ...
 def count_level_2d(
     mask: np.ndarray,  # ndarray[uint8_t, ndim=2, cast=True],
@@ -209,5 +210,6 @@ def get_reverse_indexer(
 def is_bool_list(obj: list) -> bool: ...
 def dtypes_all_equal(types: list[DtypeObj]) -> bool: ...
 def is_range_indexer(
-    left: np.ndarray, n: int  # np.ndarray[np.int64, ndim=1]
+    left: np.ndarray,
+    n: int,  # np.ndarray[np.int64, ndim=1]
 ) -> bool: ...
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
@@ -4734,6 +4734,10 @@ cpdef to_offset(freq, bint is_period=False):
     Parameters
     ----------
     freq : str, datetime.timedelta, BaseOffset or None
+        The frequency represented.
+    is_period : bool, default False
+        Convert string denoting period frequency to corresponding offsets
+        frequency if is_period=True.
 
     Returns
     -------
@@ -4768,6 +4772,17 @@ cpdef to_offset(freq, bint is_period=False):
 
     >>> to_offset(pd.offsets.Hour())
     <Hour>
+
+    Passing the parameter ``is_period`` equal to True, you can use a string
+    denoting period frequency:
+
+    >>> freq = to_offset(freq="ME", is_period=False)
+    >>> freq.rule_code
+    'ME'
+
+    >>> freq = to_offset(freq="M", is_period=True)
+    >>> freq.rule_code
+    'ME'
     """
     if freq is None:
         return None

diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
@@ -132,7 +132,7 @@ cdef bint parse_today_now(
         if infer_reso:
             creso = NPY_DATETIMEUNIT.NPY_FR_us
         if utc:
-            ts = <_Timestamp>Timestamp.utcnow()
+            ts = <_Timestamp>Timestamp.now(timezone.utc)
             iresult[0] = ts._as_creso(creso)._value
         else:
             # GH#18705 make sure to_datetime("now") matches Timestamp("now")

diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
@@ -1418,6 +1418,14 @@ class Timestamp(_Timestamp):
         >>> pd.Timestamp.utcnow()   # doctest: +SKIP
         Timestamp('2020-11-16 22:50:18.092888+0000', tz='UTC')
         """
+        warnings.warn(
+            # The stdlib datetime.utcnow is deprecated, so we deprecate to match.
+            #  GH#56680
+            "Timestamp.utcnow is deprecated and will be removed in a future "
+            "version. Use Timestamp.now('UTC') instead.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
         return cls.now(UTC)
 
     @classmethod
@@ -1438,6 +1446,14 @@ class Timestamp(_Timestamp):
         Timestamp('2020-03-14 15:32:52+0000', tz='UTC')
         """
         # GH#22451
+        warnings.warn(
+            # The stdlib datetime.utcfromtimestamp is deprecated, so we deprecate
+            #  to match. GH#56680
+            "Timestamp.utcfromtimestamp is deprecated and will be removed in a "
+            "future version. Use Timestamp.fromtimestamp(ts, 'UTC') instead.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
         return cls.fromtimestamp(ts, tz="UTC")
 
     @classmethod

diff --git a/pandas/_testing/_hypothesis.py b/pandas/_testing/_hypothesis.py
@@ -54,12 +54,8 @@
     DATETIME_NO_TZ = st.datetimes()
 
 DATETIME_JAN_1_1900_OPTIONAL_TZ = st.datetimes(
-    min_value=pd.Timestamp(
-        1900, 1, 1
-    ).to_pydatetime(),  # pyright: ignore[reportGeneralTypeIssues]
-    max_value=pd.Timestamp(
-        1900, 1, 1
-    ).to_pydatetime(),  # pyright: ignore[reportGeneralTypeIssues]
+    min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(),  # pyright: ignore[reportGeneralTypeIssues]
+    max_value=pd.Timestamp(1900, 1, 1).to_pydatetime(),  # pyright: ignore[reportGeneralTypeIssues]
     timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()),
 )