Skip to content

Commit

Permalink
Merge branch 'main' into ref-from_sequence_of_strings
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel committed Jan 10, 2024
2 parents a00ff07 + c84f989 commit 2b1103a
Show file tree
Hide file tree
Showing 236 changed files with 2,121 additions and 1,688 deletions.
15 changes: 3 additions & 12 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,6 @@ ci:
# manual stage hooks
skip: [pylint, pyright, mypy]
repos:
- repo: https://github.com/hauntsaninja/black-pre-commit-mirror
# black compiled with mypyc
rev: 23.11.0
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.6
hooks:
Expand All @@ -35,6 +30,9 @@ repos:
files: ^pandas
exclude: ^pandas/tests
args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
- id: ruff-format
# TODO: "." not needed in ruff 0.1.8
args: ["."]
- repo: https://github.com/jendrikseipp/vulture
rev: 'v2.10'
hooks:
Expand Down Expand Up @@ -274,13 +272,6 @@ repos:
language: python
types: [rst]
files: ^doc/source/(development|reference)/
- id: unwanted-patterns-bare-pytest-raises
name: Check for use of bare pytest raises
language: python
entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises"
types: [python]
files: ^pandas/tests/
exclude: ^pandas/tests/extension/
- id: unwanted-patterns-private-function-across-module
name: Check for use of private functions across modules
language: python
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ RUN apt-get update && apt-get -y upgrade
RUN apt-get install -y build-essential

# hdf5 needed for pytables installation
RUN apt-get install -y libhdf5-dev
# libgles2-mesa needed for pytest-qt
RUN apt-get install -y libhdf5-dev libgles2-mesa-dev

RUN python -m pip install --upgrade pip
RUN python -m pip install \
Expand Down
4 changes: 1 addition & 3 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,7 @@ def time_loc_slice(self, index, index_structure):

class NumericMaskedIndexing:
monotonic_list = list(range(10**6))
non_monotonic_list = (
list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))
)
non_monotonic_list = list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))

params = [
("Int64", "UInt64", "Float64"),
Expand Down
3 changes: 2 additions & 1 deletion asv_bench/benchmarks/io/style.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ def _style_format(self):
# apply a formatting function
# subset is flexible but hinders vectorised solutions
self.st = self.df.style.format(
"{:,.3f}", subset=IndexSlice["row_1":f"row_{ir}", "float_1":f"float_{ic}"]
"{:,.3f}",
subset=IndexSlice["row_1" : f"row_{ir}", "float_1" : f"float_{ic}"],
)

def _style_apply_format_hide(self):
Expand Down
10 changes: 8 additions & 2 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,18 @@

set -uo pipefail

[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \
if [[ -v 1 ]]; then
CHECK=$1
else
# script will fail if it uses an unset variable (i.e. $1 is not provided)
CHECK=""
fi

[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; }

BASE_DIR="$(dirname $0)/.."
RET=0
CHECK=$1

### CODE ###
if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then
Expand Down
2 changes: 1 addition & 1 deletion doc/source/development/contributing_codebase.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ Pre-commit
----------

Additionally, :ref:`Continuous Integration <contributing.ci>` will run code formatting checks
like ``black``, ``ruff``,
like ``ruff``,
``isort``, and ``clang-format`` and more using `pre-commit hooks <https://pre-commit.com/>`_.
Any warnings from these checks will cause the :ref:`Continuous Integration <contributing.ci>` to fail; therefore,
it is helpful to run the check yourself before submitting code. This
Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ Reindexing / selection / label manipulation
:toctree: api/

Series.align
Series.case_when
Series.drop
Series.droplevel
Series.drop_duplicates
Expand Down
19 changes: 7 additions & 12 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3471,20 +3471,15 @@ saving a ``DataFrame`` to Excel. Generally the semantics are
similar to working with :ref:`csv<io.read_csv_table>` data.
See the :ref:`cookbook<cookbook.excel>` for some advanced strategies.

.. warning::

The `xlrd <https://xlrd.readthedocs.io/en/latest/>`__ package is now only for reading
old-style ``.xls`` files.
.. note::

Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel`
would result in using the ``xlrd`` engine in many cases, including new
Excel 2007+ (``.xlsx``) files. pandas will now default to using the
`openpyxl <https://openpyxl.readthedocs.io/en/stable/>`__ engine.
When ``engine=None``, the following logic will be used to determine the engine:

It is strongly encouraged to install ``openpyxl`` to read Excel 2007+
(``.xlsx``) files.
**Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.**
This is no longer supported, switch to using ``openpyxl`` instead.
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used.
- Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used.
- Otherwise ``openpyxl`` will be used.

.. _io.excel_reader:

Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ In a future version, these will raise an error and you should cast to a common d
In [3]: ser[0] = 'not an int64'
FutureWarning:
Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas.
Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas.
Value 'not an int64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
In [4]: ser
Expand Down
31 changes: 30 additions & 1 deletion doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,26 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv
Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_
documentation.

.. _whatsnew_220.enhancements.case_when:

Create a pandas Series based on one or more conditions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The :meth:`Series.case_when` function has been added to create a Series object based on one or more conditions. (:issue:`39154`)

.. ipython:: python
import pandas as pd
df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
default=pd.Series('default', index=df.index)
default.case_when(
caselist=[
(df.a == 1, 'first'), # condition, replacement
(df.a.gt(1) & df.b.eq(5), 'second'), # condition, replacement
],
)
.. _whatsnew_220.enhancements.to_numpy_ea:

``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype
Expand Down Expand Up @@ -321,7 +341,7 @@ Other enhancements
- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`)
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`)
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`)
Expand Down Expand Up @@ -741,6 +761,7 @@ Categorical
^^^^^^^^^^^
- :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`)
- Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`)
- Bug when casting ``pa.dictionary`` to :class:`CategoricalDtype` using a ``pa.DictionaryArray`` as categories (:issue:`56672`)

Datetimelike
^^^^^^^^^^^^
Expand Down Expand Up @@ -786,8 +807,11 @@ Timezones
Numeric
^^^^^^^
- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`)
- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`)
- Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`)
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`)
- Bug in :meth:`Series.round` raising for nullable boolean dtype (:issue:`55936`)

Conversion
^^^^^^^^^^
Expand All @@ -814,6 +838,7 @@ Interval
- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown (:issue:`55015`)
- Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`)
- Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`)
- Bug in :meth:`IntervalIndex.from_tuples` raising if subtype is a nullable extension dtype (:issue:`56765`)
- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
- Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`)
Expand Down Expand Up @@ -845,6 +870,7 @@ I/O
- Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`)
- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`)
- Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`)
- Bug in :meth:`DataFrame.to_stata` raising for extension dtypes (:issue:`54671`)
- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell contains an annotation (:issue:`55200`)
- Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`)
- Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`)
Expand Down Expand Up @@ -872,6 +898,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`)
- Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`)
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`)
- Bug in :meth:`DataFrame.groupby` for DataFrame subclasses when selecting a subset of columns to apply the function to (:issue:`56761`)
- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
- Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
Expand All @@ -887,6 +914,7 @@ Reshaping
- Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`)
- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`)
- Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`)
- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`)
- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
- Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`)
- Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)
Expand All @@ -900,6 +928,7 @@ Sparse

Other
^^^^^
- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`)
- Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% (:issue:`55765`)
- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
Expand Down
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ Other API changes

Deprecations
~~~~~~~~~~~~
-
- Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
- Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
-

.. ---------------------------------------------------------------------------
Expand All @@ -108,6 +109,8 @@ Performance improvements

Bug fixes
~~~~~~~~~
- Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)


Categorical
^^^^^^^^^^^
Expand Down
7 changes: 5 additions & 2 deletions pandas/_libs/hashtable.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -196,15 +196,18 @@ class HashTable:
*,
return_inverse: Literal[True],
mask: None = ...,
) -> tuple[np.ndarray, npt.NDArray[np.intp],]: ... # np.ndarray[subclass-specific]
) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific]
@overload
def unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
*,
return_inverse: Literal[False] = ...,
mask: npt.NDArray[np.bool_],
) -> tuple[np.ndarray, npt.NDArray[np.bool_],]: ... # np.ndarray[subclass-specific]
) -> tuple[
np.ndarray,
npt.NDArray[np.bool_],
]: ... # np.ndarray[subclass-specific]
def factorize(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
Expand Down
6 changes: 4 additions & 2 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ def indices_fast(
sorted_labels: list[npt.NDArray[np.int64]],
) -> dict[Hashable, npt.NDArray[np.intp]]: ...
def generate_slices(
labels: np.ndarray, ngroups: int # const intp_t[:]
labels: np.ndarray,
ngroups: int, # const intp_t[:]
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ...
def count_level_2d(
mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True],
Expand Down Expand Up @@ -209,5 +210,6 @@ def get_reverse_indexer(
def is_bool_list(obj: list) -> bool: ...
def dtypes_all_equal(types: list[DtypeObj]) -> bool: ...
def is_range_indexer(
left: np.ndarray, n: int # np.ndarray[np.int64, ndim=1]
left: np.ndarray,
n: int, # np.ndarray[np.int64, ndim=1]
) -> bool: ...
15 changes: 15 additions & 0 deletions pandas/_libs/tslibs/offsets.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4734,6 +4734,10 @@ cpdef to_offset(freq, bint is_period=False):
Parameters
----------
freq : str, datetime.timedelta, BaseOffset or None
The frequency represented.
is_period : bool, default False
Convert string denoting period frequency to corresponding offsets
frequency if is_period=True.
Returns
-------
Expand Down Expand Up @@ -4768,6 +4772,17 @@ cpdef to_offset(freq, bint is_period=False):
>>> to_offset(pd.offsets.Hour())
<Hour>
Passing the parameter ``is_period`` equal to True, you can use a string
denoting period frequency:
>>> freq = to_offset(freq="ME", is_period=False)
>>> freq.rule_code
'ME'
>>> freq = to_offset(freq="M", is_period=True)
>>> freq.rule_code
'ME'
"""
if freq is None:
return None
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ cdef bint parse_today_now(
if infer_reso:
creso = NPY_DATETIMEUNIT.NPY_FR_us
if utc:
ts = <_Timestamp>Timestamp.utcnow()
ts = <_Timestamp>Timestamp.now(timezone.utc)
iresult[0] = ts._as_creso(creso)._value
else:
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
Expand Down
16 changes: 16 additions & 0 deletions pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1418,6 +1418,14 @@ class Timestamp(_Timestamp):
>>> pd.Timestamp.utcnow() # doctest: +SKIP
Timestamp('2020-11-16 22:50:18.092888+0000', tz='UTC')
"""
warnings.warn(
# The stdlib datetime.utcnow is deprecated, so we deprecate to match.
# GH#56680
"Timestamp.utcnow is deprecated and will be removed in a future "
"version. Use Timestamp.now('UTC') instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
return cls.now(UTC)

@classmethod
Expand All @@ -1438,6 +1446,14 @@ class Timestamp(_Timestamp):
Timestamp('2020-03-14 15:32:52+0000', tz='UTC')
"""
# GH#22451
warnings.warn(
# The stdlib datetime.utcfromtimestamp is deprecated, so we deprecate
# to match. GH#56680
"Timestamp.utcfromtimestamp is deprecated and will be removed in a "
"future version. Use Timestamp.fromtimestamp(ts, 'UTC') instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
return cls.fromtimestamp(ts, tz="UTC")

@classmethod
Expand Down
8 changes: 2 additions & 6 deletions pandas/_testing/_hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,8 @@
DATETIME_NO_TZ = st.datetimes()

DATETIME_JAN_1_1900_OPTIONAL_TZ = st.datetimes(
min_value=pd.Timestamp(
1900, 1, 1
).to_pydatetime(), # pyright: ignore[reportGeneralTypeIssues]
max_value=pd.Timestamp(
1900, 1, 1
).to_pydatetime(), # pyright: ignore[reportGeneralTypeIssues]
min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), # pyright: ignore[reportGeneralTypeIssues]
max_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), # pyright: ignore[reportGeneralTypeIssues]
timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()),
)

Expand Down
Loading

0 comments on commit 2b1103a

Please sign in to comment.