feat: support pandas 2.1

dimastbk · Sep 4, 2023 · 70314b8
1 parent 8d86688
commit 70314b8
Show file tree

Hide file tree

Showing 7 changed files with 677 additions and 153 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -13,19 +13,33 @@ on:
 
 jobs:
   test:
-    name: test ${{ matrix.python-version }} rust stable
+    name: test ${{ matrix.python-version }} pandas ${{ matrix.pandas-version }}
     strategy:
       fail-fast: false
       matrix:
         python-version:
-        - '3.8'
         - '3.9'
         - '3.10'
         - '3.11'
-        - '3.12'
-        - pypy3.8
-        - pypy3.9
-        - pypy3.10
+        pandas-version:
+        - '2.0.*'
+        - '2.1.*'
+        include:
+        - python-version: '3.8'
+          pandas-version: '2.0.*'
+        # https://github.com/pandas-dev/pandas/issues/53665
+        - python-version: '3.12'
+          pandas-version: 'none'
+        # https://github.com/pandas-dev/pandas/issues/42509
+        - python-version: 'pypy3.8'
+          pandas-version: 'none'
+        # https://github.com/pandas-dev/pandas/issues/42509
+        - python-version: 'pypy3.9'
+          pandas-version: 'none'
+        # https://github.com/pandas-dev/pandas/issues/42509
+        - python-version: 'pypy3.10'
+          pandas-version: 'none'
+
 
     runs-on: ubuntu-latest
 
@@ -61,7 +75,13 @@ jobs:
           py
           ${{ runner.os }}
           ${{ env.pythonLocation }}
-    - run: pip install pytest
+
+    - run: |
+        if [ "${{ matrix.pandas-version }}" == 'none' ]; then  # quoted: values like 2.0.* are glob patterns
+          pip install pytest
+        else
+          pip install pytest "pandas[excel]==${{ matrix.pandas-version }}"
+        fi
       if: steps.cache-py.outputs.cache-hit != 'true'
 
     - run: pip install -e .

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,8 +33,10 @@ authors = ["Dmitriy <[email protected]>"]
 
 [tool.poetry.dependencies]
 python = "^3.8"
+packaging = "^23.1"
 
 [tool.poetry.group.dev.dependencies]
 maturin = "^1.0.1"
 pre-commit = "^3.0.1"
 pytest = "^7.2.1"
+pandas = {version = "^2.0.0", extras = ["excel"]}
diff --git a/python/python_calamine/_python_calamine.pyi b/python/python_calamine/_python_calamine.pyi
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 import enum
-from datetime import date, datetime, time
+from datetime import date, datetime, time, timedelta
 from os import PathLike
 from typing import Protocol
 
-ValueT = int | float | str | bool | time | date | datetime
+ValueT = int | float | str | bool | time | date | datetime | timedelta
 
 class ReadBuffer(Protocol):
     def seek(self) -> int: ...

diff --git a/python/python_calamine/pandas.py b/python/python_calamine/pandas.py
@@ -1,27 +1,37 @@
 from __future__ import annotations
 
-from datetime import date, datetime, time
-from typing import Union, cast
+from datetime import date, datetime, time, timedelta
+from importlib.metadata import version
+from typing import TYPE_CHECKING, Union, cast
 
 import pandas as pd
-from pandas._typing import FilePath, ReadBuffer, Scalar, StorageOptions
+from packaging.version import Version, parse
+from pandas._typing import Scalar
 from pandas.compat._optional import import_optional_dependency
 from pandas.core.shared_docs import _shared_docs
 from pandas.io.excel import ExcelFile
 from pandas.io.excel._base import BaseExcelReader
 from pandas.util._decorators import doc
 
-_ValueT = Union[int, float, str, bool, time, date, datetime]
+if TYPE_CHECKING:
+    from pandas._typing import FilePath, ReadBuffer, StorageOptions
+    from python_calamine import CalamineSheet, CalamineWorkbook
+
+_CellValueT = Union[int, float, str, bool, time, date, datetime, timedelta]
+
+
+PANDAS_VERSION = parse(version("pandas"))
 
 
 class CalamineExcelReader(BaseExcelReader):
-    _sheet_names: list[str] | None = None
+    book: CalamineWorkbook
 
     @doc(storage_options=_shared_docs["storage_options"])
     def __init__(
         self,
         filepath_or_buffer: FilePath | ReadBuffer[bytes],
-        storage_options: StorageOptions = None,
+        storage_options: StorageOptions | None = None,
+        engine_kwargs: dict | None = None,
     ) -> None:
         """
         Reader using calamine engine (xlsx/xls/xlsb/ods).
@@ -31,37 +41,61 @@ def __init__(
         filepath_or_buffer : str, path to be parsed or
             an open readable stream.
         {storage_options}
+        engine_kwargs : dict, optional
+            Arbitrary keyword arguments passed to excel engine.
         """
         import_optional_dependency("python_calamine")
-        super().__init__(filepath_or_buffer, storage_options=storage_options)
+        if PANDAS_VERSION >= Version("2.1.0"):
+            super().__init__(
+                filepath_or_buffer,
+                storage_options=storage_options,
+                engine_kwargs=engine_kwargs,
+            )
+        elif PANDAS_VERSION >= Version("2.0.0"):
+            super().__init__(
+                filepath_or_buffer,
+                storage_options=storage_options,
+            )
+        else:
+            raise ValueError("Pandas >= 2 is only supported")
 
     @property
-    def _workbook_class(self):
+    def _workbook_class(self) -> type[CalamineWorkbook]:
         from python_calamine import CalamineWorkbook
 
         return CalamineWorkbook
 
-    def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
+    def load_workbook(
+        self,
+        filepath_or_buffer: FilePath | ReadBuffer[bytes],
+        engine_kwargs: dict | None = None,
+    ) -> CalamineWorkbook:
         from python_calamine import load_workbook
 
-        return load_workbook(filepath_or_buffer)
+        return load_workbook(filepath_or_buffer, **(engine_kwargs or {}))
 
     @property
     def sheet_names(self) -> list[str]:
-        return self.book.sheet_names  # pyright: ignore
+        from python_calamine import SheetTypeEnum
+
+        return [
+            sheet.name
+            for sheet in self.book.sheets_metadata
+            if sheet.typ == SheetTypeEnum.WorkSheet
+        ]
 
-    def get_sheet_by_name(self, name: str):
+    def get_sheet_by_name(self, name: str) -> CalamineSheet:
         self.raise_if_bad_sheet_by_name(name)
-        return self.book.get_sheet_by_name(name)  # pyright: ignore
+        return self.book.get_sheet_by_name(name)
 
-    def get_sheet_by_index(self, index: int):
+    def get_sheet_by_index(self, index: int) -> CalamineSheet:
         self.raise_if_bad_sheet_by_index(index)
-        return self.book.get_sheet_by_index(index)  # pyright: ignore
+        return self.book.get_sheet_by_index(index)
 
     def get_sheet_data(
-        self, sheet, file_rows_needed: int | None = None
+        self, sheet: CalamineSheet, file_rows_needed: int | None = None
     ) -> list[list[Scalar]]:
-        def _convert_cell(value: _ValueT) -> Scalar:
+        def _convert_cell(value: _CellValueT) -> Scalar:
             if isinstance(value, float):
                 val = int(value)
                 if val == value:
@@ -70,19 +104,21 @@ def _convert_cell(value: _ValueT) -> Scalar:
                     return value
             elif isinstance(value, date):
                 return pd.Timestamp(value)
+            elif isinstance(value, timedelta):
+                return pd.Timedelta(value)
             elif isinstance(value, time):
                 # cast needed here because Scalar doesn't include datetime.time
                 return cast(Scalar, value)
 
             return value
 
-        rows: list[list[_ValueT]] = sheet.to_python(
-            skip_empty_area=False, nrows=file_rows_needed
-        )
+        rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False)
         data: list[list[Scalar]] = []
 
         for row in rows:
             data.append([_convert_cell(cell) for cell in row])
+            if file_rows_needed is not None and len(data) >= file_rows_needed:
+                break
 
         return data
 

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,81 @@
+from datetime import datetime, time
+
+import pytest
+
+
+@pytest.fixture
+def pandas_monkeypatch():
+    from python_calamine.pandas import pandas_monkeypatch
+
+    pandas_monkeypatch()
+    yield
+
+
+@pytest.fixture
+def expected_df_ods():
+    import pandas as pd
+
+    return pd.DataFrame(
+        [
+            [
+                "String",
+                1,
+                1.1,
+                True,
+                False,
+                pd.Timestamp("2010-10-10"),
+                datetime(2010, 10, 10, 10, 10, 10),
+                time(10, 10, 10),
+                time(10, 10, 10, 100000),
+                # duration (255:10:10) isn't supported, so the raw ISO 8601 duration string is expected instead
+                # see https://github.com/tafia/calamine/pull/288 and https://github.com/chronotope/chrono/issues/579
+                "PT255H10M10S",
+            ],
+        ],
+        columns=[
+            "Unnamed: 0",
+            "Unnamed: 1",
+            "Unnamed: 2",
+            "Unnamed: 3",
+            "Unnamed: 4",
+            "Unnamed: 5",
+            "Unnamed: 6",
+            "Unnamed: 7",
+            "Unnamed: 8",
+            "Unnamed: 9",
+        ],
+    )
+
+
+@pytest.fixture
+def expected_df_excel():
+    import pandas as pd
+
+    return pd.DataFrame(
+        [
+            [
+                "String",
+                1,
+                1.1,
+                True,
+                False,
+                pd.Timestamp("2010-10-10"),
+                datetime(2010, 10, 10, 10, 10, 10),
+                time(10, 10, 10),
+                pd.Timedelta(hours=10, minutes=10, seconds=10, microseconds=100000),
+                pd.Timedelta(hours=255, minutes=10, seconds=10),
+            ],
+        ],
+        columns=[
+            "Unnamed: 0",
+            "Unnamed: 1",
+            "Unnamed: 2",
+            "Unnamed: 3",
+            "Unnamed: 4",
+            "Unnamed: 5",
+            "Unnamed: 6",
+            "Unnamed: 7",
+            "Unnamed: 8",
+            "Unnamed: 9",
+        ],
+    )