Skip to content

Commit

Permalink
feat: support pandas 2.1
Browse files Browse the repository at this point in the history
  • Loading branch information
dimastbk committed Sep 4, 2023
1 parent 8d86688 commit 9e20f5d
Show file tree
Hide file tree
Showing 7 changed files with 675 additions and 153 deletions.
32 changes: 25 additions & 7 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,33 @@ on:

jobs:
test:
name: test ${{ matrix.python-version }} rust stable
name: test ${{ matrix.python-version }} ${{ matrix.pandas-version }} rust stable
strategy:
fail-fast: false
matrix:
python-version:
- '3.8'
- '3.9'
- '3.10'
- '3.11'
- '3.12'
- pypy3.8
- pypy3.9
- pypy3.10
pandas-version:
- '2.0.*'
- '2.1.*'
include:
- python-version: '3.8'
pandas-version: '2.0.*'
# https://github.com/pandas-dev/pandas/issues/53665
- python-version: '3.12'
pandas-version: 'none'
# https://github.com/pandas-dev/pandas/issues/42509
- python-version: 'pypy3.8'
pandas-version: 'none'
# https://github.com/pandas-dev/pandas/issues/42509
- python-version: 'pypy3.9'
pandas-version: 'none'
# https://github.com/pandas-dev/pandas/issues/42509
- python-version: 'pypy3.10'
pandas-version: 'none'


runs-on: ubuntu-latest

Expand Down Expand Up @@ -61,8 +75,12 @@ jobs:
py
${{ runner.os }}
${{ env.pythonLocation }}
# Install pytest together with the pandas release selected by the matrix.
# The requirement is quoted so the shell cannot glob-expand `[excel]` or `2.1.*`.
- run: pip install pytest "pandas[excel]==${{ matrix.pandas-version }}"
  # Keep the whole condition as one bare expression: embedding `${{ }}` in the
  # middle of an `if` turns the value into a non-empty string, which is always
  # truthy, so the step would also run for pandas-version 'none'.
  if: steps.cache-py.outputs.cache-hit != 'true' && matrix.pandas-version != 'none'

# Install pytest alone for matrix entries that run without pandas ('none').
- run: pip install pytest
  # Single bare expression on purpose: mixing plain text with an embedded
  # `${{ }}` makes the `if` value a non-empty string, which is always truthy.
  if: steps.cache-py.outputs.cache-hit != 'true' && matrix.pandas-version == 'none'

- run: pip install -e .
env:
Expand Down
521 changes: 397 additions & 124 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@ authors = ["Dmitriy <[email protected]>"]

[tool.poetry.dependencies]
python = "^3.8"
packaging = "^23.1"

[tool.poetry.group.dev.dependencies]
maturin = "^1.0.1"
pre-commit = "^3.0.1"
pytest = "^7.2.1"
pandas = {version = "^2.0.0", extras = ["excel"]}
4 changes: 2 additions & 2 deletions python/python_calamine/_python_calamine.pyi
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from __future__ import annotations

import enum
from datetime import date, datetime, time
from datetime import date, datetime, time, timedelta
from os import PathLike
from typing import Protocol

ValueT = int | float | str | bool | time | date | datetime
ValueT = int | float | str | bool | time | date | datetime | timedelta

class ReadBuffer(Protocol):
def seek(self) -> int: ...
Expand Down
76 changes: 56 additions & 20 deletions python/python_calamine/pandas.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,37 @@
from __future__ import annotations

from datetime import date, datetime, time
from typing import Union, cast
from datetime import date, datetime, time, timedelta
from importlib.metadata import version
from typing import TYPE_CHECKING, Union, cast

import pandas as pd
from pandas._typing import FilePath, ReadBuffer, Scalar, StorageOptions
from packaging.version import Version, parse
from pandas._typing import Scalar
from pandas.compat._optional import import_optional_dependency
from pandas.core.shared_docs import _shared_docs
from pandas.io.excel import ExcelFile
from pandas.io.excel._base import BaseExcelReader
from pandas.util._decorators import doc

_ValueT = Union[int, float, str, bool, time, date, datetime]
if TYPE_CHECKING:
from pandas._typing import FilePath, ReadBuffer, StorageOptions
from python_calamine import CalamineSheet, CalamineWorkbook

_CellValueT = Union[int, float, str, bool, time, date, datetime, timedelta]


PANDAS_VERSION = parse(version("pandas"))


class CalamineExcelReader(BaseExcelReader):
_sheet_names: list[str] | None = None
book: CalamineWorkbook

@doc(storage_options=_shared_docs["storage_options"])
def __init__(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions = None,
storage_options: StorageOptions | None = None,
engine_kwargs: dict | None = None,
) -> None:
"""
Reader using calamine engine (xlsx/xls/xlsb/ods).
Expand All @@ -31,37 +41,61 @@ def __init__(
filepath_or_buffer : str, path to be parsed or
an open readable stream.
{storage_options}
engine_kwargs : dict, optional
Arbitrary keyword arguments passed to excel engine.
"""
import_optional_dependency("python_calamine")
super().__init__(filepath_or_buffer, storage_options=storage_options)
if PANDAS_VERSION >= Version("2.1.0"):
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
engine_kwargs=engine_kwargs,
)
elif PANDAS_VERSION >= Version("2.0.0"):
super().__init__(
filepath_or_buffer,
storage_options=storage_options,
)
else:
raise ValueError("Pandas >= 2 is only supported")

@property
def _workbook_class(self):
def _workbook_class(self) -> type[CalamineWorkbook]:
from python_calamine import CalamineWorkbook

return CalamineWorkbook

def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
def load_workbook(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
engine_kwargs: dict | None = None,
) -> CalamineWorkbook:
from python_calamine import load_workbook

return load_workbook(filepath_or_buffer)
return load_workbook(filepath_or_buffer, **(engine_kwargs or {}))

@property
def sheet_names(self) -> list[str]:
return self.book.sheet_names # pyright: ignore
from python_calamine import SheetTypeEnum

return [
sheet.name
for sheet in self.book.sheets_metadata
if sheet.typ == SheetTypeEnum.WorkSheet
]

def get_sheet_by_name(self, name: str):
def get_sheet_by_name(self, name: str) -> CalamineSheet:
self.raise_if_bad_sheet_by_name(name)
return self.book.get_sheet_by_name(name) # pyright: ignore
return self.book.get_sheet_by_name(name)

def get_sheet_by_index(self, index: int):
def get_sheet_by_index(self, index: int) -> CalamineSheet:
self.raise_if_bad_sheet_by_index(index)
return self.book.get_sheet_by_index(index) # pyright: ignore
return self.book.get_sheet_by_index(index)

def get_sheet_data(
self, sheet, file_rows_needed: int | None = None
self, sheet: CalamineSheet, file_rows_needed: int | None = None
) -> list[list[Scalar]]:
def _convert_cell(value: _ValueT) -> Scalar:
def _convert_cell(value: _CellValueT) -> Scalar:
if isinstance(value, float):
val = int(value)
if val == value:
Expand All @@ -70,19 +104,21 @@ def _convert_cell(value: _ValueT) -> Scalar:
return value
elif isinstance(value, date):
return pd.Timestamp(value)
elif isinstance(value, timedelta):
return pd.Timedelta(value)
elif isinstance(value, time):
# cast needed here because Scalar doesn't include datetime.time
return cast(Scalar, value)

return value

rows: list[list[_ValueT]] = sheet.to_python(
skip_empty_area=False, nrows=file_rows_needed
)
rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False)
data: list[list[Scalar]] = []

for row in rows:
data.append([_convert_cell(cell) for cell in row])
if file_rows_needed is not None and len(data) >= file_rows_needed:
break

return data

Expand Down
81 changes: 81 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from datetime import datetime, time

import pytest


@pytest.fixture
def pandas_monkeypatch():
    """Call ``python_calamine.pandas.pandas_monkeypatch()`` before the test runs.

    The fixture yields no value and executes nothing after the ``yield``,
    so whatever the call changes is not reverted by this fixture.
    """
    # Imported lazily, under an alias so it does not shadow the fixture name.
    from python_calamine.pandas import pandas_monkeypatch as apply_patch

    apply_patch()
    yield


@pytest.fixture
def expected_df_ods():
    """Return the one-row, ten-column DataFrame of expected values (ODS variant).

    Columns are named ``"Unnamed: 0"`` through ``"Unnamed: 9"``.
    """
    import pandas as pd

    row = [
        "String",
        1,
        1.1,
        True,
        False,
        pd.Timestamp("2010-10-10"),
        datetime(2010, 10, 10, 10, 10, 10),
        time(10, 10, 10),
        time(10, 10, 10, 100000),
        # duration (255:10:10) isn't supported
        # see https://github.com/tafia/calamine/pull/288 and https://github.com/chronotope/chrono/issues/579
        "PT255H10M10S",
    ]
    # One positional "Unnamed: <i>" label per cell of the row.
    columns = [f"Unnamed: {position}" for position in range(len(row))]

    return pd.DataFrame([row], columns=columns)


@pytest.fixture
def expected_df_excel():
    """Return the one-row, ten-column DataFrame of expected values (Excel variant).

    The last two cells are ``pd.Timedelta`` values. Columns are named
    ``"Unnamed: 0"`` through ``"Unnamed: 9"``.
    """
    import pandas as pd

    row = [
        "String",
        1,
        1.1,
        True,
        False,
        pd.Timestamp("2010-10-10"),
        datetime(2010, 10, 10, 10, 10, 10),
        time(10, 10, 10),
        pd.Timedelta(hours=10, minutes=10, seconds=10, microseconds=100000),
        pd.Timedelta(hours=255, minutes=10, seconds=10),
    ]
    # One positional "Unnamed: <i>" label per cell of the row.
    columns = [f"Unnamed: {position}" for position in range(len(row))]

    return pd.DataFrame([row], columns=columns)
Loading

0 comments on commit 9e20f5d

Please sign in to comment.