Skip to content

Commit

Permalink
fix(python): Ensure NoDataError raised consistently between engines for Excel reads (#19712)
Browse files Browse the repository at this point in the history
  • Loading branch information
max-muoto authored Nov 10, 2024
1 parent efde5e5 commit 2c6bae1
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 12 deletions.
10 changes: 5 additions & 5 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,7 @@ def _drop_null_data(
If `drop_empty_rows` is set to `False`, empty rows are not dropped.
"""
null_cols = []
null_cols: list[str] = []
for col_name in df.columns:
# note that if multiple unnamed columns are found then all but the first one
# will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine)
Expand Down Expand Up @@ -955,15 +955,15 @@ def _read_spreadsheet_calamine(
):
df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)]

df = _drop_null_data(
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
)

# note: even if we applied parser dtypes we still re-apply schema_overrides
# natively as we can refine integer/float types, temporal precision, etc.
if schema_overrides:
df = df.cast(dtypes=schema_overrides)

df = _drop_null_data(
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
)

# standardise on string dtype for null columns in empty frame
if df.is_empty():
df = df.cast({Null: String})
Expand Down
21 changes: 14 additions & 7 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from collections.abc import Sequence
from pathlib import Path

from polars._typing import ExcelSpreadsheetEngine, SelectorType
from polars._typing import ExcelSpreadsheetEngine, SchemaDict, SelectorType

# pytestmark = pytest.mark.slow()

Expand Down Expand Up @@ -918,24 +918,31 @@ def test_excel_freeze_panes() -> None:


@pytest.mark.parametrize(
("read_spreadsheet", "source"),
("read_spreadsheet", "source", "schema_overrides"),
[
(pl.read_excel, "path_xlsx_empty"),
(pl.read_excel, "path_xlsb_empty"),
(pl.read_excel, "path_xls_empty"),
(pl.read_ods, "path_ods_empty"),
(pl.read_excel, "path_xlsx_empty", None),
(pl.read_excel, "path_xlsb_empty", None),
(pl.read_excel, "path_xls_empty", None),
(pl.read_ods, "path_ods_empty", None),
# Test with schema overrides, to ensure they don't interfere with
# raising NoDataErrors.
(pl.read_excel, "path_xlsx_empty", {"a": pl.Int64}),
(pl.read_excel, "path_xlsb_empty", {"a": pl.Int64}),
(pl.read_excel, "path_xls_empty", {"a": pl.Int64}),
(pl.read_ods, "path_ods_empty", {"a": pl.Int64}),
],
)
def test_excel_empty_sheet(
read_spreadsheet: Callable[..., pl.DataFrame],
source: str,
request: pytest.FixtureRequest,
schema_overrides: SchemaDict | None,
) -> None:
ods = (empty_spreadsheet_path := request.getfixturevalue(source)).suffix == ".ods"
read_spreadsheet = pl.read_ods if ods else pl.read_excel # type: ignore[assignment]

with pytest.raises(NoDataError, match="empty Excel sheet"):
read_spreadsheet(empty_spreadsheet_path)
read_spreadsheet(empty_spreadsheet_path, schema_overrides=schema_overrides)

engine_params = [{}] if ods else [{"engine": "calamine"}]
for params in engine_params:
Expand Down

0 comments on commit 2c6bae1

Please sign in to comment.