diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 1d4bb5fe90b6..58ed20c2d4de 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -818,7 +818,7 @@ def _drop_null_data( If `drop_empty_rows` is set to `False`, empty rows are not dropped. """ - null_cols = [] + null_cols: list[str] = [] for col_name in df.columns: # note that if multiple unnamed columns are found then all but the first one # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine) @@ -997,15 +997,15 @@ def _read_spreadsheet_calamine( ): df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)] + df = _drop_null_data( + df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows + ) + # note: even if we applied parser dtypes we still re-apply schema_overrides # natively as we can refine integer/float types, temporal precision, etc. if schema_overrides: df = df.cast(dtypes=schema_overrides) - df = _drop_null_data( - df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows - ) - # standardise on string dtype for null columns in empty frame if df.is_empty(): df = df.cast({Null: String}) diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index b7b03a0bd02e..86e45ce7edfe 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -19,7 +19,7 @@ if TYPE_CHECKING: from collections.abc import Sequence - from polars._typing import ExcelSpreadsheetEngine, SelectorType + from polars._typing import ExcelSpreadsheetEngine, SchemaDict, SelectorType # pytestmark = pytest.mark.slow() @@ -919,24 +919,31 @@ def test_excel_freeze_panes() -> None: @pytest.mark.parametrize( - ("read_spreadsheet", "source"), + ("read_spreadsheet", "source", "schema_overrides"), [ - (pl.read_excel, "path_xlsx_empty"), - (pl.read_excel, "path_xlsb_empty"), - (pl.read_excel, "path_xls_empty"), - (pl.read_ods, "path_ods_empty"), + (pl.read_excel, "path_xlsx_empty", None), + (pl.read_excel, "path_xlsb_empty", None), + (pl.read_excel, "path_xls_empty", None), + (pl.read_ods, "path_ods_empty", None), + # Test with schema overrides, to ensure they don't interfere with + # raising NoDataErrors. + (pl.read_excel, "path_xlsx_empty", {"a": pl.Int64}), + (pl.read_excel, "path_xlsb_empty", {"a": pl.Int64}), + (pl.read_excel, "path_xls_empty", {"a": pl.Int64}), + (pl.read_ods, "path_ods_empty", {"a": pl.Int64}), ], ) def test_excel_empty_sheet( read_spreadsheet: Callable[..., pl.DataFrame], source: str, request: pytest.FixtureRequest, + schema_overrides: SchemaDict | None, ) -> None: ods = (empty_spreadsheet_path := request.getfixturevalue(source)).suffix == ".ods" read_spreadsheet = pl.read_ods if ods else pl.read_excel # type: ignore[assignment] with pytest.raises(NoDataError, match="empty Excel sheet"): - read_spreadsheet(empty_spreadsheet_path) + read_spreadsheet(empty_spreadsheet_path, schema_overrides=schema_overrides) engine_params = [{}] if ods else [{"engine": "calamine"}] for params in engine_params: