From efde5e52157fb2cde140a4d14f440b541ce17d88 Mon Sep 17 00:00:00 2001 From: Max Muoto Date: Sat, 9 Nov 2024 23:24:03 -0600 Subject: [PATCH] refactor: Remove Dead Excel Code (#19710) --- py-polars/polars/io/spreadsheet/functions.py | 42 -------------------- py-polars/tests/unit/io/test_spreadsheet.py | 41 +------------------ 2 files changed, 1 insertion(+), 82 deletions(-) diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 1d4bb5fe90b6..91910e8996fa 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -495,48 +495,6 @@ def read_ods( ) -def _identify_from_magic_bytes(data: IO[bytes] | bytes) -> str | None: - if isinstance(data, bytes): - data = BytesIO(data) - - xls_bytes = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" # excel 97-2004 - xlsx_bytes = b"PK\x03\x04" # xlsx/openoffice (zipped xml) - - initial_position = data.tell() - try: - magic_bytes = data.read(8) - if magic_bytes == xls_bytes: - return "xls" - elif magic_bytes[:4] == xlsx_bytes: - return "xlsx" - except UnicodeDecodeError: - pass - finally: - data.seek(initial_position) - return None - - -def _identify_workbook(wb: str | Path | IO[bytes] | bytes) -> str | None: - """Use file extension (and magic bytes) to identify Workbook type.""" - if not isinstance(wb, (str, Path)): - # raw binary data (bytesio, etc) - return _identify_from_magic_bytes(wb) - else: - p = Path(wb) - ext = p.suffix[1:].lower() - - # unambiguous file extensions - if ext in ("xlsx", "xlsm", "xlsb"): - return ext - elif ext[:2] == "od": - return "ods" - - # check magic bytes to resolve ambiguity (eg: xls/xlsx, or no extension) - with p.open("rb") as f: - magic_bytes = BytesIO(f.read(8)) - return _identify_from_magic_bytes(magic_bytes) - - def _read_spreadsheet( sheet_id: int | Sequence[int] | None, sheet_name: str | list[str] | tuple[str] | None, diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index b7b03a0bd02e..acef1ef7f1a5 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -4,7 +4,6 @@ from collections import OrderedDict from datetime import date, datetime from io import BytesIO -from pathlib import Path from typing import TYPE_CHECKING, Any, Callable import pytest @@ -12,12 +11,12 @@ import polars as pl import polars.selectors as cs from polars.exceptions import NoDataError, ParameterCollisionError -from polars.io.spreadsheet.functions import _identify_workbook from polars.testing import assert_frame_equal, assert_series_equal from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES if TYPE_CHECKING: from collections.abc import Sequence + from pathlib import Path from polars._typing import ExcelSpreadsheetEngine, SelectorType @@ -1028,44 +1027,6 @@ def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None assert_frame_equal(df.select(reversed_cols), read_df) -@pytest.mark.parametrize( - ("path", "file_type"), - [ - ("path_xls", "xls"), - ("path_xlsx", "xlsx"), - ("path_xlsb", "xlsb"), - ], -) -def test_identify_workbook( - path: str, file_type: str, request: pytest.FixtureRequest -) -> None: - # identify from file path - spreadsheet_path = request.getfixturevalue(path) - assert _identify_workbook(spreadsheet_path) == file_type - - # note that we can't distinguish between xlsx and xlsb - # from the magic bytes block alone (so we default to xlsx) - if file_type == "xlsb": - file_type = "xlsx" - - # identify from IO[bytes] - with Path.open(spreadsheet_path, "rb") as f: - assert _identify_workbook(f) == file_type - assert isinstance(pl.read_excel(f, engine="calamine"), pl.DataFrame) - - # identify from bytes - with Path.open(spreadsheet_path, "rb") as f: - raw_data = f.read() - assert _identify_workbook(raw_data) == file_type - assert isinstance(pl.read_excel(raw_data, engine="calamine"), pl.DataFrame) - - # identify from BytesIO - with Path.open(spreadsheet_path, "rb") as f: - bytesio_data = BytesIO(f.read()) - assert _identify_workbook(bytesio_data) == file_type - assert isinstance(pl.read_excel(bytesio_data, engine="calamine"), pl.DataFrame) - - def test_drop_empty_rows(path_empty_rows_excel: Path) -> None: df1 = pl.read_excel(source=path_empty_rows_excel, engine="xlsx2csv") assert df1.shape == (8, 4)