Skip to content

Commit

Permalink
refactor: Remove Dead Excel Code (#19710)
Browse files: browse the repository at this point in the history
  • Loading branch information
max-muoto authored Nov 10, 2024
1 parent e276eb8 commit efde5e5
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 82 deletions.
42 changes: 0 additions & 42 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,48 +495,6 @@ def read_ods(
)


def _identify_from_magic_bytes(data: IO[bytes] | bytes) -> str | None:
    """
    Guess the workbook type from the leading bytes of the given data.

    Parameters
    ----------
    data
        Raw bytes, or a stream object offering `tell`, `read` and `seek`.
        A `bytes` value is wrapped in a `BytesIO` before inspection.

    Returns
    -------
    str or None
        "xls" when the first eight bytes equal the legacy Excel signature,
        "xlsx" when the first four bytes equal the zip signature, and None
        when neither signature matches.

    Notes
    -----
    The stream is always rewound to the position it had on entry, so the
    caller can go on to read the workbook from the same object.
    """
    stream = BytesIO(data) if isinstance(data, bytes) else data

    legacy_xls_magic = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"  # excel 97-2004
    zip_magic = b"PK\x03\x04"  # xlsx/openoffice (zipped xml)

    entry_offset = stream.tell()
    detected: str | None = None
    try:
        header = stream.read(8)
        if header == legacy_xls_magic:
            detected = "xls"
        elif header[:4] == zip_magic:
            detected = "xlsx"
    except UnicodeDecodeError:
        # NOTE(review): presumably guards against a text-mode stream being
        # passed in; such input is reported as unidentified -- confirm.
        pass
    finally:
        # restore the read position whether or not a signature matched
        stream.seek(entry_offset)
    return detected


def _identify_workbook(wb: str | Path | IO[bytes] | bytes) -> str | None:
    """
    Use file extension (and magic bytes) to identify Workbook type.

    Parameters
    ----------
    wb
        A file path (`str` or `Path`), or raw binary data / a binary stream.

    Returns
    -------
    str or None
        The lowercased extension for "xlsx", "xlsm" and "xlsb" paths, "ods"
        for any extension starting with "od", and otherwise whatever
        `_identify_from_magic_bytes` reports for the first eight bytes.
    """
    if isinstance(wb, (str, Path)):
        workbook_path = Path(wb)
        suffix = workbook_path.suffix[1:].lower()

        # unambiguous file extensions; the file itself is never opened
        if suffix in ("xlsx", "xlsm", "xlsb"):
            return suffix
        if suffix.startswith("od"):
            return "ods"

        # check magic bytes to resolve ambiguity (eg: xls/xlsx, or no extension)
        with workbook_path.open("rb") as handle:
            header = BytesIO(handle.read(8))
        return _identify_from_magic_bytes(header)

    # raw binary data (bytesio, etc)
    return _identify_from_magic_bytes(wb)


def _read_spreadsheet(
sheet_id: int | Sequence[int] | None,
sheet_name: str | list[str] | tuple[str] | None,
Expand Down
41 changes: 1 addition & 40 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,19 @@
from collections import OrderedDict
from datetime import date, datetime
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable

import pytest

import polars as pl
import polars.selectors as cs
from polars.exceptions import NoDataError, ParameterCollisionError
from polars.io.spreadsheet.functions import _identify_workbook
from polars.testing import assert_frame_equal, assert_series_equal
from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES

if TYPE_CHECKING:
from collections.abc import Sequence
from pathlib import Path

from polars._typing import ExcelSpreadsheetEngine, SelectorType

Expand Down Expand Up @@ -1028,44 +1027,6 @@ def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None
assert_frame_equal(df.select(reversed_cols), read_df)


@pytest.mark.parametrize(
    ("path", "file_type"),
    [
        ("path_xls", "xls"),
        ("path_xlsx", "xlsx"),
        ("path_xlsb", "xlsb"),
    ],
)
def test_identify_workbook(
    path: str, file_type: str, request: pytest.FixtureRequest
) -> None:
    """Check workbook identification from a path, a file handle, bytes and BytesIO."""
    # identify from file path (the extension is available here)
    workbook_path = request.getfixturevalue(path)
    assert _identify_workbook(workbook_path) == file_type

    # note that we can't distinguish between xlsx and xlsb
    # from the magic bytes block alone (so we default to xlsx)
    sniffed_type = "xlsx" if file_type == "xlsb" else file_type

    # identify from IO[bytes]; the handle must stay open for both checks
    with Path.open(workbook_path, "rb") as handle:
        assert _identify_workbook(handle) == sniffed_type
        assert isinstance(pl.read_excel(handle, engine="calamine"), pl.DataFrame)

    # identify from bytes
    with Path.open(workbook_path, "rb") as handle:
        payload = handle.read()
    assert _identify_workbook(payload) == sniffed_type
    assert isinstance(pl.read_excel(payload, engine="calamine"), pl.DataFrame)

    # identify from BytesIO
    with Path.open(workbook_path, "rb") as handle:
        buffered = BytesIO(handle.read())
    assert _identify_workbook(buffered) == sniffed_type
    assert isinstance(pl.read_excel(buffered, engine="calamine"), pl.DataFrame)


def test_drop_empty_rows(path_empty_rows_excel: Path) -> None:
df1 = pl.read_excel(source=path_empty_rows_excel, engine="xlsx2csv")
assert df1.shape == (8, 4)
Expand Down

0 comments on commit efde5e5

Please sign in to comment.