diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 97a7f8ae639a..28d1c766cd1f 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -1819,11 +1819,6 @@ def item(self, row: int | None = None, column: int | str | None = None) -> Any: """ Return the dataframe as a scalar, or return the element at the given row/column. - Notes - ----- - If row/col not provided, this is equivalent to ``df[0,0]``, with a check that - the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. - Parameters ---------- row @@ -1835,6 +1830,11 @@ def item(self, row: int | None = None, column: int | str | None = None) -> Any: -------- row: Get the values of a single row, either by index or by predicate. + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + Examples -------- >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index ecbf01a4d9c2..dd971c6b1ffb 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -25,7 +25,7 @@ def read_excel( *, sheet_id: None = ..., sheet_name: str, - engine: Literal["xlsx2csv", "openpyxl"] | None = ..., + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., xlsx2csv_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = None, @@ -40,7 +40,7 @@ def read_excel( *, sheet_id: None = ..., sheet_name: None = ..., - engine: Literal["xlsx2csv", "openpyxl"] | None = ..., + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., xlsx2csv_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = None, @@ -55,7 +55,7 @@ def read_excel( *, sheet_id: 
int, sheet_name: str, - engine: Literal["xlsx2csv", "openpyxl"] | None = ..., + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., xlsx2csv_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = None, @@ -72,7 +72,7 @@ def read_excel( *, sheet_id: Literal[0] | Sequence[int], sheet_name: None = ..., - engine: Literal["xlsx2csv", "openpyxl"] | None = ..., + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., xlsx2csv_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = None, @@ -87,7 +87,7 @@ def read_excel( *, sheet_id: int, sheet_name: None = ..., - engine: Literal["xlsx2csv", "openpyxl"] | None = ..., + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., xlsx2csv_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = None, @@ -102,7 +102,7 @@ def read_excel( *, sheet_id: None, sheet_name: list[str] | tuple[str], - engine: Literal["xlsx2csv", "openpyxl"] | None = ..., + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., xlsx2csv_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = None, @@ -116,7 +116,7 @@ def read_excel( *, sheet_id: int | Sequence[int] | None = None, sheet_name: str | list[str] | tuple[str] | None = None, - engine: Literal["xlsx2csv", "openpyxl"] | None = None, + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = None, xlsx2csv_options: dict[str, Any] | None = None, read_csv_options: dict[str, Any] | None = None, schema_overrides: SchemaDict | None = None, @@ -125,13 +125,10 @@ def read_excel( """ Read Excel (XLSX) spreadsheet data into a DataFrame. - If using the ``xlsx2csv`` engine, converts an Excel sheet with - ``xlsx2csv.Xlsx2csv().convert()`` to CSV and parses the CSV output with - :func:`read_csv`. 
You can pass additional options to ``read_csv_options`` to - influence parsing behaviour. - - When using the ``openpyxl`` engine, reads an Excel sheet with - ``openpyxl.load_workbook(source)``. + .. versionadded:: 0.19.4 + Added support for "pyxlsb" engine for reading Excel Binary Workbooks (.xlsb). + .. versionadded:: 0.19.3 + Added support for "openpyxl" engine, and added ``schema_overrides`` parameter. Parameters ---------- @@ -149,14 +146,17 @@ def read_excel( engine Library used to parse the spreadsheet file; defaults to "xlsx2csv" if not set. - * "xlsx2csv": the fastest engine; converts the data to an in-memory CSV first - and then uses the polars ``read_csv`` method to parse the result. You can - pass `xlsx2csv_options` and/or `read_csv_options` to refine the conversion. - * "openpyxl": slower than ``xlsx2csv`` but supports additional automatic type - inference; potentially useful if you are unable to parse your sheet with - the ``xlsx2csv`` engine. - * "odf": this engine is only used for OpenOffice files; it will be used - automatically for files with the ".ods" extension. + * "xlsx2csv": the fastest engine; converts the data to an in-memory CSV before + using the native polars ``read_csv`` method to parse the result. You can + pass ``xlsx2csv_options`` and ``read_csv_options`` to refine the conversion. + * "openpyxl": this engine is significantly slower than ``xlsx2csv`` but supports + additional automatic type inference; potentially useful if you are otherwise + unable to parse your sheet with the (default) ``xlsx2csv`` engine in + conjunction with the ``schema_overrides`` parameter. + * "pyxlsb": this engine is used for Excel Binary Workbooks (`.xlsb` files). + Note that you have to use ``schema_overrides`` to correctly load date/datetime + columns (or these will be read as floats representing offset Julian values). + xlsx2csv_options Extra options passed to ``xlsx2csv.Xlsx2csv()``, e.g. 
``{"skip_empty_lines": True}`` @@ -171,9 +171,19 @@ def read_excel( When there is no data in the sheet,``NoDataError`` is raised. If this parameter is set to False, an empty DataFrame (with no columns) is returned instead. + Notes + ----- + When using the default ``xlsx2csv`` engine the target Excel sheet is first converted + to CSV using ``xlsx2csv.Xlsx2csv(source).convert()`` and then parsed with Polars' + :func:`read_csv` function. You can pass additional options to ``read_csv_options`` + to influence this part of the parsing pipeline. + Returns ------- - DataFrame, or a ``{sheetname: DataFrame, ...}`` dict if reading multiple sheets. + DataFrame + If reading a single sheet. + dict + If reading multiple sheets, a "{sheetname: DataFrame, ...}" dict is returned. Examples -------- @@ -331,12 +341,13 @@ def read_ods( that have a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) or ``BytesIO``). sheet_id - Sheet number(s) to convert (set ``0`` to load all sheets as DataFrames) and - return a ``{sheetname:frame,}`` dict. (Defaults to `1` if neither this nor - `sheet_name` are specified). Can also take a sequence of sheet numbers. + Sheet number(s) to convert, starting from 1 (set ``0`` to load *all* worksheets + as DataFrames) and return a ``{sheetname:frame,}`` dict. (Defaults to ``1`` if + neither this nor `sheet_name` are specified). Can also take a sequence of sheet + numbers. sheet_name - Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If more - than one is given then a ``{sheetname:frame,}`` dict is returned. + Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If + more than one is given then a ``{sheetname:frame,}`` dict is returned. schema_overrides Support type specification or override of one or more columns. 
raise_if_empty @@ -383,7 +394,7 @@ def _read_spreadsheet( sheet_id: int | Sequence[int] | None, sheet_name: str | list[str] | tuple[str] | None, source: str | BytesIO | Path | BinaryIO | bytes, - engine: Literal["xlsx2csv", "openpyxl", "ods"] | None, + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb", "ods"] | None, engine_options: dict[str, Any] | None = None, read_csv_options: dict[str, Any] | None = None, schema_overrides: SchemaDict | None = None, @@ -395,60 +406,63 @@ def _read_spreadsheet( f"cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})" ) - if engine_options is None: - engine_options = {} - # establish the reading function, parser, and available worksheets reader_fn, parser, worksheets = _initialise_spreadsheet_parser( - engine, source, engine_options + engine, source, engine_options or {} ) - # use the parser to read data from one or more sheets - if ( - sheet_id == 0 - or isinstance(sheet_id, Sequence) - or (sheet_name and not isinstance(sheet_name, str)) - ): - # read multiple sheets by id - sheet_ids = sheet_id or () - sheet_names = sheet_name or () - return { - sheet["name"]: reader_fn( + # determine which named worksheets to read + if sheet_id is None and sheet_name is None: + sheet_names = [worksheets[0]["name"]] + return_multi = False + else: + return_multi = ( + sheet_id == 0 + or isinstance(sheet_id, Sequence) + or (isinstance(sheet_name, Sequence) and not isinstance(sheet_name, str)) + ) + ids = (sheet_id,) if isinstance(sheet_id, int) else sheet_id or () + names = (sheet_name,) if isinstance(sheet_name, str) else sheet_name or () + sheet_names = [ + ws["name"] + for ws in worksheets + if (sheet_id == 0 or ws["index"] in ids or ws["name"] in names) + ] + + # read data from the indicated sheet(s) + try: + parsed_sheets = { + name: reader_fn( parser=parser, - sheet_id=sheet["index"], - sheet_name=None, + sheet_name=name, read_csv_options=read_csv_options, schema_overrides=schema_overrides, 
raise_if_empty=raise_if_empty, ) - for sheet in worksheets - if sheet_id == 0 or sheet["index"] in sheet_ids or sheet["name"] in sheet_names # type: ignore[operator] + for name in sheet_names } - else: - # read a specific sheet by id or name - if sheet_name is None: - sheet_id = sheet_id or 1 - - return reader_fn( - parser=parser, - sheet_id=sheet_id, - sheet_name=sheet_name, - read_csv_options=read_csv_options, - schema_overrides=schema_overrides, - raise_if_empty=raise_if_empty, - ) + finally: + if hasattr(parser, "close"): + parser.close() + + if return_multi: + return parsed_sheets + return next(iter(parsed_sheets.values())) def _initialise_spreadsheet_parser( - engine: Literal["xlsx2csv", "openpyxl", "ods"] | None, + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb", "ods"] | None, source: str | BytesIO | Path | BinaryIO | bytes, engine_options: dict[str, Any], ) -> tuple[Callable[..., pl.DataFrame], Any, list[dict[str, Any]]]: """Instantiate the indicated spreadsheet parser and establish related properties.""" if isinstance(source, (str, Path)): source = normalize_filepath(source) - if engine is None and str(source).lower().endswith(".ods"): - engine = "ods" + if engine is None: + if (src := str(source).lower()).endswith(".ods"): + engine = "ods" + elif src.endswith(".xlsb"): + engine = "pyxlsb" if engine == "xlsx2csv" or engine is None: # default try: @@ -472,6 +486,24 @@ def _initialise_spreadsheet_parser( sheets = [{"index": i + 1, "name": ws.title} for i, ws in enumerate(parser)] return _read_spreadsheet_openpyxl, parser, sheets + elif engine == "pyxlsb": + try: + import pyxlsb + except ImportError: + raise ImportError( + "Required package not installed\n\nPlease run `pip install pyxlsb`" + ) from None + try: + parser = pyxlsb.open_workbook(source, **engine_options) + except KeyError as err: + if "no item named 'xl/_rels/workbook.bin.rels'" in str(err): + raise TypeError(f"Invalid Excel Binary Workbook: {source!r}") from None + raise + sheets = [ + 
{"index": i + 1, "name": name} for i, name in enumerate(parser.sheets) + ] + return _read_spreadsheet_pyxlsb, parser, sheets + elif engine == "ods": try: import ezodf @@ -543,7 +575,6 @@ def _drop_unnamed_null_columns(df: pl.DataFrame) -> pl.DataFrame: def _read_spreadsheet_ods( parser: Any, - sheet_id: int | None, sheet_name: str | None, read_csv_options: dict[str, Any] | None, schema_overrides: SchemaDict | None, @@ -552,9 +583,7 @@ def _read_spreadsheet_ods( ) -> pl.DataFrame: """Use the 'ezodf' library to read data from the given worksheet.""" sheets = parser.sheets - if sheet_id is not None: - ws = sheets[sheet_id - 1] - elif sheet_name is not None: + if sheet_name is not None: ws = next((s for s in sheets if s.name == sheet_name), None) if ws is None: raise ValueError(f"Sheet {sheet_name!r} not found") @@ -594,6 +623,7 @@ def _read_spreadsheet_ods( schema=headers, schema_overrides=overrides, ) + if raise_if_empty and len(df) == 0 and len(df.columns) == 0: raise NoDataError( "Empty Excel sheet; if you want to read this as " @@ -602,17 +632,21 @@ def _read_spreadsheet_ods( if strptime_cols: df = df.with_columns( - F.col(nm).str.strptime(dtype) # type: ignore[arg-type] + ( + F.col(nm).str.replace("[T ]00:00:00$", "") + if dtype == Date + else F.col(nm) + ).str.strptime( + dtype # type: ignore[arg-type] + ) for nm, dtype in strptime_cols.items() ) - df.columns = headers return _drop_unnamed_null_columns(df) def _read_spreadsheet_openpyxl( parser: Any, - sheet_id: int | None, sheet_name: str | None, read_csv_options: dict[str, Any] | None, schema_overrides: SchemaDict | None, @@ -620,13 +654,7 @@ def _read_spreadsheet_openpyxl( raise_if_empty: bool, ) -> pl.DataFrame: """Use the 'openpyxl' library to read data from the given worksheet.""" - # read requested sheet if provided on kwargs, otherwise read active sheet - if sheet_name is not None: - ws = parser[sheet_name] - elif sheet_id is not None: - ws = parser.worksheets[sheet_id - 1] - else: - ws = parser.active + 
ws = parser[sheet_name] # prefer detection of actual table objects; otherwise read # data in the used worksheet range, dropping null columns @@ -649,9 +677,57 @@ def _read_spreadsheet_openpyxl( series_data = [ pl.Series(name, [cell.value for cell in column_data]) for name, column_data in zip(header, zip(*rows_iter)) + if name ] df = pl.DataFrame( - {s.name: s for s in series_data if s.name}, + {s.name: s for s in series_data}, + schema_overrides=schema_overrides, + ) + if raise_if_empty and len(df) == 0 and len(df.columns) == 0: + raise NoDataError( + "Empty Excel sheet; if you want to read this as " + "an empty DataFrame, set `raise_if_empty=False`" + ) + return _drop_unnamed_null_columns(df) + + +def _read_spreadsheet_pyxlsb( + parser: Any, + sheet_name: str | None, + read_csv_options: dict[str, Any] | None, + schema_overrides: SchemaDict | None, + *, + raise_if_empty: bool, +) -> pl.DataFrame: + from pyxlsb import convert_date + + ws = parser.get_sheet(sheet_name) + try: + # establish header/data rows + header: list[str | None] = [] + rows_iter = ws.rows() + for row in rows_iter: + row_values = [cell.v for cell in row] + if any(v is not None for v in row_values): + header.extend(row_values) + break + + # load data rows as series + series_data = [ + pl.Series(name, [cell.v for cell in column_data]) + for name, column_data in zip(header, zip(*rows_iter)) + if name + ] + finally: + ws.close() + + if schema_overrides: + for idx, s in enumerate(series_data): + if schema_overrides.get(s.name) in (Datetime, Date): + series_data[idx] = s.map_elements(convert_date) + + df = pl.DataFrame( + {s.name: s for s in series_data}, schema_overrides=schema_overrides, ) if raise_if_empty and len(df) == 0 and len(df.columns) == 0: @@ -664,7 +740,6 @@ def _read_spreadsheet_openpyxl( def _read_spreadsheet_xlsx2csv( parser: Any, - sheet_id: int | None, sheet_name: str | None, read_csv_options: dict[str, Any] | None, schema_overrides: SchemaDict | None, @@ -675,7 +750,6 @@ def 
_read_spreadsheet_xlsx2csv( csv_buffer = StringIO() parser.convert( outfile=csv_buffer, - sheetid=sheet_id, sheetname=sheet_name, ) return _csv_buffer_to_frame( diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index e4e2398cdbba..80d9a20457c3 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -5483,10 +5483,6 @@ def update( """ Update the values in this `LazyFrame` with the non-null values in `other`. - Notes - ----- - This is syntactic sugar for a left/inner join + coalesce - Warnings -------- This functionality is experimental and may change without it being considered a @@ -5503,6 +5499,10 @@ def update( 'left' will keep the left table rows as is. 'inner' will remove rows that are not found in other + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + Examples -------- >>> df = pl.DataFrame( diff --git a/py-polars/polars/utils/show_versions.py b/py-polars/polars/utils/show_versions.py index f20185c9ca5d..74f10f64a648 100644 --- a/py-polars/polars/utils/show_versions.py +++ b/py-polars/polars/utils/show_versions.py @@ -14,35 +14,36 @@ def show_versions() -> None: -------- >>> pl.show_versions() # doctest: +SKIP --------Version info--------- - Polars: 0.17.11 - Index type: UInt32 - Platform: Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.35 - Python: 3.11.3 (main, Apr 15 2023, 14:44:51) [GCC 11.3.0] - \b + Polars: 0.19.3 + Index type: UInt32 + Platform: macOS-13.5.2-arm64-arm-64bit + Python: 3.11.5 (main, Aug 24 2023, 15:09:45) [Clang 14.0.3 (clang-1403.0.22.14.1)] ----Optional dependencies---- - numpy: 1.24.2 - pandas: 2.0.0 - pyarrow: 11.0.0 - connectorx: - deltalake: 0.8.1 - fsspec: 2023.4.0 - matplotlib: 3.7.1 - xlsx2csv: 0.8.1 - xlsxwriter: 3.1.0 - """ + adbc_driver_sqlite: 0.6.0 + cloudpickle: 2.2.1 + connectorx: 0.3.2 + deltalake: 0.10.1 + fsspec: 2023.9.1 + gevent: 23.9.1 + matplotlib: 3.8.0 + numpy: 1.26.0 + openpyxl: 3.1.2 + pandas: 2.1.0 
+ pyarrow: 13.0.0 + pydantic: 2.3.0 + pyiceberg: 0.5.0 + pyxlsb: + sqlalchemy: 2.0.21 + xlsx2csv: 0.8.1 + xlsxwriter: 3.1.4 + + """ # noqa: W505 # note: we import 'platform' here as a micro-optimisation for initial import import platform - # optional dependencies deps = _get_dependency_info() - - # determine key length for alignment - keylen = ( - max( - len(x) for x in [*deps.keys(), "Polars", "Index type", "Platform", "Python"] - ) - + 1 - ) + core_properties = ("Polars", "Index type", "Platform", "Python") + keylen = max(len(x) for x in [*core_properties, *deps.keys()]) + 1 print("--------Version info---------") print(f"{'Polars:':{keylen}s} {get_polars_version()}") @@ -66,10 +67,12 @@ def _get_dependency_info() -> dict[str, str]: "gevent", "matplotlib", "numpy", + "openpyxl", "pandas", "pyarrow", "pydantic", "pyiceberg", + "pyxlsb", "sqlalchemy", "xlsx2csv", "xlsxwriter", diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index 7b75e280c38c..8288522d002d 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -38,23 +38,24 @@ Changelog = "https://github.com/pola-rs/polars/releases" [project.optional-dependencies] # NOTE: keep this list in sync with show_versions() and requirements-dev.txt -pyarrow = ["pyarrow >= 7.0.0"] -pandas = ["pyarrow >= 7.0.0", "pandas"] -numpy = ["numpy >= 1.16.0"] -fsspec = ["fsspec"] +adbc = ["adbc_driver_sqlite"] +cloudpickle = ["cloudpickle"] connectorx = ["connectorx"] -xlsx2csv = ["xlsx2csv >= 0.8.0"] -openpyxl = ["openpyxl >= 3.0.0"] deltalake = ["deltalake >= 0.10.0"] -timezone = ["backports.zoneinfo; python_version < '3.9'", "tzdata; platform_system == 'Windows'"] +fsspec = ["fsspec"] +gevent = ["gevent"] matplotlib = ["matplotlib"] +numpy = ["numpy >= 1.16.0"] +openpyxl = ["openpyxl >= 3.0.0"] +pandas = ["pyarrow >= 7.0.0", "pandas"] +pyarrow = ["pyarrow >= 7.0.0"] pydantic = ["pydantic"] pyiceberg = ["pyiceberg >= 0.5.0"] +pyxlsb = ["pyxlsb >= 1.0"] sqlalchemy = ["sqlalchemy", "pandas"] +timezone 
= ["backports.zoneinfo; python_version < '3.9'", "tzdata; platform_system == 'Windows'"] +xlsx2csv = ["xlsx2csv >= 0.8.0"] xlsxwriter = ["xlsxwriter"] -adbc = ["adbc_driver_sqlite"] -cloudpickle = ["cloudpickle"] -gevent = ["gevent"] all = [ "polars[pyarrow,pandas,numpy,fsspec,connectorx,xlsx2csv,deltalake,timezone,matplotlib,pydantic,pyiceberg,sqlalchemy,xlsxwriter,adbc,cloudpickle,gevent]", ] @@ -88,6 +89,7 @@ module = [ "polars.polars", "pyarrow.*", "pydantic", + "pyxlsb", "sqlalchemy.*", "xlsx2csv", "xlsxwriter.*", diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt index 0a45cad3e538..3cab5a05ae3a 100644 --- a/py-polars/requirements-dev.txt +++ b/py-polars/requirements-dev.txt @@ -28,6 +28,7 @@ s3fs[boto3] ezodf lxml openpyxl +pyxlsb xlsx2csv XlsxWriter # Deltalake diff --git a/py-polars/tests/unit/io/files/empty.xlsb b/py-polars/tests/unit/io/files/empty.xlsb new file mode 100644 index 000000000000..b4567f6e722c Binary files /dev/null and b/py-polars/tests/unit/io/files/empty.xlsb differ diff --git a/py-polars/tests/unit/io/files/example.ods b/py-polars/tests/unit/io/files/example.ods index 7f217f1d86dc..eff940590cd8 100644 Binary files a/py-polars/tests/unit/io/files/example.ods and b/py-polars/tests/unit/io/files/example.ods differ diff --git a/py-polars/tests/unit/io/files/example.xlsb b/py-polars/tests/unit/io/files/example.xlsb new file mode 100644 index 000000000000..4d89a9cbacf2 Binary files /dev/null and b/py-polars/tests/unit/io/files/example.xlsb differ diff --git a/py-polars/tests/unit/io/files/example.xlsx b/py-polars/tests/unit/io/files/example.xlsx index 4cbd3730865d..13e80e618fa2 100644 Binary files a/py-polars/tests/unit/io/files/example.xlsx and b/py-polars/tests/unit/io/files/example.xlsx differ diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index 9f872f678f73..7e5f8051f8c5 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ 
b/py-polars/tests/unit/io/test_spreadsheet.py @@ -19,55 +19,73 @@ @pytest.fixture() -def excel_file_path(io_files_path: Path) -> Path: +def path_xlsx(io_files_path: Path) -> Path: return io_files_path / "example.xlsx" @pytest.fixture() -def empty_excel_file_path(io_files_path: Path) -> Path: +def path_xlsx_empty(io_files_path: Path) -> Path: return io_files_path / "empty.xlsx" @pytest.fixture() -def openoffice_file_path(io_files_path: Path) -> Path: +def path_xlsb(io_files_path: Path) -> Path: + return io_files_path / "example.xlsb" + + +@pytest.fixture() +def path_xlsb_empty(io_files_path: Path) -> Path: + return io_files_path / "empty.xlsb" + + +@pytest.fixture() +def path_ods(io_files_path: Path) -> Path: return io_files_path / "example.ods" @pytest.fixture() -def empty_openoffice_file_path(io_files_path: Path) -> Path: +def path_ods_empty(io_files_path: Path) -> Path: return io_files_path / "empty.ods" @pytest.mark.parametrize( - ("read_spreadsheet", "source", "params"), + ("read_spreadsheet", "source", "engine_params"), [ - (pl.read_excel, "excel_file_path", {"engine": "xlsx2csv"}), - (pl.read_excel, "excel_file_path", {"engine": "openpyxl"}), - (pl.read_ods, "openoffice_file_path", {}), + (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), + (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + (pl.read_ods, "path_ods", {}), ], ) def test_read_spreadsheet( read_spreadsheet: Callable[..., pl.DataFrame], source: str, - params: dict[str, str], + engine_params: dict[str, str], request: pytest.FixtureRequest, ) -> None: - df = read_spreadsheet( - source=request.getfixturevalue(source), - sheet_name="test1", - sheet_id=None, - **params, - ) - expected = pl.DataFrame({"hello": ["Row 1", "Row 2"]}) - assert_frame_equal(df, expected) + sheet_params: dict[str, Any] + + for sheet_params in ( # type: ignore[assignment] + {"sheet_name": None, "sheet_id": None}, + {"sheet_name": "test1"}, + {"sheet_id": 1}, + ): + 
df = read_spreadsheet( + source=request.getfixturevalue(source), + **engine_params, + **sheet_params, + ) + expected = pl.DataFrame({"hello": ["Row 1", "Row 2"]}) + assert_frame_equal(df, expected) @pytest.mark.parametrize( ("read_spreadsheet", "source", "params"), [ - (pl.read_excel, "excel_file_path", {"engine": "xlsx2csv"}), - (pl.read_excel, "excel_file_path", {"engine": "openpyxl"}), - (pl.read_ods, "openoffice_file_path", {}), + (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), + (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + (pl.read_ods, "path_ods", {}), ], ) def test_read_excel_multi_sheets( @@ -102,9 +120,10 @@ def test_read_excel_multi_sheets( @pytest.mark.parametrize( ("read_spreadsheet", "source", "params"), [ - (pl.read_excel, "excel_file_path", {"engine": "xlsx2csv"}), - (pl.read_excel, "excel_file_path", {"engine": "openpyxl"}), - (pl.read_ods, "openoffice_file_path", {}), + (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), + (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + (pl.read_ods, "path_ods", {}), ], ) def test_read_excel_all_sheets( @@ -119,7 +138,7 @@ def test_read_excel_all_sheets( sheet_id=0, **params, ) - assert len(frames) == (3 if str(spreadsheet_path).endswith("ods") else 4) + assert len(frames) == (4 if str(spreadsheet_path).endswith("ods") else 5) expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]}) expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]}) @@ -146,7 +165,8 @@ def test_read_excel_all_sheets( ], ) def test_basic_datatypes_read_excel( - engine: Literal["xlsx2csv", "openpyxl"], schema_overrides: SchemaDict | None + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"], + schema_overrides: SchemaDict | None, ) -> None: df = pl.DataFrame( { @@ -173,7 +193,7 @@ def test_basic_datatypes_read_excel( @pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) -def test_write_excel_bytes(engine: 
Literal["xlsx2csv", "openpyxl"]) -> None: +def test_write_excel_bytes(engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"]) -> None: df = pl.DataFrame({"A": [1, 2, 3, 4, 5]}) excel_bytes = BytesIO() @@ -183,9 +203,9 @@ def test_write_excel_bytes(engine: Literal["xlsx2csv", "openpyxl"]) -> None: assert_frame_equal(df, df_read) -def test_schema_overrides_11161(excel_file_path: Path) -> None: +def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> None: df1 = pl.read_excel( - excel_file_path, + path_xlsx, sheet_name="test4", schema_overrides={"cardinality": pl.UInt16}, ).drop_nulls() @@ -196,7 +216,7 @@ def test_schema_overrides_11161(excel_file_path: Path) -> None: } df2 = pl.read_excel( - excel_file_path, + path_xlsx, sheet_name="test4", read_csv_options={"dtypes": {"cardinality": pl.UInt16}}, ).drop_nulls() @@ -207,7 +227,7 @@ def test_schema_overrides_11161(excel_file_path: Path) -> None: } df3 = pl.read_excel( - excel_file_path, + path_xlsx, sheet_name="test4", schema_overrides={"cardinality": pl.UInt16}, read_csv_options={ @@ -223,10 +243,31 @@ def test_schema_overrides_11161(excel_file_path: Path) -> None: "iter_groups": pl.Float32, } + for workbook_path in (path_xlsx, path_xlsb, path_ods): + df4 = pl.read_excel( + workbook_path, + sheet_name="test5", + schema_overrides={"dtm": pl.Datetime("ns"), "dt": pl.Date}, + ) + assert_frame_equal( + df4, + pl.DataFrame( + { + "dtm": [ + datetime(1999, 12, 31, 10, 30, 45), + datetime(2010, 10, 11, 12, 13, 14), + ], + "dt": [date(2024, 1, 1), date(2018, 8, 7)], + "val": [1.5, -0.5], + }, + schema={"dtm": pl.Datetime("ns"), "dt": pl.Date, "val": pl.Float64}, + ), + ) + with pytest.raises(ParameterCollisionError): # cannot specify 'cardinality' in both schema_overrides and read_csv_options pl.read_excel( - excel_file_path, + path_xlsx, sheet_name="test4", schema_overrides={"cardinality": pl.UInt16}, read_csv_options={"dtypes": {"cardinality": pl.Int32}}, @@ -238,16 +279,22 @@ def 
test_unsupported_engine() -> None: pl.read_excel(None, engine="foo") # type: ignore[call-overload] +def test_unsupported_binary_workbook(path_xlsx: Path, path_xlsb: Path) -> None: + with pytest.raises(Exception, match="Invalid Excel Binary Workbook"): + pl.read_excel(path_xlsx, engine="pyxlsb") + + with pytest.raises(Exception, match="does not support binary format"): + pl.read_excel(path_xlsb, engine="openpyxl") + + @pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) -def test_read_excel_all_sheets_with_sheet_name( - excel_file_path: Path, engine: str -) -> None: +def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) -> None: with pytest.raises( ValueError, match=r"cannot specify both `sheet_name` \('Sheet1'\) and `sheet_id` \(1\)", ): pl.read_excel( # type: ignore[call-overload] - excel_file_path, + path_xlsx, sheet_id=1, sheet_name="Sheet1", engine=engine, @@ -386,7 +433,9 @@ def test_excel_round_trip(write_params: dict[str, Any]) -> None: @pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) -def test_excel_compound_types(engine: Literal["xlsx2csv", "openpyxl"]) -> None: +def test_excel_compound_types( + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] +) -> None: df = pl.DataFrame( {"x": [[1, 2], [3, 4], [5, 6]], "y": ["a", "b", "c"], "z": [9, 8, 7]} ).select("x", pl.struct(["y", "z"])) @@ -403,7 +452,7 @@ def test_excel_compound_types(engine: Literal["xlsx2csv", "openpyxl"]) -> None: @pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) -def test_excel_sparklines(engine: Literal["xlsx2csv", "openpyxl"]) -> None: +def test_excel_sparklines(engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"]) -> None: from xlsxwriter import Workbook # note that we don't (quite) expect sparkline export to round-trip as we @@ -545,8 +594,9 @@ def test_excel_freeze_panes() -> None: @pytest.mark.parametrize( ("read_spreadsheet", "source"), [ - (pl.read_excel, "empty_excel_file_path"), - (pl.read_ods, "empty_openoffice_file_path"), + 
(pl.read_excel, "path_xlsx_empty"), + (pl.read_excel, "path_xlsb_empty"), + (pl.read_ods, "path_ods_empty"), ], ) def test_excel_empty_sheet( @@ -573,7 +623,7 @@ def test_excel_empty_sheet( ) def test_excel_hidden_columns( hidden_columns: list[str] | SelectorType, - engine: Literal["xlsx2csv", "openpyxl"], + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"], ) -> None: df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})