diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index c406883..50320e6 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -213,7 +213,7 @@ def load_sheet( schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, ) -> ExcelSheet: """Loads a sheet lazily by index or name. @@ -249,7 +249,8 @@ def load_sheet( `A,B,C,D,E` and `A,C,E,F`) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used - :param dtypes: An optional dict of dtypes. Keys can be column indices or names + :param dtypes: An optional dtype (for all columns) + or dict of dtypes with keys as column indices or names. """ return ExcelSheet( self._reader.load_sheet( @@ -288,7 +289,7 @@ def load_table( schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, eager: Literal[False] = ..., ) -> ExcelTable: ... @typing.overload @@ -303,7 +304,7 @@ def load_table( schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, eager: Literal[True] = ..., ) -> pa.RecordBatch: ... def load_table( @@ -317,7 +318,7 @@ def load_table( schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, eager: bool = False, ) -> ExcelTable | pa.RecordBatch: """Loads a table by name. @@ -351,7 +352,8 @@ def load_table( `A,B,C,D,E` and `A,C,E,F`) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used - :param dtypes: An optional dict of dtypes. Keys can be column indices or names + :param dtypes: An optional dtype (for all columns) + or dict of dtypes with keys as column indices or names. """ output = self._reader.load_table( # type:ignore[call-overload,misc] name=name, @@ -380,7 +382,7 @@ def load_sheet_eager( schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, ) -> pa.RecordBatch: """Loads a sheet eagerly by index or name. @@ -413,7 +415,7 @@ def load_sheet_by_name( schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, ) -> ExcelSheet: """Loads a sheet by name. @@ -442,7 +444,7 @@ def load_sheet_by_idx( schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, ) -> ExcelSheet: """Loads a sheet by index. diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 41a2b2e..1363116 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -8,7 +8,7 @@ import pyarrow as pa DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] DTypeMap = dict[str | int, DType] ColumnNameFrom = Literal["provided", "looked_up", "generated"] -DTypeFrom = Literal["provided_by_index", "provided_by_name", "guessed"] +DTypeFrom = Literal["provided_for_all", "provided_by_index", "provided_by_name", "guessed"] SheetVisible = Literal["visible", "hidden", "veryhidden"] class ColumnInfo: @@ -109,7 +109,7 @@ class _ExcelReader: schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, eager: Literal[False] = ..., ) -> _ExcelSheet: ... @typing.overload @@ -124,7 +124,7 @@ class _ExcelReader: schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, eager: Literal[True] = ..., ) -> pa.RecordBatch: ... @typing.overload @@ -139,7 +139,7 @@ class _ExcelReader: schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, eager: Literal[False] = ..., ) -> _ExcelTable: ... @typing.overload @@ -154,7 +154,7 @@ class _ExcelReader: schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, - dtypes: DTypeMap | None = None, + dtypes: DType | DTypeMap | None = None, eager: Literal[True] = ..., ) -> pa.RecordBatch: ... @property diff --git a/python/tests/test_dtypes.py b/python/tests/test_dtypes.py index 144dc82..85c7c01 100644 --- a/python/tests/test_dtypes.py +++ b/python/tests/test_dtypes.py @@ -257,3 +257,60 @@ def test_dtype_coercion_behavior__strict_sampling_limit(eager: bool) -> None: assert pl_df["Mixed dates"].to_list() == [datetime(2023, 7, 21)] * 6 + [None] * 3 assert pl_df["Asset ID"].dtype == pl.Float64 assert pl_df["Asset ID"].to_list() == [84444.0] * 7 + [None] * 2 + + +def test_one_dtype_for_all() -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx")) + sheet = excel_reader.load_sheet(0, dtypes="string") + assert sheet.available_columns == [ + fastexcel.ColumnInfo( + name="Employee ID", + index=0, + dtype="string", + dtype_from="provided_for_all", + column_name_from="looked_up", + ), + fastexcel.ColumnInfo( + name="Employee Name", + index=1, + dtype="string", + dtype_from="provided_for_all", + column_name_from="looked_up", + ), + fastexcel.ColumnInfo( + name="Date", + index=2, + dtype="string", + dtype_from="provided_for_all", + column_name_from="looked_up", + ), + fastexcel.ColumnInfo( + name="Details", + index=3, + dtype="string", + dtype_from="provided_for_all", + column_name_from="looked_up", + ), + fastexcel.ColumnInfo( + name="Asset ID", + index=4, + dtype="string", + dtype_from="provided_for_all", + column_name_from="looked_up", + ), + fastexcel.ColumnInfo( + name="Mixed dates", + index=5, + dtype="string", + dtype_from="provided_for_all", + column_name_from="looked_up", + ), + fastexcel.ColumnInfo( + name="Mixed bools", + index=6, + dtype="string", + dtype_from="provided_for_all", + column_name_from="looked_up", + ), + ] + assert sheet.to_polars().dtypes == [pl.String] * 7 diff --git a/src/types/dtype.rs b/src/types/dtype.rs index 27c254c..c7d630f 100644 --- a/src/types/dtype.rs +++ b/src/types/dtype.rs @@ -85,6 +85,39 @@ impl FromPyObject<'_> for DType { pub(crate) type DTypeMap = HashMap; +pub(crate) enum DTypes { + All(DType), + Map(DTypeMap), +} + +impl FromStr for DTypes { + type Err = FastExcelError; + + fn from_str(dtypes: &str) -> FastExcelResult { + Ok(DTypes::All(DType::from_str(dtypes)?)) + } +} + +impl FromPyObject<'_> for DTypes { + fn extract_bound(py_dtypes: &Bound<'_, PyAny>) -> PyResult { + if let Ok(py_dtypes_str) = py_dtypes.extract::() { + py_dtypes_str.parse() + } else { + Ok(DTypes::Map(py_dtypes.extract::()?)) + } + .into_pyresult() + } +} + +impl ToPyObject for DTypes { + fn to_object(&self, py: Python<'_>) -> PyObject { + match self { + DTypes::All(dtype) => dtype.to_object(py), + DTypes::Map(dtype_map) => dtype_map.to_object(py), + } + } +} + impl From<&DType> for ArrowDataType { fn from(dtype: &DType) -> Self { match dtype { diff --git a/src/types/python/excelreader.rs b/src/types/python/excelreader.rs index d0b0b69..0be6ca8 100644 --- a/src/types/python/excelreader.rs +++ b/src/types/python/excelreader.rs @@ -17,7 +17,7 @@ use crate::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, }, types::{ - dtype::{DTypeCoercion, DTypeMap}, + dtype::{DTypeCoercion, DTypes}, idx_or_name::IdxOrName, python::excelsheet::table::{extract_table_names, extract_table_range}, }, @@ -140,7 +140,7 @@ impl ExcelReader { header: Header, sample_rows: Option, selected_columns: &SelectedColumns, - dtypes: Option<&DTypeMap>, + dtypes: Option<&DTypes>, dtype_coercion: &DTypeCoercion, ) -> FastExcelResult { let offset = header.offset() + pagination.offset(); @@ -182,7 +182,7 @@ impl ExcelReader { schema_sample_rows: Option, dtype_coercion: DTypeCoercion, use_columns: Option<&Bound<'_, PyAny>>, - dtypes: Option, + dtypes: Option, eager: bool, py: Python<'_>, ) -> PyResult { @@ -258,7 +258,7 @@ impl ExcelReader { schema_sample_rows: Option, dtype_coercion: DTypeCoercion, use_columns: Option<&Bound<'_, PyAny>>, - dtypes: Option, + dtypes: Option, eager: bool, py: Python<'_>, ) -> PyResult { @@ -346,7 +346,7 @@ impl ExcelReader { schema_sample_rows: Option, dtype_coercion: DTypeCoercion, use_columns: Option<&Bound<'_, PyAny>>, - dtypes: Option, + dtypes: Option, eager: bool, py: Python<'_>, ) -> PyResult { @@ -416,7 +416,7 @@ impl ExcelReader { schema_sample_rows: Option, dtype_coercion: DTypeCoercion, use_columns: Option<&Bound<'_, PyAny>>, - dtypes: Option, + dtypes: Option, eager: bool, py: Python<'_>, ) -> PyResult { diff --git a/src/types/python/excelsheet/column_info.rs b/src/types/python/excelsheet/column_info.rs index a8c5b06..c3c3978 100644 --- a/src/types/python/excelsheet/column_info.rs +++ b/src/types/python/excelsheet/column_info.rs @@ -10,7 +10,7 @@ use crate::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, }, types::{ - dtype::{get_dtype_for_column, DType, DTypeCoercion, DTypeMap}, + dtype::{get_dtype_for_column, DType, DTypeCoercion, DTypes}, idx_or_name::IdxOrName, }, }; @@ -52,6 +52,7 @@ impl Display for ColumnNameFrom { #[derive(Debug, Clone, PartialEq)] pub(crate) enum DTypeFrom { + ProvidedForAll, ProvidedByIndex, ProvidedByName, Guessed, @@ -60,6 +61,7 @@ pub(crate) enum DTypeFrom { impl Display for DTypeFrom { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(match self { + DTypeFrom::ProvidedForAll => "provided_for_all", DTypeFrom::ProvidedByIndex => "provided_by_index", DTypeFrom::ProvidedByName => "provided_by_name", DTypeFrom::Guessed => "guessed", @@ -72,6 +74,7 @@ impl FromStr for DTypeFrom { fn from_str(s: &str) -> FastExcelResult { match s { + "provided_for_all" => Ok(Self::ProvidedForAll), "provided_by_index" => Ok(Self::ProvidedByIndex), "provided_by_name" => Ok(Self::ProvidedByName), "guessed" => Ok(Self::Guessed), @@ -238,19 +241,24 @@ impl ColumnInfoBuilder { data: &D, start_row: usize, end_row: usize, - specified_dtypes: Option<&DTypeMap>, + specified_dtypes: Option<&DTypes>, dtype_coercion: &DTypeCoercion, ) -> FastExcelResult<(DType, DTypeFrom)> { specified_dtypes .and_then(|dtypes| { - // if we have dtypes, look the dtype up by index, and fall back on a lookup by name - // (done in this order because copying an usize is cheaper than cloning a string) - if let Some(dtype) = dtypes.get(&self.index.into()) { - Some((*dtype, DTypeFrom::ProvidedByIndex)) - } else { - dtypes - .get(&self.name.clone().into()) - .map(|dtype| (*dtype, DTypeFrom::ProvidedByName)) + match dtypes { + DTypes::All(dtype) => Some((*dtype, DTypeFrom::ProvidedForAll)), + DTypes::Map(dtypes) => { + // if we have dtypes, look the dtype up by index, and fall back on a lookup by name + // (done in this order because copying an usize is cheaper than cloning a string) + if let Some(dtype) = dtypes.get(&self.index.into()) { + Some((*dtype, DTypeFrom::ProvidedByIndex)) + } else { + dtypes + .get(&self.name.clone().into()) + .map(|dtype| (*dtype, DTypeFrom::ProvidedByName)) + } + } } }) .map(FastExcelResult::Ok) @@ -266,7 +274,7 @@ impl ColumnInfoBuilder { data: &D, start_row: usize, end_row: usize, - specified_dtypes: Option<&DTypeMap>, + specified_dtypes: Option<&DTypes>, dtype_coercion: &DTypeCoercion, ) -> FastExcelResult { let (dtype, dtype_from) = self @@ -456,7 +464,7 @@ pub(crate) fn build_available_columns( data: &D, start_row: usize, end_row: usize, - specified_dtypes: Option<&DTypeMap>, + specified_dtypes: Option<&DTypes>, dtype_coercion: &DTypeCoercion, ) -> FastExcelResult> { let mut aliased_available_columns = Vec::with_capacity(available_columns_info.len()); diff --git a/src/types/python/excelsheet/mod.rs b/src/types/python/excelsheet/mod.rs index 9082c85..52d7b55 100644 --- a/src/types/python/excelsheet/mod.rs +++ b/src/types/python/excelsheet/mod.rs @@ -17,7 +17,7 @@ use crate::{ error::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, }, - types::{dtype::DTypeMap, idx_or_name::IdxOrName}, + types::{dtype::DTypes, idx_or_name::IdxOrName}, }; use crate::{types::dtype::DTypeCoercion, utils::schema::get_schema_sample_rows}; @@ -347,7 +347,7 @@ pub(crate) struct ExcelSheet { dtype_coercion: DTypeCoercion, selected_columns: Vec, available_columns: Vec, - dtypes: Option, + dtypes: Option, } impl ExcelSheet { @@ -364,7 +364,7 @@ impl ExcelSheet { schema_sample_rows: Option, dtype_coercion: DTypeCoercion, selected_columns: SelectedColumns, - dtypes: Option, + dtypes: Option, ) -> FastExcelResult { let available_columns_info = build_available_columns_info(&data, &selected_columns, &header)?; diff --git a/src/types/python/table.rs b/src/types/python/table.rs index b3df0dc..4aa2339 100644 --- a/src/types/python/table.rs +++ b/src/types/python/table.rs @@ -18,7 +18,7 @@ use crate::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, }, types::{ - dtype::{DType, DTypeCoercion, DTypeMap}, + dtype::{DType, DTypeCoercion, DTypes}, python::excelsheet::column_info::build_available_columns, }, utils::schema::get_schema_sample_rows, @@ -40,7 +40,7 @@ pub(crate) struct ExcelTable { table: Table, header: Header, pagination: Pagination, - dtypes: Option, + dtypes: Option, dtype_coercion: DTypeCoercion, height: Option, total_height: Option, @@ -55,7 +55,7 @@ impl ExcelTable { schema_sample_rows: Option, dtype_coercion: DTypeCoercion, selected_columns: SelectedColumns, - dtypes: Option, + dtypes: Option, ) -> FastExcelResult { let available_columns_info = build_available_columns_info(table.data(), &selected_columns, &header)?;