feat: add support for one dtype for all columns (#299)

* feat: add support for one dtype for all columns
* test: check column info
ToucanToco · Oct 14, 2024 · a3dae6d
1 parent 4105aed
commit a3dae6d
Show file tree

Hide file tree

Showing 8 changed files with 138 additions and 38 deletions.
diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -213,7 +213,7 @@ def load_sheet(
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
     ) -> ExcelSheet:
         """Loads a sheet lazily by index or name.
 
@@ -249,7 +249,8 @@ def load_sheet(
                               `A,B,C,D,E` and `A,C,E,F`)
                             - A callable, a function that takes a column and returns a boolean
                               indicating whether the column should be used
-        :param dtypes: An optional dict of dtypes. Keys can be column indices or names
+        :param dtypes: An optional dtype (for all columns)
+                       or dict of dtypes with keys as column indices or names.
         """
         return ExcelSheet(
             self._reader.load_sheet(
@@ -288,7 +289,7 @@ def load_table(
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
         eager: Literal[False] = ...,
     ) -> ExcelTable: ...
     @typing.overload
@@ -303,7 +304,7 @@ def load_table(
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
         eager: Literal[True] = ...,
     ) -> pa.RecordBatch: ...
     def load_table(
@@ -317,7 +318,7 @@ def load_table(
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
         eager: bool = False,
     ) -> ExcelTable | pa.RecordBatch:
         """Loads a table by name.
@@ -351,7 +352,8 @@ def load_table(
                               `A,B,C,D,E` and `A,C,E,F`)
                             - A callable, a function that takes a column and returns a boolean
                               indicating whether the column should be used
-        :param dtypes: An optional dict of dtypes. Keys can be column indices or names
+        :param dtypes: An optional dtype (for all columns)
+                       or dict of dtypes with keys as column indices or names.
         """
         output = self._reader.load_table(  # type:ignore[call-overload,misc]
             name=name,
@@ -380,7 +382,7 @@ def load_sheet_eager(
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
     ) -> pa.RecordBatch:
         """Loads a sheet eagerly by index or name.
 
@@ -413,7 +415,7 @@ def load_sheet_by_name(
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
     ) -> ExcelSheet:
         """Loads a sheet by name.
 
@@ -442,7 +444,7 @@ def load_sheet_by_idx(
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
     ) -> ExcelSheet:
         """Loads a sheet by index.
 

diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -8,7 +8,7 @@ import pyarrow as pa
 DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
 DTypeMap = dict[str | int, DType]
 ColumnNameFrom = Literal["provided", "looked_up", "generated"]
-DTypeFrom = Literal["provided_by_index", "provided_by_name", "guessed"]
+DTypeFrom = Literal["provided_for_all", "provided_by_index", "provided_by_name", "guessed"]
 SheetVisible = Literal["visible", "hidden", "veryhidden"]
 
 class ColumnInfo:
@@ -109,7 +109,7 @@ class _ExcelReader:
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
         eager: Literal[False] = ...,
     ) -> _ExcelSheet: ...
     @typing.overload
@@ -124,7 +124,7 @@ class _ExcelReader:
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
         eager: Literal[True] = ...,
     ) -> pa.RecordBatch: ...
     @typing.overload
@@ -139,7 +139,7 @@ class _ExcelReader:
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
         eager: Literal[False] = ...,
     ) -> _ExcelTable: ...
     @typing.overload
@@ -154,7 +154,7 @@ class _ExcelReader:
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
-        dtypes: DTypeMap | None = None,
+        dtypes: DType | DTypeMap | None = None,
         eager: Literal[True] = ...,
     ) -> pa.RecordBatch: ...
     @property

diff --git a/python/tests/test_dtypes.py b/python/tests/test_dtypes.py
@@ -257,3 +257,60 @@ def test_dtype_coercion_behavior__strict_sampling_limit(eager: bool) -> None:
     assert pl_df["Mixed dates"].to_list() == [datetime(2023, 7, 21)] * 6 + [None] * 3
     assert pl_df["Asset ID"].dtype == pl.Float64
     assert pl_df["Asset ID"].to_list() == [84444.0] * 7 + [None] * 2
+
+
+def test_one_dtype_for_all() -> None:
+    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
+    sheet = excel_reader.load_sheet(0, dtypes="string")
+    assert sheet.available_columns == [
+        fastexcel.ColumnInfo(
+            name="Employee ID",
+            index=0,
+            dtype="string",
+            dtype_from="provided_for_all",
+            column_name_from="looked_up",
+        ),
+        fastexcel.ColumnInfo(
+            name="Employee Name",
+            index=1,
+            dtype="string",
+            dtype_from="provided_for_all",
+            column_name_from="looked_up",
+        ),
+        fastexcel.ColumnInfo(
+            name="Date",
+            index=2,
+            dtype="string",
+            dtype_from="provided_for_all",
+            column_name_from="looked_up",
+        ),
+        fastexcel.ColumnInfo(
+            name="Details",
+            index=3,
+            dtype="string",
+            dtype_from="provided_for_all",
+            column_name_from="looked_up",
+        ),
+        fastexcel.ColumnInfo(
+            name="Asset ID",
+            index=4,
+            dtype="string",
+            dtype_from="provided_for_all",
+            column_name_from="looked_up",
+        ),
+        fastexcel.ColumnInfo(
+            name="Mixed dates",
+            index=5,
+            dtype="string",
+            dtype_from="provided_for_all",
+            column_name_from="looked_up",
+        ),
+        fastexcel.ColumnInfo(
+            name="Mixed bools",
+            index=6,
+            dtype="string",
+            dtype_from="provided_for_all",
+            column_name_from="looked_up",
+        ),
+    ]
+    assert sheet.to_polars().dtypes == [pl.String] * 7
diff --git a/src/types/dtype.rs b/src/types/dtype.rs
@@ -85,6 +85,39 @@ impl FromPyObject<'_> for DType {
 
 pub(crate) type DTypeMap = HashMap<IdxOrName, DType>;
 
+pub(crate) enum DTypes {
+    All(DType),
+    Map(DTypeMap),
+}
+
+impl FromStr for DTypes {
+    type Err = FastExcelError;
+
+    fn from_str(dtypes: &str) -> FastExcelResult<Self> {
+        Ok(DTypes::All(DType::from_str(dtypes)?))
+    }
+}
+
+impl FromPyObject<'_> for DTypes {
+    fn extract_bound(py_dtypes: &Bound<'_, PyAny>) -> PyResult<Self> {
+        if let Ok(py_dtypes_str) = py_dtypes.extract::<String>() {
+            py_dtypes_str.parse()
+        } else {
+            Ok(DTypes::Map(py_dtypes.extract::<DTypeMap>()?))
+        }
+        .into_pyresult()
+    }
+}
+
+impl ToPyObject for DTypes {
+    fn to_object(&self, py: Python<'_>) -> PyObject {
+        match self {
+            DTypes::All(dtype) => dtype.to_object(py),
+            DTypes::Map(dtype_map) => dtype_map.to_object(py),
+        }
+    }
+}
+
 impl From<&DType> for ArrowDataType {
     fn from(dtype: &DType) -> Self {
         match dtype {

diff --git a/src/types/python/excelreader.rs b/src/types/python/excelreader.rs
@@ -17,7 +17,7 @@ use crate::{
         py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
     },
     types::{
-        dtype::{DTypeCoercion, DTypeMap},
+        dtype::{DTypeCoercion, DTypes},
         idx_or_name::IdxOrName,
         python::excelsheet::table::{extract_table_names, extract_table_range},
     },
@@ -140,7 +140,7 @@ impl ExcelReader {
         header: Header,
         sample_rows: Option<usize>,
         selected_columns: &SelectedColumns,
-        dtypes: Option<&DTypeMap>,
+        dtypes: Option<&DTypes>,
         dtype_coercion: &DTypeCoercion,
     ) -> FastExcelResult<RecordBatch> {
         let offset = header.offset() + pagination.offset();
@@ -182,7 +182,7 @@ impl ExcelReader {
         schema_sample_rows: Option<usize>,
         dtype_coercion: DTypeCoercion,
         use_columns: Option<&Bound<'_, PyAny>>,
-        dtypes: Option<DTypeMap>,
+        dtypes: Option<DTypes>,
         eager: bool,
         py: Python<'_>,
     ) -> PyResult<PyObject> {
@@ -258,7 +258,7 @@ impl ExcelReader {
         schema_sample_rows: Option<usize>,
         dtype_coercion: DTypeCoercion,
         use_columns: Option<&Bound<'_, PyAny>>,
-        dtypes: Option<DTypeMap>,
+        dtypes: Option<DTypes>,
         eager: bool,
         py: Python<'_>,
     ) -> PyResult<PyObject> {
@@ -346,7 +346,7 @@ impl ExcelReader {
         schema_sample_rows: Option<usize>,
         dtype_coercion: DTypeCoercion,
         use_columns: Option<&Bound<'_, PyAny>>,
-        dtypes: Option<DTypeMap>,
+        dtypes: Option<DTypes>,
         eager: bool,
         py: Python<'_>,
     ) -> PyResult<PyObject> {
@@ -416,7 +416,7 @@ impl ExcelReader {
         schema_sample_rows: Option<usize>,
         dtype_coercion: DTypeCoercion,
         use_columns: Option<&Bound<'_, PyAny>>,
-        dtypes: Option<DTypeMap>,
+        dtypes: Option<DTypes>,
         eager: bool,
         py: Python<'_>,
     ) -> PyResult<PyObject> {

diff --git a/src/types/python/excelsheet/column_info.rs b/src/types/python/excelsheet/column_info.rs
@@ -10,7 +10,7 @@ use crate::{
         py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
     },
     types::{
-        dtype::{get_dtype_for_column, DType, DTypeCoercion, DTypeMap},
+        dtype::{get_dtype_for_column, DType, DTypeCoercion, DTypes},
         idx_or_name::IdxOrName,
     },
 };
@@ -52,6 +52,7 @@ impl Display for ColumnNameFrom {
 
 #[derive(Debug, Clone, PartialEq)]
 pub(crate) enum DTypeFrom {
+    ProvidedForAll,
     ProvidedByIndex,
     ProvidedByName,
     Guessed,
@@ -60,6 +61,7 @@ pub(crate) enum DTypeFrom {
 impl Display for DTypeFrom {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.write_str(match self {
+            DTypeFrom::ProvidedForAll => "provided_for_all",
             DTypeFrom::ProvidedByIndex => "provided_by_index",
             DTypeFrom::ProvidedByName => "provided_by_name",
             DTypeFrom::Guessed => "guessed",
@@ -72,6 +74,7 @@ impl FromStr for DTypeFrom {
 
     fn from_str(s: &str) -> FastExcelResult<Self> {
         match s {
+            "provided_for_all" => Ok(Self::ProvidedForAll),
             "provided_by_index" => Ok(Self::ProvidedByIndex),
             "provided_by_name" => Ok(Self::ProvidedByName),
             "guessed" => Ok(Self::Guessed),
@@ -238,19 +241,24 @@ impl ColumnInfoBuilder {
         data: &D,
         start_row: usize,
         end_row: usize,
-        specified_dtypes: Option<&DTypeMap>,
+        specified_dtypes: Option<&DTypes>,
         dtype_coercion: &DTypeCoercion,
     ) -> FastExcelResult<(DType, DTypeFrom)> {
         specified_dtypes
             .and_then(|dtypes| {
-                // if we have dtypes, look the dtype up by index, and fall back on a lookup by name
-                // (done in this order because copying an usize is cheaper than cloning a string)
-                if let Some(dtype) = dtypes.get(&self.index.into()) {
-                    Some((*dtype, DTypeFrom::ProvidedByIndex))
-                } else {
-                    dtypes
-                        .get(&self.name.clone().into())
-                        .map(|dtype| (*dtype, DTypeFrom::ProvidedByName))
+                match dtypes {
+                    DTypes::All(dtype) => Some((*dtype, DTypeFrom::ProvidedForAll)),
+                    DTypes::Map(dtypes) => {
+                        // if we have dtypes, look the dtype up by index, and fall back on a lookup by name
+                        // (done in this order because copying an usize is cheaper than cloning a string)
+                        if let Some(dtype) = dtypes.get(&self.index.into()) {
+                            Some((*dtype, DTypeFrom::ProvidedByIndex))
+                        } else {
+                            dtypes
+                                .get(&self.name.clone().into())
+                                .map(|dtype| (*dtype, DTypeFrom::ProvidedByName))
+                        }
+                    }
                 }
             })
             .map(FastExcelResult::Ok)
@@ -266,7 +274,7 @@ impl ColumnInfoBuilder {
         data: &D,
         start_row: usize,
         end_row: usize,
-        specified_dtypes: Option<&DTypeMap>,
+        specified_dtypes: Option<&DTypes>,
         dtype_coercion: &DTypeCoercion,
     ) -> FastExcelResult<ColumnInfo> {
         let (dtype, dtype_from) = self
@@ -456,7 +464,7 @@ pub(crate) fn build_available_columns<D: CalamineDataProvider>(
     data: &D,
     start_row: usize,
     end_row: usize,
-    specified_dtypes: Option<&DTypeMap>,
+    specified_dtypes: Option<&DTypes>,
     dtype_coercion: &DTypeCoercion,
 ) -> FastExcelResult<Vec<ColumnInfo>> {
     let mut aliased_available_columns = Vec::with_capacity(available_columns_info.len());

diff --git a/src/types/python/excelsheet/mod.rs b/src/types/python/excelsheet/mod.rs
@@ -17,7 +17,7 @@ use crate::{
     error::{
         py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
     },
-    types::{dtype::DTypeMap, idx_or_name::IdxOrName},
+    types::{dtype::DTypes, idx_or_name::IdxOrName},
 };
 use crate::{types::dtype::DTypeCoercion, utils::schema::get_schema_sample_rows};
 
@@ -347,7 +347,7 @@ pub(crate) struct ExcelSheet {
     dtype_coercion: DTypeCoercion,
     selected_columns: Vec<ColumnInfo>,
     available_columns: Vec<ColumnInfo>,
-    dtypes: Option<DTypeMap>,
+    dtypes: Option<DTypes>,
 }
 
 impl ExcelSheet {
@@ -364,7 +364,7 @@ impl ExcelSheet {
         schema_sample_rows: Option<usize>,
         dtype_coercion: DTypeCoercion,
         selected_columns: SelectedColumns,
-        dtypes: Option<DTypeMap>,
+        dtypes: Option<DTypes>,
     ) -> FastExcelResult<Self> {
         let available_columns_info =
             build_available_columns_info(&data, &selected_columns, &header)?;