Skip to content

Commit

Permalink
feat: add support for one dtype for all columns (#299)
Browse files Browse the repository at this point in the history
* feat: add support for one dtype for all columns

* test: check column info
  • Loading branch information
PrettyWood authored Oct 14, 2024
1 parent 4105aed commit a3dae6d
Show file tree
Hide file tree
Showing 8 changed files with 138 additions and 38 deletions.
20 changes: 11 additions & 9 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def load_sheet(
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet lazily by index or name.
Expand Down Expand Up @@ -249,7 +249,8 @@ def load_sheet(
`A,B,C,D,E` and `A,C,E,F`)
- A callable, a function that takes a column and returns a boolean
indicating whether the column should be used
:param dtypes: An optional dict of dtypes. Keys can be column indices or names
:param dtypes: An optional dtype (for all columns)
or dict of dtypes with keys as column indices or names.
"""
return ExcelSheet(
self._reader.load_sheet(
Expand Down Expand Up @@ -288,7 +289,7 @@ def load_table(
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[False] = ...,
) -> ExcelTable: ...
@typing.overload
Expand All @@ -303,7 +304,7 @@ def load_table(
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[True] = ...,
) -> pa.RecordBatch: ...
def load_table(
Expand All @@ -317,7 +318,7 @@ def load_table(
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
eager: bool = False,
) -> ExcelTable | pa.RecordBatch:
"""Loads a table by name.
Expand Down Expand Up @@ -351,7 +352,8 @@ def load_table(
`A,B,C,D,E` and `A,C,E,F`)
- A callable, a function that takes a column and returns a boolean
indicating whether the column should be used
:param dtypes: An optional dict of dtypes. Keys can be column indices or names
:param dtypes: An optional dtype (for all columns)
or dict of dtypes with keys as column indices or names.
"""
output = self._reader.load_table( # type:ignore[call-overload,misc]
name=name,
Expand Down Expand Up @@ -380,7 +382,7 @@ def load_sheet_eager(
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
) -> pa.RecordBatch:
"""Loads a sheet eagerly by index or name.
Expand Down Expand Up @@ -413,7 +415,7 @@ def load_sheet_by_name(
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by name.
Expand Down Expand Up @@ -442,7 +444,7 @@ def load_sheet_by_idx(
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by index.
Expand Down
10 changes: 5 additions & 5 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import pyarrow as pa
DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
DTypeMap = dict[str | int, DType]
ColumnNameFrom = Literal["provided", "looked_up", "generated"]
DTypeFrom = Literal["provided_by_index", "provided_by_name", "guessed"]
DTypeFrom = Literal["provided_for_all", "provided_by_index", "provided_by_name", "guessed"]
SheetVisible = Literal["visible", "hidden", "veryhidden"]

class ColumnInfo:
Expand Down Expand Up @@ -109,7 +109,7 @@ class _ExcelReader:
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[False] = ...,
) -> _ExcelSheet: ...
@typing.overload
Expand All @@ -124,7 +124,7 @@ class _ExcelReader:
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[True] = ...,
) -> pa.RecordBatch: ...
@typing.overload
Expand All @@ -139,7 +139,7 @@ class _ExcelReader:
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[False] = ...,
) -> _ExcelTable: ...
@typing.overload
Expand All @@ -154,7 +154,7 @@ class _ExcelReader:
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[True] = ...,
) -> pa.RecordBatch: ...
@property
Expand Down
57 changes: 57 additions & 0 deletions python/tests/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,60 @@ def test_dtype_coercion_behavior__strict_sampling_limit(eager: bool) -> None:
assert pl_df["Mixed dates"].to_list() == [datetime(2023, 7, 21)] * 6 + [None] * 3
assert pl_df["Asset ID"].dtype == pl.Float64
assert pl_df["Asset ID"].to_list() == [84444.0] * 7 + [None] * 2


def test_one_dtype_for_all() -> None:
    """A single dtype passed as ``dtypes`` is reported and applied for every column."""
    # Column names of the fixture's first sheet, in positional order.
    column_names = [
        "Employee ID",
        "Employee Name",
        "Date",
        "Details",
        "Asset ID",
        "Mixed dates",
        "Mixed bools",
    ]
    # Every column is expected to carry the same dtype and the same dtype origin.
    expected_columns = [
        fastexcel.ColumnInfo(
            name=column_name,
            index=column_index,
            dtype="string",
            dtype_from="provided_for_all",
            column_name_from="looked_up",
        )
        for column_index, column_name in enumerate(column_names)
    ]

    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0, dtypes="string")

    assert sheet.available_columns == expected_columns
    assert sheet.to_polars().dtypes == [pl.String] * len(column_names)
33 changes: 33 additions & 0 deletions src/types/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,39 @@ impl FromPyObject<'_> for DType {

pub(crate) type DTypeMap = HashMap<IdxOrName, DType>;

/// User-provided dtype specification: either one dtype for all columns,
/// or a map of dtypes for individual columns.
pub(crate) enum DTypes {
/// A single dtype used for every column.
All(DType),
/// Dtypes for specific columns, keyed by column index or name.
Map(DTypeMap),
}

impl FromStr for DTypes {
    type Err = FastExcelError;

    /// Parses a single dtype name and wraps it as the dtype for all columns.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `DType::from_str` when the name cannot be parsed.
    fn from_str(dtypes: &str) -> FastExcelResult<Self> {
        let dtype_for_all = DType::from_str(dtypes)?;
        Ok(Self::All(dtype_for_all))
    }
}

impl FromPyObject<'_> for DTypes {
    /// Extracts a dtype specification from a Python object.
    ///
    /// A Python string is parsed as one dtype for all columns; anything else is
    /// extracted as a dtype map.
    ///
    /// # Errors
    ///
    /// Returns the converted parse error for a string that is not a valid dtype,
    /// or the extraction error when the object cannot be read as a dtype map.
    fn extract_bound(py_dtypes: &Bound<'_, PyAny>) -> PyResult<Self> {
        match py_dtypes.extract::<String>() {
            // String input: a single dtype name.
            Ok(dtype_name) => dtype_name.parse::<Self>().into_pyresult(),
            // Not a string: fall back to the per-column map.
            Err(_) => py_dtypes.extract::<DTypeMap>().map(Self::Map),
        }
    }
}

impl ToPyObject for DTypes {
    /// Converts to a Python object by delegating to the wrapped value:
    /// the single dtype for `All`, the dtype map for `Map`.
    fn to_object(&self, py: Python<'_>) -> PyObject {
        match self {
            Self::All(single_dtype) => single_dtype.to_object(py),
            Self::Map(per_column) => per_column.to_object(py),
        }
    }
}

impl From<&DType> for ArrowDataType {
fn from(dtype: &DType) -> Self {
match dtype {
Expand Down
12 changes: 6 additions & 6 deletions src/types/python/excelreader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use crate::{
py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
},
types::{
dtype::{DTypeCoercion, DTypeMap},
dtype::{DTypeCoercion, DTypes},
idx_or_name::IdxOrName,
python::excelsheet::table::{extract_table_names, extract_table_range},
},
Expand Down Expand Up @@ -140,7 +140,7 @@ impl ExcelReader {
header: Header,
sample_rows: Option<usize>,
selected_columns: &SelectedColumns,
dtypes: Option<&DTypeMap>,
dtypes: Option<&DTypes>,
dtype_coercion: &DTypeCoercion,
) -> FastExcelResult<RecordBatch> {
let offset = header.offset() + pagination.offset();
Expand Down Expand Up @@ -182,7 +182,7 @@ impl ExcelReader {
schema_sample_rows: Option<usize>,
dtype_coercion: DTypeCoercion,
use_columns: Option<&Bound<'_, PyAny>>,
dtypes: Option<DTypeMap>,
dtypes: Option<DTypes>,
eager: bool,
py: Python<'_>,
) -> PyResult<PyObject> {
Expand Down Expand Up @@ -258,7 +258,7 @@ impl ExcelReader {
schema_sample_rows: Option<usize>,
dtype_coercion: DTypeCoercion,
use_columns: Option<&Bound<'_, PyAny>>,
dtypes: Option<DTypeMap>,
dtypes: Option<DTypes>,
eager: bool,
py: Python<'_>,
) -> PyResult<PyObject> {
Expand Down Expand Up @@ -346,7 +346,7 @@ impl ExcelReader {
schema_sample_rows: Option<usize>,
dtype_coercion: DTypeCoercion,
use_columns: Option<&Bound<'_, PyAny>>,
dtypes: Option<DTypeMap>,
dtypes: Option<DTypes>,
eager: bool,
py: Python<'_>,
) -> PyResult<PyObject> {
Expand Down Expand Up @@ -416,7 +416,7 @@ impl ExcelReader {
schema_sample_rows: Option<usize>,
dtype_coercion: DTypeCoercion,
use_columns: Option<&Bound<'_, PyAny>>,
dtypes: Option<DTypeMap>,
dtypes: Option<DTypes>,
eager: bool,
py: Python<'_>,
) -> PyResult<PyObject> {
Expand Down
32 changes: 20 additions & 12 deletions src/types/python/excelsheet/column_info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::{
py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
},
types::{
dtype::{get_dtype_for_column, DType, DTypeCoercion, DTypeMap},
dtype::{get_dtype_for_column, DType, DTypeCoercion, DTypes},
idx_or_name::IdxOrName,
},
};
Expand Down Expand Up @@ -52,6 +52,7 @@ impl Display for ColumnNameFrom {

#[derive(Debug, Clone, PartialEq)]
pub(crate) enum DTypeFrom {
ProvidedForAll,
ProvidedByIndex,
ProvidedByName,
Guessed,
Expand All @@ -60,6 +61,7 @@ pub(crate) enum DTypeFrom {
impl Display for DTypeFrom {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(match self {
DTypeFrom::ProvidedForAll => "provided_for_all",
DTypeFrom::ProvidedByIndex => "provided_by_index",
DTypeFrom::ProvidedByName => "provided_by_name",
DTypeFrom::Guessed => "guessed",
Expand All @@ -72,6 +74,7 @@ impl FromStr for DTypeFrom {

fn from_str(s: &str) -> FastExcelResult<Self> {
match s {
"provided_for_all" => Ok(Self::ProvidedForAll),
"provided_by_index" => Ok(Self::ProvidedByIndex),
"provided_by_name" => Ok(Self::ProvidedByName),
"guessed" => Ok(Self::Guessed),
Expand Down Expand Up @@ -238,19 +241,24 @@ impl ColumnInfoBuilder {
data: &D,
start_row: usize,
end_row: usize,
specified_dtypes: Option<&DTypeMap>,
specified_dtypes: Option<&DTypes>,
dtype_coercion: &DTypeCoercion,
) -> FastExcelResult<(DType, DTypeFrom)> {
specified_dtypes
.and_then(|dtypes| {
// if we have dtypes, look the dtype up by index, and fall back on a lookup by name
// (done in this order because copying an usize is cheaper than cloning a string)
if let Some(dtype) = dtypes.get(&self.index.into()) {
Some((*dtype, DTypeFrom::ProvidedByIndex))
} else {
dtypes
.get(&self.name.clone().into())
.map(|dtype| (*dtype, DTypeFrom::ProvidedByName))
match dtypes {
DTypes::All(dtype) => Some((*dtype, DTypeFrom::ProvidedForAll)),
DTypes::Map(dtypes) => {
// if we have dtypes, look the dtype up by index, and fall back on a lookup by name
// (done in this order because copying an usize is cheaper than cloning a string)
if let Some(dtype) = dtypes.get(&self.index.into()) {
Some((*dtype, DTypeFrom::ProvidedByIndex))
} else {
dtypes
.get(&self.name.clone().into())
.map(|dtype| (*dtype, DTypeFrom::ProvidedByName))
}
}
}
})
.map(FastExcelResult::Ok)
Expand All @@ -266,7 +274,7 @@ impl ColumnInfoBuilder {
data: &D,
start_row: usize,
end_row: usize,
specified_dtypes: Option<&DTypeMap>,
specified_dtypes: Option<&DTypes>,
dtype_coercion: &DTypeCoercion,
) -> FastExcelResult<ColumnInfo> {
let (dtype, dtype_from) = self
Expand Down Expand Up @@ -456,7 +464,7 @@ pub(crate) fn build_available_columns<D: CalamineDataProvider>(
data: &D,
start_row: usize,
end_row: usize,
specified_dtypes: Option<&DTypeMap>,
specified_dtypes: Option<&DTypes>,
dtype_coercion: &DTypeCoercion,
) -> FastExcelResult<Vec<ColumnInfo>> {
let mut aliased_available_columns = Vec::with_capacity(available_columns_info.len());
Expand Down
6 changes: 3 additions & 3 deletions src/types/python/excelsheet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use crate::{
error::{
py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
},
types::{dtype::DTypeMap, idx_or_name::IdxOrName},
types::{dtype::DTypes, idx_or_name::IdxOrName},
};
use crate::{types::dtype::DTypeCoercion, utils::schema::get_schema_sample_rows};

Expand Down Expand Up @@ -347,7 +347,7 @@ pub(crate) struct ExcelSheet {
dtype_coercion: DTypeCoercion,
selected_columns: Vec<ColumnInfo>,
available_columns: Vec<ColumnInfo>,
dtypes: Option<DTypeMap>,
dtypes: Option<DTypes>,
}

impl ExcelSheet {
Expand All @@ -364,7 +364,7 @@ impl ExcelSheet {
schema_sample_rows: Option<usize>,
dtype_coercion: DTypeCoercion,
selected_columns: SelectedColumns,
dtypes: Option<DTypeMap>,
dtypes: Option<DTypes>,
) -> FastExcelResult<Self> {
let available_columns_info =
build_available_columns_info(&data, &selected_columns, &header)?;
Expand Down
Loading

0 comments on commit a3dae6d

Please sign in to comment.