Skip to content

Commit

Permalink
fix: header_row is now the exact index of the sheet (#297)
Browse files Browse the repository at this point in the history
  • Loading branch information
PrettyWood authored Oct 14, 2024
1 parent 8c9559f commit 4105aed
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 26 deletions.
15 changes: 9 additions & 6 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def load_sheet(
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
skip_rows: int | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
Expand All @@ -227,8 +227,11 @@ def load_sheet(
If `None`, all rows are loaded
:param skip_rows: Specifies how many rows should be skipped after the `header_row`.
Any rows before the `header_row` are automatically skipped.
If `header_row` is `None`, it skips the number of rows from the
start of the sheet.
If `header_row` is `None`:
- if `skip_rows` is `None` (default): it skips all empty rows
at the beginning of the sheet.
- if `skip_rows` is a number, it skips the specified number
of rows from the start of the sheet.
:param schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column.
If `None`, all rows will be used.
Expand Down Expand Up @@ -372,7 +375,7 @@ def load_sheet_eager(
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
skip_rows: int | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
Expand Down Expand Up @@ -405,7 +408,7 @@ def load_sheet_by_name(
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
skip_rows: int | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
Expand Down Expand Up @@ -434,7 +437,7 @@ def load_sheet_by_idx(
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
skip_rows: int | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
Expand Down
4 changes: 2 additions & 2 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ class _ExcelReader:
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
skip_rows: int | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
Expand All @@ -119,7 +119,7 @@ class _ExcelReader:
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
skip_rows: int | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
Expand Down
Binary file added python/tests/fixtures/no-header.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion python/tests/test_durations.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_sheet_with_different_time_types() -> None:

def test_sheet_with_offset_header_row_and_durations() -> None:
excel_reader = fastexcel.read_excel(path_for_fixture("single-sheet-skip-rows-durations.xlsx"))
sheet = excel_reader.load_sheet(0, header_row=9)
sheet = excel_reader.load_sheet(0, header_row=10)

pd_df = sheet.to_pandas()
pl_df = sheet.to_polars()
Expand Down
31 changes: 29 additions & 2 deletions python/tests/test_fastexcel.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,8 @@ def test_sheets_with_custom_headers():
def test_sheets_with_skipping_headers():
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
assert excel_reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]
sheet_by_name = excel_reader.load_sheet("Sheet2", header_row=1, column_names=["Bugs"])
sheet_by_idx = excel_reader.load_sheet(1, header_row=1, column_names=["Bugs"])
sheet_by_name = excel_reader.load_sheet("Sheet2", header_row=None, column_names=["Bugs"])
sheet_by_idx = excel_reader.load_sheet(1, header_row=None, column_names=["Bugs"])

# Metadata
assert sheet_by_name.name == sheet_by_idx.name == "Sheet2"
Expand Down Expand Up @@ -552,3 +552,30 @@ def test_sheet_with_decimal_numbers() -> None:
sheet2.to_polars(),
pl.DataFrame({"Decimals": ["28.14", "29.02"]}),
)


@pytest.mark.parametrize(
"header_row, skip_rows, expected",
[
(0, None, {"a": ["b"], "0": [1.0]}), # default
(None, 0, {"__UNNAMED__0": [None, None, "a", "b"], "__UNNAMED__1": [None, None, 0.0, 1.0]}),
(None, None, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
(0, 0, {"__UNNAMED__0": [None, "a", "b"], "__UNNAMED__1": [None, 0.0, 1.0]}),
(0, 1, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
(None, 2, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
(None, 3, {"__UNNAMED__0": ["b"], "__UNNAMED__1": [1.0]}),
(1, 0, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
(2, 0, {"a": ["b"], "0": [1.0]}),
(2, None, {"a": ["b"], "0": [1.0]}),
(2, 1, {"a": [], "0": []}),
],
)
def test_header_row_and_skip_rows(
header_row: int | None, skip_rows: int, expected: dict[str, Any]
) -> None:
pl_assert_frame_equal(
fastexcel.read_excel(path_for_fixture("no-header.xlsx"))
.load_sheet(0, header_row=header_row, skip_rows=skip_rows)
.to_polars(),
pl.DataFrame(expected),
)
59 changes: 44 additions & 15 deletions src/types/python/excelreader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ use arrow::{pyarrow::ToPyArrow, record_batch::RecordBatch};
use pyo3::{prelude::PyObject, pyclass, pymethods, Bound, IntoPy, PyAny, PyResult, Python};

use calamine::{
open_workbook_auto, open_workbook_auto_from_rs, Data, DataRef, Range, Reader, ReaderRef,
Sheet as CalamineSheet, Sheets, Table,
open_workbook_auto, open_workbook_auto_from_rs, Data, DataRef, HeaderRow, Range, Reader,
ReaderRef, Sheet as CalamineSheet, Sheets, Table,
};

use crate::{
Expand Down Expand Up @@ -73,6 +73,19 @@ impl ExcelSheets {
)
}

fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self {
match self {
Self::File(sheets) => {
sheets.with_header_row(header_row);
self
}
Self::Bytes(ref mut sheets) => {
sheets.with_header_row(header_row);
self
}
}
}

fn worksheet_range_ref(&mut self, name: &str) -> FastExcelResult<Range<DataRef>> {
match self {
ExcelSheets::File(Sheets::Xlsx(sheets)) => Ok(sheets.worksheet_range_ref(name)?),
Expand Down Expand Up @@ -164,7 +177,7 @@ impl ExcelReader {
sheet_meta: CalamineSheet,
header_row: Option<usize>,
column_names: Option<Vec<String>>,
skip_rows: usize,
skip_rows: Option<usize>,
n_rows: Option<usize>,
schema_sample_rows: Option<usize>,
dtype_coercion: DTypeCoercion,
Expand All @@ -173,14 +186,28 @@ impl ExcelReader {
eager: bool,
py: Python<'_>,
) -> PyResult<PyObject> {
let header = Header::new(header_row, column_names);
// calamine `header_row` is the first row of the range to be read.
// For us `header_row` can be `None` (meaning there is no header and we should start reading
// the data at the beginning)
let calamine_header_row = match (header_row, skip_rows) {
(None, None) | (Some(0), None) => HeaderRow::FirstNonEmptyRow,
(None, Some(_)) => HeaderRow::Row(0),
(Some(row), _) => HeaderRow::Row(row as u32),
};
// And our header row is simply the first row of the data if defined.
let data_header_row = header_row.and(Some(0));

let header = Header::new(data_header_row, column_names);
let selected_columns = Self::build_selected_columns(use_columns).into_pyresult()?;

if eager && self.sheets.supports_by_ref() {
let range = self
.sheets
.with_header_row(calamine_header_row)
.worksheet_range_ref(&sheet_meta.name)
.into_pyresult()?;
let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
let pagination =
Pagination::new(skip_rows.unwrap_or(0), n_rows, &range).into_pyresult()?;
Self::load_sheet_eager(
&range.into(),
pagination,
Expand All @@ -195,9 +222,11 @@ impl ExcelReader {
} else {
let range = self
.sheets
.with_header_row(calamine_header_row)
.worksheet_range(&sheet_meta.name)
.into_pyresult()?;
let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
let pagination =
Pagination::new(skip_rows.unwrap_or(0), n_rows, &range).into_pyresult()?;
let sheet = ExcelSheet::try_new(
sheet_meta,
range.into(),
Expand Down Expand Up @@ -298,7 +327,7 @@ impl ExcelReader {
*,
header_row = 0,
column_names = None,
skip_rows = 0,
skip_rows = None,
n_rows = None,
schema_sample_rows = 1_000,
dtype_coercion = DTypeCoercion::Coerce,
Expand All @@ -312,7 +341,7 @@ impl ExcelReader {
idx_or_name: &Bound<'_, PyAny>,
header_row: Option<usize>,
column_names: Option<Vec<String>>,
skip_rows: usize,
skip_rows: Option<usize>,
n_rows: Option<usize>,
schema_sample_rows: Option<usize>,
dtype_coercion: DTypeCoercion,
Expand All @@ -337,14 +366,14 @@ impl ExcelReader {
}
}
IdxOrName::Idx(idx) => self
.sheet_metadata .get(idx)
.sheet_metadata
.get(idx)
.ok_or_else(|| FastExcelErrorKind::SheetNotFound(IdxOrName::Idx(idx)).into())
.with_context(|| { format!(
"Sheet index {idx} is out of range. File has {} sheets.",
self.sheet_metadata.len()
)
})
,
.with_context(|| format!(
"Sheet index {idx} is out of range. File has {} sheets.",
self.sheet_metadata.len()
)
),
})
.into_pyresult()?.to_owned();

Expand Down

0 comments on commit 4105aed

Please sign in to comment.