fix: header_row is now the exact index of the sheet (#297)

ToucanToco · Oct 14, 2024 · 4105aed · 4105aed
1 parent 8c9559f
commit 4105aed
Show file tree

Hide file tree

Showing 6 changed files with 85 additions and 26 deletions.
diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -208,7 +208,7 @@ def load_sheet(
         *,
         header_row: int | None = 0,
         column_names: list[str] | None = None,
-        skip_rows: int = 0,
+        skip_rows: int | None = None,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
@@ -227,8 +227,11 @@ def load_sheet(
                        If `None`, all rows are loaded
         :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
                           Any rows before the `header_row` are automatically skipped.
-                          If `header_row` is `None`, it skips the number of rows from the
-                          start of the sheet.
+                          If `header_row` is `None`:
+                            - if `skip_rows` is `None` (default): it skips all empty rows
+                            at the beginning of the sheet.
+                            - if `skip_rows` is a number, it skips the specified number
+                            of rows from the start of the sheet.
         :param schema_sample_rows: Specifies how many rows should be used to determine
                                    the dtype of a column.
                                    If `None`, all rows will be used.
@@ -372,7 +375,7 @@ def load_sheet_eager(
         *,
         header_row: int | None = 0,
         column_names: list[str] | None = None,
-        skip_rows: int = 0,
+        skip_rows: int | None = None,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
@@ -405,7 +408,7 @@ def load_sheet_by_name(
         *,
         header_row: int | None = 0,
         column_names: list[str] | None = None,
-        skip_rows: int = 0,
+        skip_rows: int | None = None,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
@@ -434,7 +437,7 @@ def load_sheet_by_idx(
         *,
         header_row: int | None = 0,
         column_names: list[str] | None = None,
-        skip_rows: int = 0,
+        skip_rows: int | None = None,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",

diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -104,7 +104,7 @@ class _ExcelReader:
         *,
         header_row: int | None = 0,
         column_names: list[str] | None = None,
-        skip_rows: int = 0,
+        skip_rows: int | None = None,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",
@@ -119,7 +119,7 @@ class _ExcelReader:
         *,
         header_row: int | None = 0,
         column_names: list[str] | None = None,
-        skip_rows: int = 0,
+        skip_rows: int | None = None,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         dtype_coercion: Literal["coerce", "strict"] = "coerce",

diff --git a/python/tests/fixtures/no-header.xlsx b/python/tests/fixtures/no-header.xlsx
diff --git a/python/tests/test_durations.py b/python/tests/test_durations.py
@@ -65,7 +65,7 @@ def test_sheet_with_different_time_types() -> None:
 
 def test_sheet_with_offset_header_row_and_durations() -> None:
     excel_reader = fastexcel.read_excel(path_for_fixture("single-sheet-skip-rows-durations.xlsx"))
-    sheet = excel_reader.load_sheet(0, header_row=9)
+    sheet = excel_reader.load_sheet(0, header_row=10)
 
     pd_df = sheet.to_pandas()
     pl_df = sheet.to_polars()

diff --git a/python/tests/test_fastexcel.py b/python/tests/test_fastexcel.py
@@ -238,8 +238,8 @@ def test_sheets_with_custom_headers():
 def test_sheets_with_skipping_headers():
     excel_reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
     assert excel_reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]
-    sheet_by_name = excel_reader.load_sheet("Sheet2", header_row=1, column_names=["Bugs"])
-    sheet_by_idx = excel_reader.load_sheet(1, header_row=1, column_names=["Bugs"])
+    sheet_by_name = excel_reader.load_sheet("Sheet2", header_row=None, column_names=["Bugs"])
+    sheet_by_idx = excel_reader.load_sheet(1, header_row=None, column_names=["Bugs"])
 
     # Metadata
     assert sheet_by_name.name == sheet_by_idx.name == "Sheet2"
@@ -552,3 +552,30 @@ def test_sheet_with_decimal_numbers() -> None:
         sheet2.to_polars(),
         pl.DataFrame({"Decimals": ["28.14", "29.02"]}),
     )
+
+
+@pytest.mark.parametrize(
+    "header_row, skip_rows, expected",
+    [
+        (0, None, {"a": ["b"], "0": [1.0]}),  # default
+        (None, 0, {"__UNNAMED__0": [None, None, "a", "b"], "__UNNAMED__1": [None, None, 0.0, 1.0]}),
+        (None, None, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
+        (0, 0, {"__UNNAMED__0": [None, "a", "b"], "__UNNAMED__1": [None, 0.0, 1.0]}),
+        (0, 1, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
+        (None, 2, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
+        (None, 3, {"__UNNAMED__0": ["b"], "__UNNAMED__1": [1.0]}),
+        (1, 0, {"__UNNAMED__0": ["a", "b"], "__UNNAMED__1": [0.0, 1.0]}),
+        (2, 0, {"a": ["b"], "0": [1.0]}),
+        (2, None, {"a": ["b"], "0": [1.0]}),
+        (2, 1, {"a": [], "0": []}),
+    ],
+)
+def test_header_row_and_skip_rows(
+    header_row: int | None, skip_rows: int, expected: dict[str, Any]
+) -> None:
+    pl_assert_frame_equal(
+        fastexcel.read_excel(path_for_fixture("no-header.xlsx"))
+        .load_sheet(0, header_row=header_row, skip_rows=skip_rows)
+        .to_polars(),
+        pl.DataFrame(expected),
+    )
diff --git a/src/types/python/excelreader.rs b/src/types/python/excelreader.rs
@@ -7,8 +7,8 @@ use arrow::{pyarrow::ToPyArrow, record_batch::RecordBatch};
 use pyo3::{prelude::PyObject, pyclass, pymethods, Bound, IntoPy, PyAny, PyResult, Python};
 
 use calamine::{
-    open_workbook_auto, open_workbook_auto_from_rs, Data, DataRef, Range, Reader, ReaderRef,
-    Sheet as CalamineSheet, Sheets, Table,
+    open_workbook_auto, open_workbook_auto_from_rs, Data, DataRef, HeaderRow, Range, Reader,
+    ReaderRef, Sheet as CalamineSheet, Sheets, Table,
 };
 
 use crate::{
@@ -73,6 +73,19 @@ impl ExcelSheets {
         )
     }
 
+    fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self {
+        match self {
+            Self::File(sheets) => {
+                sheets.with_header_row(header_row);
+                self
+            }
+            Self::Bytes(ref mut sheets) => {
+                sheets.with_header_row(header_row);
+                self
+            }
+        }
+    }
+
     fn worksheet_range_ref(&mut self, name: &str) -> FastExcelResult<Range<DataRef>> {
         match self {
             ExcelSheets::File(Sheets::Xlsx(sheets)) => Ok(sheets.worksheet_range_ref(name)?),
@@ -164,7 +177,7 @@ impl ExcelReader {
         sheet_meta: CalamineSheet,
         header_row: Option<usize>,
         column_names: Option<Vec<String>>,
-        skip_rows: usize,
+        skip_rows: Option<usize>,
         n_rows: Option<usize>,
         schema_sample_rows: Option<usize>,
         dtype_coercion: DTypeCoercion,
@@ -173,14 +186,28 @@ impl ExcelReader {
         eager: bool,
         py: Python<'_>,
     ) -> PyResult<PyObject> {
-        let header = Header::new(header_row, column_names);
+        // calamine `header_row` is the first row of the range to be read.
+        // For us `header_row` can be `None` (meaning there is no header and we should start reading
+        // the data at the beginning)
+        let calamine_header_row = match (header_row, skip_rows) {
+            (None, None) | (Some(0), None) => HeaderRow::FirstNonEmptyRow,
+            (None, Some(_)) => HeaderRow::Row(0),
+            (Some(row), _) => HeaderRow::Row(row as u32),
+        };
+        // And our header row is simply the first row of the data if defined.
+        let data_header_row = header_row.and(Some(0));
+
+        let header = Header::new(data_header_row, column_names);
         let selected_columns = Self::build_selected_columns(use_columns).into_pyresult()?;
+
         if eager && self.sheets.supports_by_ref() {
             let range = self
                 .sheets
+                .with_header_row(calamine_header_row)
                 .worksheet_range_ref(&sheet_meta.name)
                 .into_pyresult()?;
-            let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
+            let pagination =
+                Pagination::new(skip_rows.unwrap_or(0), n_rows, &range).into_pyresult()?;
             Self::load_sheet_eager(
                 &range.into(),
                 pagination,
@@ -195,9 +222,11 @@ impl ExcelReader {
         } else {
             let range = self
                 .sheets
+                .with_header_row(calamine_header_row)
                 .worksheet_range(&sheet_meta.name)
                 .into_pyresult()?;
-            let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
+            let pagination =
+                Pagination::new(skip_rows.unwrap_or(0), n_rows, &range).into_pyresult()?;
             let sheet = ExcelSheet::try_new(
                 sheet_meta,
                 range.into(),
@@ -298,7 +327,7 @@ impl ExcelReader {
         *,
         header_row = 0,
         column_names = None,
-        skip_rows = 0,
+        skip_rows = None,
         n_rows = None,
         schema_sample_rows = 1_000,
         dtype_coercion = DTypeCoercion::Coerce,
@@ -312,7 +341,7 @@ impl ExcelReader {
         idx_or_name: &Bound<'_, PyAny>,
         header_row: Option<usize>,
         column_names: Option<Vec<String>>,
-        skip_rows: usize,
+        skip_rows: Option<usize>,
         n_rows: Option<usize>,
         schema_sample_rows: Option<usize>,
         dtype_coercion: DTypeCoercion,
@@ -337,14 +366,14 @@ impl ExcelReader {
                     }
                 }
                 IdxOrName::Idx(idx) => self
-                    .sheet_metadata                    .get(idx)
+                    .sheet_metadata
+                    .get(idx)
                     .ok_or_else(|| FastExcelErrorKind::SheetNotFound(IdxOrName::Idx(idx)).into())
-                    .with_context(|| {                         format!(
-                            "Sheet index {idx} is out of range. File has {} sheets.",
-                            self.sheet_metadata.len()
-                        )
-                    })
-                    ,
+                    .with_context(|| format!(
+                        "Sheet index {idx} is out of range. File has {} sheets.",
+                        self.sheet_metadata.len()
+                    )
+                ),
             })
             .into_pyresult()?.to_owned();