feat: add an option to disable automatic type coercion (#248)

* fix: make test.py executable Signed-off-by: Luka Peschke <[email protected]> * feat: add an option to disable automatic type coercion closes #247 Signed-off-by: Luka Peschke <[email protected]> --------- Signed-off-by: Luka Peschke <[email protected]>
ToucanToco · Jul 4, 2024 · 12e8628 · 12e8628
1 parent f962672
commit 12e8628
Show file tree

Hide file tree

Showing 9 changed files with 210 additions and 13 deletions.
diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -128,6 +128,7 @@ def load_sheet(
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
+        dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
         dtypes: DTypeMap | None = None,
     ) -> ExcelSheet:
@@ -146,6 +147,11 @@ def load_sheet(
         :param schema_sample_rows: Specifies how many rows should be used to determine
                                    the dtype of a column.
                                    If `None`, all rows will be used.
+        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
+                               will try to coerce different dtypes in a column to the same one,
+                               whereas `strict` will raise an error in case a column contains
+                               several dtypes. Note that this only applies to columns whose dtype
+                               is guessed, i.e. not specified via `dtypes`.
         :param use_columns: Specifies the columns to use. Can either be:
                             - `None` to select all columns
                             - A list of strings and ints, the column names and/or indices
@@ -165,6 +171,7 @@ def load_sheet(
                 skip_rows=skip_rows,
                 n_rows=n_rows,
                 schema_sample_rows=schema_sample_rows,
+                dtype_coercion=dtype_coercion,
                 use_columns=use_columns,
                 dtypes=dtypes,
                 eager=False,
@@ -180,6 +187,7 @@ def load_sheet_eager(
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
+        dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | None = None,
         dtypes: DTypeMap | None = None,
     ) -> pa.RecordBatch:
@@ -197,6 +205,7 @@ def load_sheet_eager(
             skip_rows=skip_rows,
             n_rows=n_rows,
             schema_sample_rows=schema_sample_rows,
+            dtype_coercion=dtype_coercion,
             use_columns=use_columns,
             dtypes=dtypes,
             eager=True,
@@ -211,6 +220,7 @@ def load_sheet_by_name(
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
+        dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
         dtypes: DTypeMap | None = None,
     ) -> ExcelSheet:
@@ -225,6 +235,7 @@ def load_sheet_by_name(
             skip_rows=skip_rows,
             n_rows=n_rows,
             schema_sample_rows=schema_sample_rows,
+            dtype_coercion=dtype_coercion,
             use_columns=use_columns,
             dtypes=dtypes,
         )
@@ -238,6 +249,7 @@ def load_sheet_by_idx(
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
+        dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
         dtypes: DTypeMap | None = None,
     ) -> ExcelSheet:
@@ -252,6 +264,7 @@ def load_sheet_by_idx(
             skip_rows=skip_rows,
             n_rows=n_rows,
             schema_sample_rows=schema_sample_rows,
+            dtype_coercion=dtype_coercion,
             use_columns=use_columns,
             dtypes=dtypes,
         )

diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -72,6 +72,7 @@ class _ExcelReader:
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
+        dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
         dtypes: DTypeMap | None = None,
         eager: Literal[False] = ...,
@@ -86,6 +87,7 @@ class _ExcelReader:
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
+        dtype_coercion: Literal["coerce", "strict"] = "coerce",
         use_columns: list[str] | list[int] | str | None = None,
         dtypes: DTypeMap | None = None,
         eager: Literal[True] = ...,

diff --git a/python/tests/test_dtypes.py b/python/tests/test_dtypes.py
@@ -1,9 +1,10 @@
 from __future__ import annotations
 
 from datetime import date, datetime
-from typing import Any
+from typing import Any, Literal
 
 import fastexcel
+import numpy as np
 import pandas as pd
 import polars as pl
 import pytest
@@ -190,3 +191,66 @@ def test_sheet_datetime_conversion(
     pl_df = sheet.to_polars()
     assert pl_df["Date"].dtype == expected_pl_dtype
     assert pl_df["Date"].to_list() == [expected] * 9
+
+
+@pytest.mark.parametrize("eager", [True, False])
+@pytest.mark.parametrize("dtype_coercion", ["coerce", None])
+def test_dtype_coercion_behavior__coerce(
+    dtype_coercion: Literal["coerce"] | None, eager: bool
+) -> None:
+    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
+
+    kwargs = {"dtype_coercion": dtype_coercion} if dtype_coercion else {}
+    sheet = (
+        excel_reader.load_sheet_eager(0, **kwargs)  # type:ignore[arg-type]
+        if eager
+        else excel_reader.load_sheet(0, **kwargs).to_arrow()  # type:ignore[arg-type]
+    )
+
+    pd_df = sheet.to_pandas()
+    assert pd_df["Mixed dates"].dtype == "object"
+    assert pd_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3
+
+    pl_df = pl.from_arrow(data=sheet)
+    assert isinstance(pl_df, pl.DataFrame)
+    assert pl_df["Mixed dates"].dtype == pl.Utf8
+    assert pl_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3
+
+
+@pytest.mark.parametrize("eager", [True, False])
+def test_dtype_coercion_behavior__strict_sampling_eveything(eager: bool) -> None:
+    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
+
+    with pytest.raises(
+        fastexcel.UnsupportedColumnTypeCombinationError, match="type coercion is strict"
+    ):
+        if eager:
+            excel_reader.load_sheet_eager(0, dtype_coercion="strict")
+        else:
+            excel_reader.load_sheet(0, dtype_coercion="strict").to_arrow()
+
+
+@pytest.mark.parametrize("eager", [True, False])
+def test_dtype_coercion_behavior__strict_sampling_limit(eager: bool) -> None:
+    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
+
+    sheet = (
+        excel_reader.load_sheet_eager(0, dtype_coercion="strict", schema_sample_rows=5)
+        if eager
+        else excel_reader.load_sheet(0, dtype_coercion="strict", schema_sample_rows=5).to_arrow()
+    )
+
+    pd_df = sheet.to_pandas()
+    assert pd_df["Mixed dates"].dtype == "datetime64[ms]"
+    assert (
+        pd_df["Mixed dates"].to_list() == [pd.Timestamp("2023-07-21 00:00:00")] * 6 + [pd.NaT] * 3
+    )
+    assert pd_df["Asset ID"].dtype == "float64"
+    assert pd_df["Asset ID"].replace(np.nan, None).to_list() == [84444.0] * 7 + [None] * 2
+
+    pl_df = pl.from_arrow(data=sheet)
+    assert isinstance(pl_df, pl.DataFrame)
+    assert pl_df["Mixed dates"].dtype == pl.Datetime
+    assert pl_df["Mixed dates"].to_list() == [datetime(2023, 7, 21)] * 6 + [None] * 3
+    assert pl_df["Asset ID"].dtype == pl.Float64
+    assert pl_df["Asset ID"].to_list() == [84444.0] * 7 + [None] * 2
diff --git a/src/types/dtype.rs b/src/types/dtype.rs
@@ -101,6 +101,41 @@ impl From<&DType> for ArrowDataType {
     }
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Copy)]
+pub(crate) enum DTypeCoercion {
+    Coerce,
+    Strict,
+}
+
+impl FromStr for DTypeCoercion {
+    type Err = FastExcelError;
+
+    fn from_str(raw_dtype_coercion: &str) -> FastExcelResult<Self> {
+        match raw_dtype_coercion {
+            "coerce" => Ok(Self::Coerce),
+            "strict" => Ok(Self::Strict),
+            _ => Err(FastExcelErrorKind::InvalidParameters(format!(
+                "unsupported dtype_coercion: \"{raw_dtype_coercion}\""
+            ))
+            .into()),
+        }
+    }
+}
+
+impl FromPyObject<'_> for DTypeCoercion {
+    fn extract_bound(py_dtype_coercion: &Bound<'_, PyAny>) -> PyResult<Self> {
+        if let Ok(dtype_coercion_pystr) = py_dtype_coercion.extract::<&PyString>() {
+            dtype_coercion_pystr.to_str()?.parse()
+        } else {
+            Err(FastExcelErrorKind::InvalidParameters(format!(
+                "{py_dtype_coercion:?} cannot be converted to str"
+            ))
+            .into())
+        }
+        .into_pyresult()
+    }
+}
+
 /// All the possible string values that should be considered as NULL
 const NULL_STRING_VALUES: [&str; 19] = [
     "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN",
@@ -203,6 +238,7 @@ pub(crate) fn get_dtype_for_column<DT: CellType + Debug + DataType>(
     start_row: usize,
     end_row: usize,
     col: usize,
+    dtype_coercion: &DTypeCoercion,
 ) -> FastExcelResult<DType> {
     let mut column_types = (start_row..end_row)
         .map(|row| get_cell_dtype(data, row, col))
@@ -214,6 +250,14 @@ pub(crate) fn get_dtype_for_column<DT: CellType + Debug + DataType>(
     if column_types.is_empty() {
         // If no type apart from NULL was found, it's a NULL column
         Ok(DType::Null)
+    } else if matches!(dtype_coercion, &DTypeCoercion::Strict) && column_types.len() != 1 {
+        // If dtype coercion is strict and we do not have a single dtype, it's an error
+        Err(
+            FastExcelErrorKind::UnsupportedColumnTypeCombination(format!(
+                "type coercion is strict and column contains {column_types:?}"
+            ))
+            .into(),
+        )
     } else if column_types.len() == 1 {
         // If a single non-null type was found, return it
         Ok(column_types.into_iter().next().unwrap())
@@ -288,15 +332,65 @@ mod tests {
     #[case(7, 11, DType::Float)]
     // int + bool
     #[case(10, 12, DType::Int)]
-    fn get_arrow_column_type_multi_dtype_ok(
+    fn get_arrow_column_type_multi_dtype_ok_coerce(
         range: Range<CalData>,
         #[case] start_row: usize,
         #[case] end_row: usize,
         #[case] expected: DType,
     ) {
         assert_eq!(
-            get_dtype_for_column(&range, start_row, end_row, 0).unwrap(),
+            get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Coerce).unwrap(),
             expected
         );
     }
+
+    #[rstest]
+    // pure bool
+    #[case(0, 2, DType::Bool)]
+    // pure int
+    #[case(3, 4, DType::Int)]
+    // pure float
+    #[case(4, 5, DType::Float)]
+    // pure string
+    #[case(5, 6, DType::String)]
+    // empty + null + int
+    #[case(6, 9, DType::Int)]
+    fn get_arrow_column_type_multi_dtype_ok_strict(
+        range: Range<CalData>,
+        #[case] start_row: usize,
+        #[case] end_row: usize,
+        #[case] expected: DType,
+    ) {
+        assert_eq!(
+            get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Strict).unwrap(),
+            expected
+        );
+    }
+
+    #[rstest]
+    // pure int + float
+    #[case(3, 5)]
+    // float + string
+    #[case(4, 6)]
+    // int + float + string
+    #[case(3, 6)]
+    // null + int + float + string + empty + null
+    #[case(2, 8)]
+    // int + float + null
+    #[case(7, 10)]
+    // int + float + bool + null
+    #[case(7, 11)]
+    // int + bool
+    #[case(10, 12)]
+    fn get_arrow_column_type_multi_dtype_ko_strict(
+        range: Range<CalData>,
+        #[case] start_row: usize,
+        #[case] end_row: usize,
+    ) {
+        let result = get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Strict);
+        assert!(matches!(
+            result.unwrap_err().kind,
+            FastExcelErrorKind::UnsupportedColumnTypeCombination(_)
+        ));
+    }
 }
diff --git a/src/types/python/excelreader.rs b/src/types/python/excelreader.rs
@@ -17,7 +17,10 @@ use crate::{
     error::{
         py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
     },
-    types::{dtype::DTypeMap, idx_or_name::IdxOrName},
+    types::{
+        dtype::{DTypeCoercion, DTypeMap},
+        idx_or_name::IdxOrName,
+    },
 };
 
 use crate::utils::schema::get_schema_sample_rows;
@@ -108,6 +111,7 @@ impl ExcelReader {
         sample_rows: Option<usize>,
         selected_columns: &SelectedColumns,
         dtypes: Option<&DTypeMap>,
+        dtype_coercion: &DTypeCoercion,
     ) -> FastExcelResult<RecordBatch> {
         let offset = header.offset() + pagination.offset();
         let limit = {
@@ -129,6 +133,7 @@ impl ExcelReader {
             offset,
             sample_rows_limit,
             dtypes,
+            dtype_coercion,
         )?;
 
         let fields = available_columns
@@ -150,6 +155,7 @@ impl ExcelReader {
         skip_rows: usize,
         n_rows: Option<usize>,
         schema_sample_rows: Option<usize>,
+        dtype_coercion: DTypeCoercion,
         use_columns: Option<&Bound<'_, PyAny>>,
         dtypes: Option<DTypeMap>,
         eager: bool,
@@ -167,6 +173,7 @@ impl ExcelReader {
                 schema_sample_rows,
                 &selected_columns,
                 dtypes.as_ref(),
+                &dtype_coercion,
             )
             .into_pyresult()
             .and_then(|rb| rb.to_pyarrow(py))
@@ -179,6 +186,7 @@ impl ExcelReader {
                 header,
                 pagination,
                 schema_sample_rows,
+                dtype_coercion,
                 selected_columns,
                 dtypes,
             )
@@ -224,6 +232,7 @@ impl ExcelReader {
         skip_rows = 0,
         n_rows = None,
         schema_sample_rows = 1_000,
+        dtype_coercion = DTypeCoercion::Coerce,
         use_columns = None,
         dtypes = None,
         eager = false,
@@ -237,6 +246,7 @@ impl ExcelReader {
         skip_rows: usize,
         n_rows: Option<usize>,
         schema_sample_rows: Option<usize>,
+        dtype_coercion: DTypeCoercion,
         use_columns: Option<&Bound<'_, PyAny>>,
         dtypes: Option<DTypeMap>,
         eager: bool,
@@ -278,6 +288,7 @@ impl ExcelReader {
             skip_rows,
             n_rows,
             schema_sample_rows,
+            dtype_coercion,
             use_columns,
             dtypes,
             eager,