Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python)!: Properly apply strict parameter in Series constructor #16939

Merged
merged 10 commits into from
Jun 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions crates/polars-core/src/datatypes/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ pub enum DataType {
Int64,
Float32,
Float64,
#[cfg(feature = "dtype-decimal")]
/// Fixed point decimal type optional precision and non-negative scale.
/// This is backed by a signed 128-bit integer which allows for up to 38 significant digits.
#[cfg(feature = "dtype-decimal")]
Decimal(Option<usize>, Option<usize>), // precision/scale; scale being None means "infer"
/// String data
String,
Expand All @@ -76,14 +76,14 @@ pub enum DataType {
Array(Box<DataType>, usize),
/// A nested list with a variable size in each row
List(Box<DataType>),
#[cfg(feature = "object")]
/// A generic type that can be used in a `Series`
/// &'static str can be used to determine/set inner type
#[cfg(feature = "object")]
Object(&'static str, Option<Arc<ObjectRegistry>>),
Null,
#[cfg(feature = "dtype-categorical")]
// The RevMapping has the internal state.
// This is ignored with comparisons, hashing etc.
#[cfg(feature = "dtype-categorical")]
Categorical(Option<Arc<RevMapping>>, CategoricalOrdering),
#[cfg(feature = "dtype-categorical")]
Enum(Option<Arc<RevMapping>>, CategoricalOrdering),
Expand Down Expand Up @@ -140,6 +140,7 @@ impl PartialEq for DataType {
(UnknownKind::Int(_), UnknownKind::Int(_)) => true,
_ => l == r,
},
// TODO: Add Decimal equality
_ => std::mem::discriminant(self) == std::mem::discriminant(other),
}
}
Expand Down
9 changes: 9 additions & 0 deletions crates/polars-core/src/series/any_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,17 @@ impl Series {
let dtype = if strict {
get_first_non_null_dtype(values)
} else {
// Currently does not work correctly for Decimal because equality is not implemented.
any_values_to_supertype(values)?
};

// TODO: Remove this when Decimal data type equality is implemented.
#[cfg(feature = "dtype-decimal")]
if !strict && dtype.is_decimal() {
let dtype = DataType::Decimal(None, None);
return Self::from_any_values_and_dtype(name, values, &dtype, strict);
}

Self::from_any_values_and_dtype(name, values, &dtype, strict)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
datetime(2025, 12, 4),
datetime(2025, 12, 5),
],
"d": [1, 2.0, float("nan"), -42, None],
"d": [1.0, 2.0, float("nan"), -42.0, None],
}
)
# --8<-- [end:setup]
Expand Down
2 changes: 1 addition & 1 deletion docs/src/python/user-guide/getting-started/joins.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
{
"a": range(8),
"b": np.random.rand(8),
"d": [1, 2.0, float("nan"), float("nan"), 0, -5, -42, None],
"d": [1.0, 2.0, float("nan"), float("nan"), 0.0, -5.0, -42.0, None],
}
)

Expand Down
47 changes: 10 additions & 37 deletions py-polars/polars/_utils/construction/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import contextlib
from datetime import date, datetime, time, timedelta
from decimal import Decimal as PyDecimal
from itertools import islice
from typing import (
TYPE_CHECKING,
Expand All @@ -27,8 +26,6 @@
)
from polars._utils.wrap import wrap_s
from polars.datatypes import (
INTEGER_DTYPES,
TEMPORAL_DTYPES,
Array,
Boolean,
Categorical,
Expand Down Expand Up @@ -293,44 +290,20 @@ def _construct_series_with_fallbacks(
constructor: Callable[[str, Sequence[Any], bool], PySeries],
name: str,
values: Sequence[Any],
target_dtype: PolarsDataType | None,
dtype: PolarsDataType | None,
*,
strict: bool,
) -> PySeries:
"""Construct Series, with fallbacks for basic type mismatch (eg: bool/int)."""
while True:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

auch

try:
return constructor(name, values, strict)
except TypeError as exc:
str_exc = str(exc)

# from x to float
# error message can be:
# - integers: "'float' object cannot be interpreted as an integer"
if "'float'" in str_exc and (
# we do not accept float values as int/temporal, as it causes silent
# information loss; the caller should explicitly cast in this case.
target_dtype not in (INTEGER_DTYPES | TEMPORAL_DTYPES)
):
constructor = py_type_to_constructor(float)

# from x to string
# error message can be:
# - integers: "'str' object cannot be interpreted as an integer"
# - floats: "must be real number, not str"
elif "'str'" in str_exc or str_exc == "must be real number, not str":
constructor = py_type_to_constructor(str)

# from x to int
# error message can be:
# - bools: "'int' object cannot be converted to 'PyBool'"
elif str_exc == "'int' object cannot be converted to 'PyBool'":
constructor = py_type_to_constructor(int)

elif "decimal.Decimal" in str_exc:
constructor = py_type_to_constructor(PyDecimal)
else:
raise
try:
return constructor(name, values, strict)
except TypeError:
if dtype is None:
return PySeries.new_from_any_values(name, values, strict=strict)
else:
return PySeries.new_from_any_values_and_dtype(
name, values, dtype, strict=strict
)


def iterable_to_pyseries(
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9487,7 +9487,7 @@ def fold(self, operation: Callable[[Series, Series], Series]) -> Series:

>>> df = pl.DataFrame(
... {
... "a": ["foo", "bar", 2],
... "a": ["foo", "bar", None],
... "b": [1, 2, 3],
... "c": [1.0, 2.0, 3.0],
... }
Expand Down
8 changes: 4 additions & 4 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2964,7 +2964,7 @@ def max(self) -> Self:

Examples
--------
>>> df = pl.DataFrame({"a": [-1, float("nan"), 1]})
>>> df = pl.DataFrame({"a": [-1.0, float("nan"), 1.0]})
>>> df.select(pl.col("a").max())
shape: (1, 1)
┌─────┐
Expand All @@ -2983,7 +2983,7 @@ def min(self) -> Self:

Examples
--------
>>> df = pl.DataFrame({"a": [-1, float("nan"), 1]})
>>> df = pl.DataFrame({"a": [-1.0, float("nan"), 1.0]})
>>> df.select(pl.col("a").min())
shape: (1, 1)
┌──────┐
Expand All @@ -3005,7 +3005,7 @@ def nan_max(self) -> Self:

Examples
--------
>>> df = pl.DataFrame({"a": [0, float("nan")]})
>>> df = pl.DataFrame({"a": [0.0, float("nan")]})
>>> df.select(pl.col("a").nan_max())
shape: (1, 1)
┌─────┐
Expand All @@ -3027,7 +3027,7 @@ def nan_min(self) -> Self:

Examples
--------
>>> df = pl.DataFrame({"a": [0, float("nan")]})
>>> df = pl.DataFrame({"a": [0.0, float("nan")]})
>>> df.select(pl.col("a").nan_min())
shape: (1, 1)
┌─────┐
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,7 +777,7 @@ def _read_spreadsheet_openpyxl(
# the non-strings will become null, so we handle the cast here
values = [str(v) if (v is not None) else v for v in values]

s = pl.Series(name, values, dtype=dtype)
s = pl.Series(name, values, dtype=dtype, strict=False)
series_data.append(s)

df = pl.DataFrame(
Expand Down
6 changes: 3 additions & 3 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1958,7 +1958,7 @@ def nan_max(self) -> int | float | date | datetime | timedelta | str:
>>> s.nan_max()
4

>>> s = pl.Series("a", [1, float("nan"), 4])
>>> s = pl.Series("a", [1.0, float("nan"), 4.0])
>>> s.nan_max()
nan
"""
Expand All @@ -1977,7 +1977,7 @@ def nan_min(self) -> int | float | date | datetime | timedelta | str:
>>> s.nan_min()
1

>>> s = pl.Series("a", [1, float("nan"), 4])
>>> s = pl.Series("a", [1.0, float("nan"), 4.0])
>>> s.nan_min()
nan
"""
Expand Down Expand Up @@ -4730,7 +4730,7 @@ def fill_nan(self, value: int | float | Expr | None) -> Series:

Examples
--------
>>> s = pl.Series("a", [1, 2, 3, float("nan")])
>>> s = pl.Series("a", [1.0, 2.0, 3.0, float("nan")])
>>> s.fill_nan(0)
shape: (4,)
Series: 'a' [f64]
Expand Down
50 changes: 19 additions & 31 deletions py-polars/src/series/construction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ impl PySeries {
#[pymethods]
impl PySeries {
#[staticmethod]
fn new_opt_bool(name: &str, values: &Bound<PyAny>, strict: bool) -> PyResult<Self> {
fn new_opt_bool(name: &str, values: &Bound<PyAny>, _strict: bool) -> PyResult<Self> {
let len = values.len()?;
let mut builder = BooleanChunkedBuilder::new(name, len);

Expand All @@ -107,25 +107,18 @@ impl PySeries {
if value.is_none() {
builder.append_null()
} else {
match value.extract::<bool>() {
Ok(v) => builder.append_value(v),
Err(e) => {
if strict {
return Err(e);
}
builder.append_null()
},
}
let v = value.extract::<bool>()?;
builder.append_value(v)
}
}
let ca = builder.finish();

let ca = builder.finish();
let s = ca.into_series();
Ok(s.into())
}
}

fn new_primitive<'a, T>(name: &str, values: &'a Bound<PyAny>, strict: bool) -> PyResult<PySeries>
fn new_primitive<'a, T>(name: &str, values: &'a Bound<PyAny>, _strict: bool) -> PyResult<PySeries>
where
T: PolarsNumericType,
ChunkedArray<T>: IntoSeries,
Expand All @@ -139,19 +132,12 @@ where
if value.is_none() {
builder.append_null()
} else {
match value.extract::<T::Native>() {
Ok(v) => builder.append_value(v),
Err(e) => {
if strict {
return Err(e);
}
builder.append_null()
},
}
let v = value.extract::<T::Native>()?;
builder.append_value(v)
}
}
let ca = builder.finish();

let ca = builder.finish();
let s = ca.into_series();
Ok(s.into())
}
Expand Down Expand Up @@ -243,9 +229,11 @@ impl PySeries {

for res in values.iter()? {
let value = res?;
match value.extract::<Cow<str>>() {
Ok(v) => builder.append_value(v),
Err(_) => builder.append_null(),
if value.is_none() {
builder.append_null()
} else {
let v = value.extract::<Cow<str>>()?;
builder.append_value(v)
}
}

Expand All @@ -261,9 +249,11 @@ impl PySeries {

for res in values.iter()? {
let value = res?;
match value.extract::<&[u8]>() {
Ok(v) => builder.append_value(v),
Err(_) => builder.append_null(),
if value.is_none() {
builder.append_null()
} else {
let v = value.extract::<&[u8]>()?;
builder.append_value(v)
}
}

Expand All @@ -274,9 +264,7 @@ impl PySeries {

#[staticmethod]
fn new_decimal(name: &str, values: &Bound<PyAny>, strict: bool) -> PyResult<Self> {
// Create a fake dtype with a placeholder "none" scale, to be inferred later.
let dtype = DataType::Decimal(None, None);
Self::new_from_any_values_and_dtype(name, values, Wrap(dtype), strict)
Self::new_from_any_values(name, values, strict)
}

#[staticmethod]
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/constructors/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,7 +1113,7 @@ def test_from_dicts_list_struct_without_inner_dtype_5611() -> None:


def test_from_dict_upcast_primitive() -> None:
df = pl.from_dict({"a": [1, 2.1, 3], "b": [4, 5, 6.4]})
df = pl.from_dict({"a": [1, 2.1, 3], "b": [4, 5, 6.4]}, strict=False)
assert df.dtypes == [pl.Float64, pl.Float64]


Expand Down
5 changes: 1 addition & 4 deletions py-polars/tests/unit/constructors/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,7 @@ def test_df_init_strict() -> None:

df = pl.DataFrame(data, schema=schema, strict=False)

# TODO: This should result in a Float Series without nulls
# https://github.com/pola-rs/polars/issues/14427
assert df["a"].to_list() == [1, 2, None]

assert df["a"].to_list() == [1, 2, 3]
assert df["a"].dtype == pl.Int8


Expand Down
5 changes: 4 additions & 1 deletion py-polars/tests/unit/constructors/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,10 @@ def test_sequence_of_series_with_dtype(dtype: pl.PolarsDataType | None) -> None:
def test_upcast_primitive_and_strings(
values: list[Any], dtype: pl.PolarsDataType, expected_dtype: pl.PolarsDataType
) -> None:
assert pl.Series(values, dtype=dtype).dtype == expected_dtype
with pytest.raises(TypeError):
pl.Series(values, dtype=dtype, strict=True)

assert pl.Series(values, dtype=dtype, strict=False).dtype == expected_dtype


def test_preserve_decimal_precision() -> None:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/dataframe/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def test_df_getitem_col_mixed_inputs(input: list[Any], match: str) -> None:
@pytest.mark.parametrize(
("input", "match"),
[
([0.0, 1.0], "'float' object cannot be interpreted as an integer"),
([0.0, 1.0], "unexpected value while building Series of type Int64"),
(
pl.Series([[1, 2], [3, 4]]),
"cannot treat Series of type List\\(Int64\\) as indices",
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/dataframe/test_serde.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_df_serde_enum() -> None:
[
([[1, 2, 3], [None, None, None], [1, None, 3]], pl.Array(pl.Int32(), shape=3)),
([["a", "b"], [None, None]], pl.Array(pl.Utf8, shape=2)),
([[True, False, None], [None, None, None]], pl.Array(pl.Utf8, shape=3)),
([[True, False, None], [None, None, None]], pl.Array(pl.Boolean, shape=3)),
(
[[[1, 2, 3], [4, None, 5]], None, [[None, None, 2]]],
pl.List(pl.Array(pl.Int32(), shape=3)),
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/datatypes/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def permutations_int_dec_none() -> list[tuple[D | int | None, ...]]:
D("-0.01"),
D("1.2345678"),
D("500"),
# -1, # TODO: Address in https://github.com/pola-rs/polars/issues/14427
-1,
None,
]
)
Expand Down
Loading