From f61af0f8010f443f156778822711e4a7b6bf3d53 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 12 Jul 2024 15:54:17 +0200 Subject: [PATCH] refactor: Fix struct outer validity;fmt;is_in;cast;cmp (#17590) --- .../src/chunked_array/comparison/mod.rs | 5 +- .../src/chunked_array/iterator/mod.rs | 55 ------------ .../src/chunked_array/ops/any_value.rs | 30 +++++++ .../src/chunked_array/ops/downcast.rs | 42 +++++++++- .../src/chunked_array/struct_/mod.rs | 45 +++------- .../src/chunked_array/strings/json_path.rs | 3 +- crates/polars-ops/src/series/ops/cut.rs | 2 +- .../polars/_utils/construction/series.py | 17 +++- py-polars/polars/dataframe/frame.py | 2 +- py-polars/polars/expr/expr.py | 49 ++++++----- py-polars/polars/expr/string.py | 18 ++-- py-polars/polars/series/string.py | 2 +- py-polars/src/conversion/mod.rs | 5 -- py-polars/src/dataframe/general.rs | 20 ++++- py-polars/src/map/series.rs | 84 ++++++------------- py-polars/tests/unit/datatypes/test_struct.py | 15 +++- .../unit/operations/map/test_map_elements.py | 1 - py-polars/tests/unit/operations/test_is_in.py | 24 ++---- 18 files changed, 197 insertions(+), 222 deletions(-) diff --git a/crates/polars-core/src/chunked_array/comparison/mod.rs b/crates/polars-core/src/chunked_array/comparison/mod.rs index 7c2cc7c0fe4d..fdbb4ab58a9e 100644 --- a/crates/polars-core/src/chunked_array/comparison/mod.rs +++ b/crates/polars-core/src/chunked_array/comparison/mod.rs @@ -656,6 +656,7 @@ where R: Fn(BooleanChunked, BooleanChunked) -> BooleanChunked, { if a.len() != b.len() || a.struct_fields().len() != b.struct_fields().len() { + // polars_ensure!(a.len() == 1 || b.len() == 1, ShapeMismatch: "length lhs: {}, length rhs: {}", a.len(), b.len()); BooleanChunked::full("", value, a.len()) } else { let (a, b) = align_chunks_binary(a, b); @@ -707,7 +708,7 @@ impl ChunkCompare<&StructChunked2> for StructChunked2 { self, rhs, |l, r| l.not_equal(r).unwrap(), - |a, b| a.not_equal(&b).unique().unwrap(), + |a, b| a | b, true, ) } @@ -717,7 +718,7 @@ impl ChunkCompare<&StructChunked2> for StructChunked2 { self, rhs, |l, r| l.not_equal_missing(r).unwrap(), - |a, b| a.not_equal_missing(&b).unique().unwrap(), + |a, b| a | b, true, ) } diff --git a/crates/polars-core/src/chunked_array/iterator/mod.rs b/crates/polars-core/src/chunked_array/iterator/mod.rs index 0cdb077c0f98..64b7fb30bb7c 100644 --- a/crates/polars-core/src/chunked_array/iterator/mod.rs +++ b/crates/polars-core/src/chunked_array/iterator/mod.rs @@ -1,9 +1,6 @@ use arrow::array::*; use crate::prelude::*; -#[cfg(feature = "dtype-struct")] -use crate::series::iterator::SeriesIter; -use crate::utils::Container; pub mod par; @@ -421,58 +418,6 @@ impl ObjectChunked { } } -// TODO: STRUCT REFACTOR: REMOVE THIS -#[cfg(feature = "dtype-struct")] -impl<'a> IntoIterator for &'a StructChunked2 { - type Item = Option<&'a [AnyValue<'a>]>; - type IntoIter = StructIter2<'a>; - - fn into_iter(self) -> Self::IntoIter { - assert_eq!(self.n_chunks(), 1); - let fields = self.fields_as_series(); - let field_iter = fields - .iter() - .map(|s| { - let iter = s.iter(); - // SAFETY: this works as the reference is to the heap, and not to the struct. - unsafe { std::mem::transmute::, SeriesIter<'a>>(iter) } - }) - .collect(); - - StructIter2 { - field_iter, - buf: vec![], - } - } -} - -#[cfg(feature = "dtype-struct")] -pub struct StructIter2<'a> { - field_iter: Vec>, - buf: Vec>, -} - -#[cfg(feature = "dtype-struct")] -impl<'a> Iterator for StructIter2<'a> { - type Item = Option<&'a [AnyValue<'a>]>; - - fn next(&mut self) -> Option { - self.buf.clear(); - - for it in &mut self.field_iter { - self.buf.push(it.next()?); - } - - // SAFETY: - // Lifetime is bound to struct, we just cannot set the lifetime for the iterator trait - unsafe { - Some(Some(std::mem::transmute::<&'_ [AnyValue], &'a [AnyValue]>( - &self.buf, - ))) - } - } -} - /// Wrapper struct to convert an iterator of type `T` into one of type [`Option`]. It is useful to make the /// [`IntoIterator`] trait, in which every iterator shall return an [`Option`]. pub struct SomeIterator(I) diff --git a/crates/polars-core/src/chunked_array/ops/any_value.rs b/crates/polars-core/src/chunked_array/ops/any_value.rs index 1fc0ab6f12e2..944e6c132a5b 100644 --- a/crates/polars-core/src/chunked_array/ops/any_value.rs +++ b/crates/polars-core/src/chunked_array/ops/any_value.rs @@ -5,6 +5,7 @@ use polars_utils::sync::SyncPtr; use crate::chunked_array::object::extension::polars_extension::PolarsExtension; use crate::prelude::*; use crate::series::implementations::null::NullChunked; +use crate::utils::index_to_chunked_index; #[inline] #[allow(unused_variables)] @@ -317,3 +318,32 @@ impl ChunkAnyValue for NullChunked { Ok(AnyValue::Null) } } + +#[cfg(feature = "dtype-struct")] +impl ChunkAnyValue for StructChunked2 { + /// Gets AnyValue from LogicalType + fn get_any_value(&self, i: usize) -> PolarsResult> { + polars_ensure!(i < self.len(), oob = i, self.len()); + unsafe { Ok(self.get_any_value_unchecked(i)) } + } + + unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> { + let (chunk_idx, idx) = index_to_chunked_index(self.chunks.iter().map(|c| c.len()), i); + if let DataType::Struct(flds) = self.dtype() { + // SAFETY: we already have a single chunk and we are + // guarded by the type system. + unsafe { + let arr = &**self.chunks.get_unchecked(chunk_idx); + let arr = &*(arr as *const dyn Array as *const StructArray); + + if arr.is_null_unchecked(idx) { + AnyValue::Null + } else { + AnyValue::Struct(idx, arr, flds) + } + } + } else { + unreachable!() + } + } +} diff --git a/crates/polars-core/src/chunked_array/ops/downcast.rs b/crates/polars-core/src/chunked_array/ops/downcast.rs index 70702bb3f782..a10386145291 100644 --- a/crates/polars-core/src/chunked_array/ops/downcast.rs +++ b/crates/polars-core/src/chunked_array/ops/downcast.rs @@ -1,6 +1,7 @@ use std::marker::PhantomData; use arrow::array::*; +use arrow::bitmap::Bitmap; use arrow::compute::utils::combine_validities_and; use crate::prelude::*; @@ -155,9 +156,48 @@ impl ChunkedArray { unsafe { for (arr, other) in self.chunks_mut().iter_mut().zip(chunks) { let validity = combine_validities_and(arr.validity(), other.validity()); - arr.with_validity(validity); + *arr = arr.with_validity(validity); } } self.compute_len(); } + + pub(crate) fn set_outer_validity(&mut self, validity: Option) { + assert_eq!(self.chunks().len(), 1); + unsafe { + let arr = self.chunks_mut().iter_mut().next().unwrap(); + *arr = arr.with_validity(validity); + } + self.compute_len(); + } + + pub fn with_outer_validity(mut self, validity: Option) -> Self { + self.set_outer_validity(validity); + self + } + + pub fn with_outer_validity_chunked(mut self, validity: BooleanChunked) -> Self { + assert_eq!(self.len(), validity.len()); + if !self + .chunks + .iter() + .zip(validity.chunks.iter()) + .map(|(a, b)| a.len() == b.len()) + .all_equal() + || self.chunks.len() != validity.chunks().len() + { + let ca = self.rechunk(); + let validity = validity.rechunk(); + ca.with_outer_validity_chunked(validity) + } else { + unsafe { + for (arr, valid) in self.chunks_mut().iter_mut().zip(validity.downcast_iter()) { + assert!(valid.validity().is_none()); + *arr = arr.with_validity(Some(valid.values().clone())) + } + } + self.compute_len(); + self + } + } } diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs index 66b05c7664f9..547a63bd85de 100644 --- a/crates/polars-core/src/chunked_array/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/struct_/mod.rs @@ -3,7 +3,6 @@ mod frame; use std::fmt::Write; use arrow::array::StructArray; -use arrow::bitmap::Bitmap; use arrow::compute::utils::combine_validities_and; use arrow::legacy::utils::CustomIterTools; use polars_error::{polars_ensure, PolarsResult}; @@ -14,7 +13,7 @@ use crate::chunked_array::ChunkedArray; use crate::prelude::sort::arg_sort_multiple::{_get_rows_encoded_arr, _get_rows_encoded_ca}; use crate::prelude::*; use crate::series::Series; -use crate::utils::{index_to_chunked_index, Container}; +use crate::utils::Container; pub type StructChunked2 = ChunkedArray; @@ -171,7 +170,11 @@ impl StructChunked2 { }) .collect::>>()?; - Self::from_series(self.name(), &new_fields).map(|ca| ca.into_series()) + let mut out = Self::from_series(self.name(), &new_fields)?; + if self.null_count > 0 { + out.merge_validities(self.chunks()); + } + Ok(out.into_series()) }, DataType::String => { let ca = self.clone(); @@ -223,7 +226,11 @@ impl StructChunked2 { } }) .collect::>>()?; - Self::from_series(self.name(), &fields).map(|ca| ca.into_series()) + let mut out = Self::from_series(self.name(), &fields)?; + if self.null_count > 0 { + out.merge_validities(self.chunks()); + } + Ok(out.into_series()) }, } } @@ -248,27 +255,6 @@ impl StructChunked2 { self.cast_with_options(dtype, CastOptions::NonStrict) } - /// Gets AnyValue from LogicalType - pub(crate) fn get_any_value(&self, i: usize) -> PolarsResult> { - polars_ensure!(i < self.len(), oob = i, self.len()); - unsafe { Ok(self.get_any_value_unchecked(i)) } - } - - pub(crate) unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> { - let (chunk_idx, idx) = index_to_chunked_index(self.chunks.iter().map(|c| c.len()), i); - if let DataType::Struct(flds) = self.dtype() { - // SAFETY: we already have a single chunk and we are - // guarded by the type system. - unsafe { - let arr = &**self.chunks.get_unchecked(chunk_idx); - let arr = &*(arr as *const dyn Array as *const StructArray); - AnyValue::Struct(idx, arr, flds) - } - } else { - unreachable!() - } - } - pub fn _apply_fields(&self, mut func: F) -> PolarsResult where F: FnMut(&Series) -> Series, @@ -341,15 +327,6 @@ impl StructChunked2 { self.compute_len(); } - pub(crate) fn set_outer_validity(&mut self, validity: Option) { - assert_eq!(self.chunks().len(), 1); - unsafe { - let arr = self.downcast_iter_mut().next().unwrap(); - arr.set_validity(validity) - } - self.compute_len(); - } - pub fn unnest(mut self) -> DataFrame { self.propagate_nulls(); diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs b/crates/polars-ops/src/chunked_array/strings/json_path.rs index ca9fa07f6ac3..911016049819 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -189,8 +189,6 @@ mod tests { assert_eq!(ca.json_infer(Some(2)).unwrap(), expected_dtype); } - // STRUCT REFACTOR - #[ignore] #[test] fn test_json_decode() { let s = Series::new( @@ -212,6 +210,7 @@ mod tests { ], ) .unwrap() + .with_outer_validity_chunked(BooleanChunked::new("", [false, true, true, false])) .into_series(); let expected_dtype = expected_series.dtype().clone(); diff --git a/crates/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs index 209fb8f39779..b7293198fc9b 100644 --- a/crates/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -159,7 +159,7 @@ pub fn qcut( } mod test { - // STRUCT REFACTOR + // This need metadata in fields #[ignore] #[test] fn test_map_cats_fast_unique() { diff --git a/py-polars/polars/_utils/construction/series.py b/py-polars/polars/_utils/construction/series.py index 7ab7ba283f1d..f13b9f5b0ec5 100644 --- a/py-polars/polars/_utils/construction/series.py +++ b/py-polars/polars/_utils/construction/series.py @@ -149,13 +149,26 @@ def sequence_to_pyseries( return pyseries elif dtype == Struct: + # This is very bad. Goes via rows? And needs to do outer nullability separate. + # It also has two data passes. + # TODO: eventually go into struct builder struct_schema = dtype.to_schema() if isinstance(dtype, Struct) else None empty = {} # type: ignore[var-annotated] + + data = [] + invalid = [] + for i, v in enumerate(values): + if v is None: + invalid.append(i) + data.append(empty) + else: + data.append(v) + return plc.sequence_to_pydf( - data=[(empty if v is None else v) for v in values], + data=data, schema=struct_schema, orient="row", - ).to_struct(name) + ).to_struct(name, invalid) if python_dtype is None: if value is None: diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 43450fc6e753..3158c3e61435 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -10518,7 +10518,7 @@ def to_struct(self, name: str = "") -> Series: {5,"five"} ] """ - return wrap_s(self._df.to_struct(name)) + return wrap_s(self._df.to_struct(name, [])) def unnest( self, diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 2491c7c22a0b..81597ac5b7ac 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -4094,32 +4094,31 @@ def rle_id(self) -> Expr: This functionality is especially useful for defining a new group for every time a column's value changes, rather than for every distinct value of that column. + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 1, 1, 1], + ... "b": ["x", "x", None, "y", "y"], + ... } + ... ) + >>> df.with_columns( + ... rle_id_a=pl.col("a").rle_id(), + ... rle_id_ab=pl.struct("a", "b").rle_id(), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────────┬───────────┐ + │ a ┆ b ┆ rle_id_a ┆ rle_id_ab │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪══════════╪═══════════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴──────────┴───────────┘ """ - # STRUCT REFACTOR - # Examples - # -------- - # >>> df = pl.DataFrame( - # ... { - # ... "a": [1, 2, 1, 1, 1], - # ... "b": ["x", "x", None, "y", "y"], - # ... } - # ... ) - # >>> df.with_columns( - # ... rle_id_a=pl.col("a").rle_id(), - # ... rle_id_ab=pl.struct("a", "b").rle_id(), - # ... ) - # shape: (5, 4) - # ┌─────┬──────┬──────────┬───────────┐ - # │ a ┆ b ┆ rle_id_a ┆ rle_id_ab │ - # │ --- ┆ --- ┆ --- ┆ --- │ - # │ i64 ┆ str ┆ u32 ┆ u32 │ - # ╞═════╪══════╪══════════╪═══════════╡ - # │ 1 ┆ x ┆ 0 ┆ 0 │ - # │ 2 ┆ x ┆ 1 ┆ 1 │ - # │ 1 ┆ null ┆ 2 ┆ 2 │ - # │ 1 ┆ y ┆ 2 ┆ 3 │ - # │ 1 ┆ y ┆ 2 ┆ 3 │ - # └─────┴──────┴──────────┴───────────┘ return self._from_pyexpr(self._pyexpr.rle_id()) def filter( diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index e34c303e8605..109aaa096893 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -1226,15 +1226,15 @@ def json_decode( >>> dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)]) >>> df.with_columns(decoded=pl.col("json").str.json_decode(dtype)) shape: (3, 2) - ┌─────────────────────┬─────────────┐ - │ json ┆ decoded │ - │ --- ┆ --- │ - │ str ┆ struct[2] │ - ╞═════════════════════╪═════════════╡ - │ {"a":1, "b": true} ┆ {1,true} │ - │ null ┆ {null,null} │ - │ {"a":2, "b": false} ┆ {2,false} │ - └─────────────────────┴─────────────┘ + ┌─────────────────────┬───────────┐ + │ json ┆ decoded │ + │ --- ┆ --- │ + │ str ┆ struct[2] │ + ╞═════════════════════╪═══════════╡ + │ {"a":1, "b": true} ┆ {1,true} │ + │ null ┆ null │ + │ {"a":2, "b": false} ┆ {2,false} │ + └─────────────────────┴───────────┘ """ if dtype is not None: dtype = parse_into_dtype(dtype) diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index ddcfd1568975..1591a3c516de 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -668,7 +668,7 @@ def json_decode( Series: 'json' [struct[2]] [ {1,true} - {null,null} + null {2,false} ] """ diff --git a/py-polars/src/conversion/mod.rs b/py-polars/src/conversion/mod.rs index 908d2e65dd87..6056103b3251 100644 --- a/py-polars/src/conversion/mod.rs +++ b/py-polars/src/conversion/mod.rs @@ -65,11 +65,6 @@ pub(crate) fn reinterpret_vec(input: Vec) -> Vec { unsafe { Vec::from_raw_parts(ptr, len, cap) } } -pub(crate) fn slice_to_wrapped(slice: &[T]) -> &[Wrap] { - // SAFETY: Wrap is transparent. - unsafe { std::mem::transmute(slice) } -} - pub(crate) fn vec_extract_wrapped(buf: Vec>) -> Vec { reinterpret_vec(buf) } diff --git a/py-polars/src/dataframe/general.rs b/py-polars/src/dataframe/general.rs index 7699c8f0b823..9998e6283ae8 100644 --- a/py-polars/src/dataframe/general.rs +++ b/py-polars/src/dataframe/general.rs @@ -1,4 +1,5 @@ use either::Either; +use polars::export::arrow::bitmap::MutableBitmap; use polars::prelude::*; use polars_core::frame::*; #[cfg(feature = "pivot")] @@ -571,9 +572,22 @@ impl PyDataFrame { Ok(out.into()) } - pub fn to_struct(&self, name: &str) -> PySeries { - let s = self.df.clone().into_struct(name); - s.into_series().into() + pub fn to_struct(&self, name: &str, invalid_indices: Vec) -> PySeries { + let ca = self.df.clone().into_struct(name); + + if !invalid_indices.is_empty() { + let mut validity = MutableBitmap::with_capacity(ca.len()); + validity.extend_constant(ca.len(), true); + for i in invalid_indices { + validity.set(i, false); + } + let ca = ca.rechunk(); + ca.with_outer_validity(Some(validity.freeze())) + .into_series() + .into() + } else { + ca.into_series().into() + } } pub fn unnest(&self, columns: Vec) -> PyResult { diff --git a/py-polars/src/map/series.rs b/py-polars/src/map/series.rs index 815d9e1cd174..3565aea6005f 100644 --- a/py-polars/src/map/series.rs +++ b/py-polars/src/map/series.rs @@ -4,7 +4,6 @@ use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyBool, PyCFunction, PyFloat, PyList, PyString, PyTuple}; use super::*; -use crate::conversion::slice_to_wrapped; use crate::py_modules::SERIES; /// Find the output type and dispatch to that implementation. @@ -2237,28 +2236,16 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { } } -fn make_dict_arg(py: Python, names: &[&str], vals: &[AnyValue]) -> Py { - let dict = PyDict::new_bound(py); - for (name, val) in names.iter().zip(slice_to_wrapped(vals)) { - dict.set_item(name, val).unwrap() - } - dict.unbind() -} - -fn get_names(ca: &StructChunked2) -> Vec<&str> { - ca.struct_fields() - .iter() - .map(|s| s.name().as_str()) - .collect::>() +fn iter_struct(ca: &StructChunked2) -> impl Iterator { + (0..ca.len()).map(|i| unsafe { ca.get_any_value_unchecked(i) }) } impl<'a> ApplyLambda<'a> for StructChunked2 { fn apply_lambda_unknown(&'a self, py: Python, lambda: &Bound<'a, PyAny>) -> PyResult { - let names = get_names(self); let mut null_count = 0; - for val in self.into_iter() { - let arg = val.map(|val| make_dict_arg(py, &names, val)); - let out = lambda.call1((arg,))?; + + for val in iter_struct(self) { + let out = lambda.call1((Wrap(val),))?; if out.is_none() { null_count += 1; continue; @@ -2272,17 +2259,14 @@ impl<'a> ApplyLambda<'a> for StructChunked2 { fn apply_to_struct( &'a self, - py: Python, + _py: Python, lambda: &Bound<'a, PyAny>, init_null_count: usize, first_value: AnyValue<'a>, ) -> PyResult { - let names = get_names(self); - let skip = 1; - let it = self.into_iter().skip(init_null_count + skip).map(|val| { - let arg = val.map(|val| make_dict_arg(py, &names, val)); - let out = lambda.call1((arg,)).unwrap(); + let it = iter_struct(self).skip(init_null_count + skip).map(|val| { + let out = lambda.call1((Wrap(val),)).unwrap(); Some(out) }); iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) @@ -2299,13 +2283,10 @@ impl<'a> ApplyLambda<'a> for StructChunked2 { D: PyArrowPrimitiveType, D::Native: ToPyObject + FromPyObject<'a>, { - let names = get_names(self); - let skip = usize::from(first_value.is_some()); - let it = self.into_iter().skip(init_null_count + skip).map(|val| { - let arg = val.map(|val| make_dict_arg(py, &names, val)); - call_lambda_and_extract(py, lambda, arg).ok() - }); + let it = iter_struct(self) + .skip(init_null_count + skip) + .map(|val| call_lambda_and_extract(py, lambda, Wrap(val)).ok()); Ok(iterator_to_primitive( it, @@ -2323,13 +2304,10 @@ impl<'a> ApplyLambda<'a> for StructChunked2 { init_null_count: usize, first_value: Option, ) -> PyResult { - let names = get_names(self); - let skip = usize::from(first_value.is_some()); - let it = self.into_iter().skip(init_null_count + skip).map(|val| { - let arg = val.map(|val| make_dict_arg(py, &names, val)); - call_lambda_and_extract(py, lambda, arg).ok() - }); + let it = iter_struct(self) + .skip(init_null_count + skip) + .map(|val| call_lambda_and_extract(py, lambda, Wrap(val)).ok()); Ok(iterator_to_bool( it, @@ -2347,13 +2325,10 @@ impl<'a> ApplyLambda<'a> for StructChunked2 { init_null_count: usize, first_value: Option, ) -> PyResult { - let names = get_names(self); - let skip = usize::from(first_value.is_some()); - let it = self.into_iter().skip(init_null_count + skip).map(|val| { - let arg = val.map(|val| make_dict_arg(py, &names, val)); - call_lambda_and_extract(py, lambda, arg).ok() - }); + let it = iter_struct(self) + .skip(init_null_count + skip) + .map(|val| call_lambda_and_extract(py, lambda, Wrap(val)).ok()); Ok(iterator_to_string( it, @@ -2372,14 +2347,10 @@ impl<'a> ApplyLambda<'a> for StructChunked2 { dt: &DataType, ) -> PyResult { let skip = 1; - - let names = get_names(self); - let lambda = lambda.bind(py); - let it = self.into_iter().skip(init_null_count + skip).map(|val| { - let arg = val.map(|val| make_dict_arg(py, &names, val)); - call_lambda_series_out(py, lambda, arg).ok() - }); + let it = iter_struct(self) + .skip(init_null_count + skip) + .map(|val| call_lambda_series_out(py, lambda, Wrap(val)).ok()); iterator_to_list( dt, it, @@ -2397,14 +2368,12 @@ impl<'a> ApplyLambda<'a> for StructChunked2 { init_null_count: usize, first_value: AnyValue<'a>, ) -> PyResult { - let names = get_names(self); let mut avs = Vec::with_capacity(self.len()); avs.extend(std::iter::repeat(AnyValue::Null).take(init_null_count)); avs.push(first_value); - let iter = self.into_iter().skip(init_null_count + 1).map(|val| { - let arg = val.map(|val| make_dict_arg(py, &names, val)); - call_lambda_and_extract::<_, Wrap>(py, lambda, arg) + let iter = iter_struct(self).skip(init_null_count + 1).map(|val| { + call_lambda_and_extract::<_, Wrap>(py, lambda, Wrap(val)) .unwrap() .0 }); @@ -2421,13 +2390,10 @@ impl<'a> ApplyLambda<'a> for StructChunked2 { init_null_count: usize, first_value: Option, ) -> PyResult> { - let names = get_names(self); - let skip = usize::from(first_value.is_some()); - let it = self.into_iter().skip(init_null_count + skip).map(|val| { - let arg = val.map(|val| make_dict_arg(py, &names, val)); - call_lambda_and_extract(py, lambda, arg).ok() - }); + let it = iter_struct(self) + .skip(init_null_count + skip) + .map(|val| call_lambda_and_extract(py, lambda, Wrap(val)).ok()); Ok(iterator_to_object( it, diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 425a666f770e..d13531277dd8 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -47,7 +47,6 @@ def test_apply_unnest() -> None: assert_frame_equal(df, expected) -@pytest.mark.skip(reason="struct-refactor") def test_struct_equality() -> None: # equal struct dimensions, equal values s1 = pl.Series("misc", [{"x": "a", "y": 0}, {"x": "b", "y": 0}]) @@ -685,8 +684,18 @@ class TestData: assert df.schema == frame_schema assert df.unnest("y").columns == ["x", "a", "b", "c"] assert df.rows() == [ - (10, {"a": None, "b": None, "c": None}), - (20, {"a": None, "b": None, "c": None}), + ( + 10, + {"a": None, "b": None, "c": None} + if empty_structs[0] is not None # type: ignore[index] + else None, + ), + ( + 20, + {"a": None, "b": None, "c": None} + if empty_structs[1] is not None # type: ignore[index] + else None, + ), ] diff --git a/py-polars/tests/unit/operations/map/test_map_elements.py b/py-polars/tests/unit/operations/map/test_map_elements.py index dcab2dc3aca3..7edc155e223f 100644 --- a/py-polars/tests/unit/operations/map/test_map_elements.py +++ b/py-polars/tests/unit/operations/map/test_map_elements.py @@ -164,7 +164,6 @@ def test_map_elements_type_propagation() -> None: ).to_dict(as_series=False) == {"a": [1, 2, 3], "b": [1.0, 2.0, None]} -@pytest.mark.skip(reason="struct-refactor") def test_empty_list_in_map_elements() -> None: df = pl.DataFrame( {"a": [[1], [1, 2], [3, 4], [5, 6]], "b": [[3], [1, 2], [1, 2], [4, 5]]} diff --git a/py-polars/tests/unit/operations/test_is_in.py b/py-polars/tests/unit/operations/test_is_in.py index 832d638bcb78..546a58d3b6fb 100644 --- a/py-polars/tests/unit/operations/test_is_in.py +++ b/py-polars/tests/unit/operations/test_is_in.py @@ -80,27 +80,15 @@ def test_is_in_struct() -> None: } -@pytest.mark.skip(reason="struct-refactor") def test_is_in_null_prop() -> None: assert pl.Series([None], dtype=pl.Float32).is_in(pl.Series([42])).item() is None - assert ( - pl.Series([{"a": None}], dtype=pl.Struct({"a": pl.Float32})) - .is_in(pl.Series([{"a": 42}])) - .item() - is None - ) - with pytest.raises( - InvalidOperationError, - match="`is_in` cannot check for Int64 values in Boolean data", - ): - _res = pl.Series([None], dtype=pl.Boolean).is_in(pl.Series([42])).item() + assert pl.Series([{"a": None}, None], dtype=pl.Struct({"a": pl.Float32})).is_in( + pl.Series([{"a": 42}]) + ).to_list() == [False, None] - assert ( - pl.Series([{"a": None}], dtype=pl.Struct({"a": pl.Boolean})) - .is_in(pl.Series([{"a": 42}])) - .item() - is None - ) + assert pl.Series([{"a": None}, None], dtype=pl.Struct({"a": pl.Boolean})).is_in( + pl.Series([{"a": 42}]) + ).to_list() == [False, None] def test_is_in_9070() -> None: