diff --git a/crates/polars-core/src/chunked_array/builder/binary.rs b/crates/polars-core/src/chunked_array/builder/binary.rs index 119dc461c7ed..bed05a434ba1 100644 --- a/crates/polars-core/src/chunked_array/builder/binary.rs +++ b/crates/polars-core/src/chunked_array/builder/binary.rs @@ -1,3 +1,5 @@ +use polars_error::constants::LENGTH_LIMIT_MSG; + use super::*; pub struct BinaryChunkedBuilder { @@ -40,7 +42,8 @@ impl BinaryChunkedBuilder { pub fn finish(mut self) -> BinaryChunked { let arr = self.builder.as_box(); - let length = arr.len() as IdxSize; + let length = IdxSize::try_from(arr.len()).expect(LENGTH_LIMIT_MSG); + let null_count = arr.null_count() as IdxSize; ChunkedArray { field: Arc::new(self.field), @@ -48,6 +51,7 @@ impl BinaryChunkedBuilder { phantom: PhantomData, bit_settings: Default::default(), length, + null_count, } } diff --git a/crates/polars-core/src/chunked_array/builder/boolean.rs b/crates/polars-core/src/chunked_array/builder/boolean.rs index 655d94ff1a7d..407bc3abcf53 100644 --- a/crates/polars-core/src/chunked_array/builder/boolean.rs +++ b/crates/polars-core/src/chunked_array/builder/boolean.rs @@ -21,14 +21,14 @@ impl ChunkedBuilder for BooleanChunkedBuilder { fn finish(mut self) -> BooleanChunked { let arr = self.array_builder.as_box(); - let length = arr.len() as IdxSize; let mut ca = ChunkedArray { field: Arc::new(self.field), chunks: vec![arr], phantom: PhantomData, bit_settings: Default::default(), - length, + length: 0, + null_count: 0, }; ca.compute_len(); ca diff --git a/crates/polars-core/src/chunked_array/builder/primitive.rs b/crates/polars-core/src/chunked_array/builder/primitive.rs index f5314a5fb62a..eae7977612fe 100644 --- a/crates/polars-core/src/chunked_array/builder/primitive.rs +++ b/crates/polars-core/src/chunked_array/builder/primitive.rs @@ -27,13 +27,13 @@ where fn finish(mut self) -> ChunkedArray { let arr = self.array_builder.as_box(); - let length = arr.len() as IdxSize; let mut ca = ChunkedArray { field: Arc::new(self.field), chunks: vec![arr], phantom: PhantomData, bit_settings: Default::default(), - length, + length: 0, + null_count: 0, }; ca.compute_len(); ca diff --git a/crates/polars-core/src/chunked_array/builder/utf8.rs b/crates/polars-core/src/chunked_array/builder/utf8.rs index 49f933c790ed..1a1c793563ed 100644 --- a/crates/polars-core/src/chunked_array/builder/utf8.rs +++ b/crates/polars-core/src/chunked_array/builder/utf8.rs @@ -41,14 +41,14 @@ impl Utf8ChunkedBuilder { pub fn finish(mut self) -> Utf8Chunked { let arr = self.builder.as_box(); - let length = arr.len() as IdxSize; let mut ca = ChunkedArray { field: Arc::new(self.field), chunks: vec![arr], phantom: PhantomData, bit_settings: Default::default(), - length, + length: 0, + null_count: 0, }; ca.compute_len(); ca diff --git a/crates/polars-core/src/chunked_array/from.rs b/crates/polars-core/src/chunked_array/from.rs index 1c67b3b75963..c384dec3e241 100644 --- a/crates/polars-core/src/chunked_array/from.rs +++ b/crates/polars-core/src/chunked_array/from.rs @@ -1,3 +1,5 @@ +use polars_error::constants::LENGTH_LIMIT_MSG; + use super::*; #[allow(clippy::all)] @@ -143,10 +145,12 @@ where ); let mut length = 0; + let mut null_count = 0; let chunks = chunks .into_iter() .map(|x| { length += x.len(); + null_count += x.null_count(); Box::new(x) as Box }) .collect(); @@ -156,7 +160,8 @@ where chunks, phantom: PhantomData, bit_settings: Default::default(), - length: length.try_into().unwrap(), + length: length.try_into().expect(LENGTH_LIMIT_MSG), + null_count: null_count as IdxSize, } } @@ -184,6 +189,7 @@ where phantom: PhantomData, bit_settings: Default::default(), length: 0, + null_count: 0, }; out.compute_len(); out @@ -213,6 +219,7 @@ where phantom: PhantomData, bit_settings: Default::default(), length: 0, + null_count: 0, }; out.compute_len(); out @@ -235,6 +242,7 @@ where phantom: PhantomData, bit_settings, length: 0, + null_count: 0, }; out.compute_len(); if !keep_sorted { @@ -258,6 +266,7 @@ where phantom: PhantomData, bit_settings: Default::default(), length: 0, + null_count: 0, }; out.compute_len(); out diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index fa79caea754c..3a619c61d4a5 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -140,6 +140,7 @@ pub struct ChunkedArray { phantom: PhantomData, pub(crate) bit_settings: Settings, length: IdxSize, + null_count: IdxSize, } bitflags! { @@ -303,6 +304,7 @@ impl ChunkedArray { /// /// # Safety /// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks. + /// And the `null_count` remains correct. #[inline] pub unsafe fn chunks_mut(&mut self) -> &mut Vec { &mut self.chunks @@ -316,7 +318,7 @@ impl ChunkedArray { /// Count the null values. #[inline] pub fn null_count(&self) -> usize { - self.chunks.iter().map(|arr| arr.null_count()).sum() + self.null_count as usize } /// Create a new [`ChunkedArray`] from self, where the chunks are replaced. @@ -610,6 +612,7 @@ impl Clone for ChunkedArray { phantom: PhantomData, bit_settings: self.bit_settings, length: self.length, + null_count: self.null_count, } } } diff --git a/crates/polars-core/src/chunked_array/object/builder.rs b/crates/polars-core/src/chunked_array/object/builder.rs index 351cdc58a383..a6f8b9072c98 100644 --- a/crates/polars-core/src/chunked_array/object/builder.rs +++ b/crates/polars-core/src/chunked_array/object/builder.rs @@ -59,6 +59,10 @@ where let null_bitmap: Option = self.bitmask_builder.into(); let len = self.values.len(); + let null_count = null_bitmap + .as_ref() + .map(|validity| validity.unset_bits()) + .unwrap_or(0) as IdxSize; let arr = Box::new(ObjectArray { values: Arc::new(self.values), @@ -72,6 +76,7 @@ where phantom: PhantomData, bit_settings: Default::default(), length: len as IdxSize, + null_count, } } } @@ -136,6 +141,7 @@ where phantom: PhantomData, bit_settings: Default::default(), length: len as IdxSize, + null_count: 0, } } diff --git a/crates/polars-core/src/chunked_array/ops/append.rs b/crates/polars-core/src/chunked_array/ops/append.rs index 3aff4cc9e51c..027ccb09d168 100644 --- a/crates/polars-core/src/chunked_array/ops/append.rs +++ b/crates/polars-core/src/chunked_array/ops/append.rs @@ -78,6 +78,7 @@ where update_sorted_flag_before_append::(self, other); let len = self.len(); self.length += other.length; + self.null_count += other.null_count; new_chunks(&mut self.chunks, &other.chunks, len); } } @@ -90,6 +91,7 @@ impl ListChunked { let len = self.len(); self.length += other.length; + self.null_count += other.null_count; new_chunks(&mut self.chunks, &other.chunks, len); self.set_sorted_flag(IsSorted::Not); if !other._can_fast_explode() { @@ -108,6 +110,7 @@ impl ArrayChunked { let len = self.len(); self.length += other.length; + self.null_count += other.null_count; new_chunks(&mut self.chunks, &other.chunks, len); self.set_sorted_flag(IsSorted::Not); Ok(()) @@ -120,6 +123,7 @@ impl ObjectChunked { pub fn append(&mut self, other: &Self) { let len = self.len(); self.length += other.length; + self.null_count += other.null_count; self.set_sorted_flag(IsSorted::Not); new_chunks(&mut self.chunks, &other.chunks, len); } diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index 093e6c172d95..1254363eaa75 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -220,6 +220,7 @@ impl ChunkedArray { .for_each(|arr| arrow::compute::arity_assign::unary(arr, f)) }; // can be in any order now + self.compute_len(); self.set_sorted_flag(IsSorted::Not); } } diff --git a/crates/polars-core/src/chunked_array/ops/chunkops.rs b/crates/polars-core/src/chunked_array/ops/chunkops.rs index 076dc6476702..a60502afb130 100644 --- a/crates/polars-core/src/chunked_array/ops/chunkops.rs +++ b/crates/polars-core/src/chunked_array/ops/chunkops.rs @@ -74,6 +74,11 @@ impl ChunkedArray { } } self.length = IdxSize::try_from(inner(&self.chunks)).expect(LENGTH_LIMIT_MSG); + self.null_count = self + .chunks + .iter() + .map(|arr| arr.null_count()) + .sum::() as IdxSize; if self.length <= 1 { self.set_sorted_flag(IsSorted::Ascending) diff --git a/crates/polars-core/src/chunked_array/upstream_traits.rs b/crates/polars-core/src/chunked_array/upstream_traits.rs index af24444fdf14..fac284c4615e 100644 --- a/crates/polars-core/src/chunked_array/upstream_traits.rs +++ b/crates/polars-core/src/chunked_array/upstream_traits.rs @@ -30,6 +30,7 @@ impl Default for ChunkedArray { phantom: PhantomData, bit_settings: Default::default(), length: 0, + null_count: 0, } } } @@ -330,6 +331,7 @@ impl FromIterator> for ObjectChunked { phantom: PhantomData, bit_settings: Default::default(), length: 0, + null_count: 0, }; out.compute_len(); out diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 4751f3ad2d23..0cce71998a2a 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -181,6 +181,7 @@ impl Series { /// # Safety /// The caller must ensure the length and the data types of `ArrayRef` does not change. + /// And that the null_count is updated (e.g. with a `compute_len()`) pub unsafe fn chunks_mut(&mut self) -> &mut Vec { #[allow(unused_mut)] let mut ca = self._get_inner_mut(); @@ -254,6 +255,11 @@ impl Series { Ok(self) } + /// Redo a length and null_count compute + pub fn compute_len(&mut self) { + self._get_inner_mut().compute_len() + } + /// Extend the memory backed by this array with the values from `other`. /// /// See [`ChunkedArray::extend`] and [`ChunkedArray::append`]. diff --git a/crates/polars-core/src/utils/mod.rs b/crates/polars-core/src/utils/mod.rs index 43ee28b644f9..c85c807096f5 100644 --- a/crates/polars-core/src/utils/mod.rs +++ b/crates/polars-core/src/utils/mod.rs @@ -879,6 +879,7 @@ pub fn coalesce_nulls<'a, T: PolarsDataType>( *arr_b = arr_b.with_validity(arr.validity().cloned()) } } + b.compute_len(); (Cow::Owned(a), Cow::Owned(b)) } else { (Cow::Borrowed(a), Cow::Borrowed(b)) @@ -899,6 +900,8 @@ pub fn coalesce_nulls_series(a: &Series, b: &Series) -> (Series, Series) { *arr_a = arr_a.with_validity(validity.clone()); *arr_b = arr_b.with_validity(validity); } + a.compute_len(); + b.compute_len(); (a, b) } else { (a.clone(), b.clone()) diff --git a/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs b/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs index bc2af56f18d7..e3d7125b9e27 100644 --- a/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs +++ b/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs @@ -358,6 +358,7 @@ pub(crate) fn insert_streaming_nodes( #[cfg(feature = "dtype-categorical")] DataType::Categorical(_) => string_cache, DataType::List(inner) => allowed_dtype(inner, string_cache), + #[cfg(feature = "dtype-struct")] DataType::Struct(fields) => fields .iter() .all(|fld| allowed_dtype(fld.data_type(), string_cache)),