diff --git a/crates/polars-arrow/src/trusted_len/push_unchecked.rs b/crates/polars-arrow/src/trusted_len/push_unchecked.rs index 5d268d070777..f3d830f76fa1 100644 --- a/crates/polars-arrow/src/trusted_len/push_unchecked.rs +++ b/crates/polars-arrow/src/trusted_len/push_unchecked.rs @@ -1,13 +1,13 @@ use super::*; pub trait TrustedLenPush { - /// Will push an item and not check if there is enough capacity + /// Will push an item and not check if there is enough capacity. /// /// # Safety /// Caller must ensure the array has enough capacity to hold `T`. unsafe fn push_unchecked(&mut self, value: T); - /// Extend the array with an iterator who's length can be trusted + /// Extend the array with an iterator who's length can be trusted. fn extend_trusted_len, J: TrustedLen>( &mut self, iter: I, @@ -16,9 +16,16 @@ pub trait TrustedLenPush { } /// # Safety - /// Caller must ensure the iterators reported length is correct + /// Caller must ensure the iterators reported length is correct. unsafe fn extend_trusted_len_unchecked>(&mut self, iter: I); + /// # Safety + /// Caller must ensure the iterators reported length is correct. + unsafe fn try_extend_trusted_len_unchecked>>( + &mut self, + iter: I, + ) -> Result<(), E>; + fn from_trusted_len_iter, J: TrustedLen>( iter: I, ) -> Self @@ -28,8 +35,28 @@ pub trait TrustedLenPush { unsafe { Self::from_trusted_len_iter_unchecked(iter) } } /// # Safety - /// Caller must ensure the iterators reported length is correct + /// Caller must ensure the iterators reported length is correct. unsafe fn from_trusted_len_iter_unchecked>(iter: I) -> Self; + + fn try_from_trusted_len_iter< + E, + I: IntoIterator, IntoIter = J>, + J: TrustedLen, + >( + iter: I, + ) -> Result + where + Self: Sized, + { + unsafe { Self::try_from_trusted_len_iter_unchecked(iter) } + } + /// # Safety + /// Caller must ensure the iterators reported length is correct. + unsafe fn try_from_trusted_len_iter_unchecked>>( + iter: I, + ) -> Result + where + Self: Sized; } impl TrustedLenPush for Vec { @@ -55,10 +82,38 @@ impl TrustedLenPush for Vec { self.set_len(self.len() + upper) } + unsafe fn try_extend_trusted_len_unchecked>>( + &mut self, + iter: I, + ) -> Result<(), E> { + let iter = iter.into_iter(); + let upper = iter.size_hint().1.expect("must have an upper bound"); + self.reserve(upper); + + let mut dst = self.as_mut_ptr().add(self.len()); + for value in iter { + std::ptr::write(dst, value?); + dst = dst.add(1) + } + self.set_len(self.len() + upper); + Ok(()) + } + #[inline] unsafe fn from_trusted_len_iter_unchecked>(iter: I) -> Self { let mut v = vec![]; v.extend_trusted_len_unchecked(iter); v } + + unsafe fn try_from_trusted_len_iter_unchecked>>( + iter: I, + ) -> Result + where + Self: Sized, + { + let mut v = vec![]; + v.try_extend_trusted_len_unchecked(iter)?; + Ok(v) + } } diff --git a/crates/polars-core/src/chunked_array/collect.rs b/crates/polars-core/src/chunked_array/collect.rs new file mode 100644 index 000000000000..739cc0c6f5c8 --- /dev/null +++ b/crates/polars-core/src/chunked_array/collect.rs @@ -0,0 +1,171 @@ +//! Methods for collecting into a ChunkedArray. +//! +//! For types that don't have dtype parameters: +//! iter.(try_)collect_ca(_trusted) (name) +//! +//! For all types: +//! iter.(try_)collect_ca(_trusted)_like (other_df) Copies name/dtype from other_df +//! iter.(try_)collect_ca(_trusted)_with_dtype (name, df) +//! +//! The try variants work on iterators of Results, the trusted variants do not +//! check the length of the iterator. + +use std::sync::Arc; + +use polars_arrow::trusted_len::TrustedLen; + +use crate::chunked_array::ChunkedArray; +use crate::datatypes::{ + ArrayCollectIterExt, ArrayFromIter, ArrayFromIterDtype, DataType, Field, PolarsDataType, +}; + +pub trait ChunkedCollectIterExt: Iterator + Sized { + #[inline] + fn collect_ca_with_dtype(self, name: &str, dtype: DataType) -> ChunkedArray + where + T::Array: ArrayFromIterDtype, + { + let field = Arc::new(Field::new(name, dtype.clone())); + let arr = self.collect_arr_with_dtype(dtype); + ChunkedArray::from_chunk_iter_and_field(field, [arr]) + } + + #[inline] + fn collect_ca_like(self, name_dtype_src: &ChunkedArray) -> ChunkedArray + where + T::Array: ArrayFromIterDtype, + { + let field = Arc::clone(&name_dtype_src.field); + let arr = self.collect_arr_with_dtype(field.dtype.clone()); + ChunkedArray::from_chunk_iter_and_field(field, [arr]) + } + + #[inline] + fn collect_ca_trusted_with_dtype(self, name: &str, dtype: DataType) -> ChunkedArray + where + T::Array: ArrayFromIterDtype, + Self: TrustedLen, + { + let field = Arc::new(Field::new(name, dtype.clone())); + let arr = self.collect_arr_trusted_with_dtype(dtype); + ChunkedArray::from_chunk_iter_and_field(field, [arr]) + } + + #[inline] + fn collect_ca_trusted_like(self, name_dtype_src: &ChunkedArray) -> ChunkedArray + where + T::Array: ArrayFromIterDtype, + Self: TrustedLen, + { + let field = Arc::clone(&name_dtype_src.field); + let arr = self.collect_arr_trusted_with_dtype(field.dtype.clone()); + ChunkedArray::from_chunk_iter_and_field(field, [arr]) + } + + #[inline] + fn try_collect_ca_with_dtype( + self, + name: &str, + dtype: DataType, + ) -> Result, E> + where + T::Array: ArrayFromIterDtype, + Self: Iterator>, + { + let field = Arc::new(Field::new(name, dtype.clone())); + let arr = self.try_collect_arr_with_dtype(dtype)?; + Ok(ChunkedArray::from_chunk_iter_and_field(field, [arr])) + } + + #[inline] + fn try_collect_ca_like( + self, + name_dtype_src: &ChunkedArray, + ) -> Result, E> + where + T::Array: ArrayFromIterDtype, + Self: Iterator>, + { + let field = Arc::clone(&name_dtype_src.field); + let arr = self.try_collect_arr_with_dtype(field.dtype.clone())?; + Ok(ChunkedArray::from_chunk_iter_and_field(field, [arr])) + } + + #[inline] + fn try_collect_ca_trusted_with_dtype( + self, + name: &str, + dtype: DataType, + ) -> Result, E> + where + T::Array: ArrayFromIterDtype, + Self: Iterator> + TrustedLen, + { + let field = Arc::new(Field::new(name, dtype.clone())); + let arr = self.try_collect_arr_trusted_with_dtype(dtype)?; + Ok(ChunkedArray::from_chunk_iter_and_field(field, [arr])) + } + + #[inline] + fn try_collect_ca_trusted_like( + self, + name_dtype_src: &ChunkedArray, + ) -> Result, E> + where + T::Array: ArrayFromIterDtype, + Self: Iterator> + TrustedLen, + { + let field = Arc::clone(&name_dtype_src.field); + let arr = self.try_collect_arr_trusted_with_dtype(field.dtype.clone())?; + Ok(ChunkedArray::from_chunk_iter_and_field(field, [arr])) + } +} + +impl ChunkedCollectIterExt for I {} + +pub trait ChunkedCollectInferIterExt: Iterator + Sized { + #[inline] + fn collect_ca(self, name: &str) -> ChunkedArray + where + T::Array: ArrayFromIter, + { + let field = Arc::new(Field::new(name, T::get_dtype())); + let arr = self.collect_arr(); + ChunkedArray::from_chunk_iter_and_field(field, [arr]) + } + + #[inline] + fn collect_ca_trusted(self, name: &str) -> ChunkedArray + where + T::Array: ArrayFromIter, + Self: TrustedLen, + { + let field = Arc::new(Field::new(name, T::get_dtype())); + let arr = self.collect_arr_trusted(); + ChunkedArray::from_chunk_iter_and_field(field, [arr]) + } + + #[inline] + fn try_collect_ca(self, name: &str) -> Result, E> + where + T::Array: ArrayFromIter, + Self: Iterator>, + { + let field = Arc::new(Field::new(name, T::get_dtype())); + let arr = self.try_collect_arr()?; + Ok(ChunkedArray::from_chunk_iter_and_field(field, [arr])) + } + + #[inline] + fn try_collect_ca_trusted(self, name: &str) -> Result, E> + where + T::Array: ArrayFromIter, + Self: Iterator> + TrustedLen, + { + let field = Arc::new(Field::new(name, T::get_dtype())); + let arr = self.try_collect_arr_trusted()?; + Ok(ChunkedArray::from_chunk_iter_and_field(field, [arr])) + } +} + +impl ChunkedCollectInferIterExt for I {} diff --git a/crates/polars-core/src/chunked_array/from.rs b/crates/polars-core/src/chunked_array/from.rs index 9ad53faacbde..d4f47c5ab2c5 100644 --- a/crates/polars-core/src/chunked_array/from.rs +++ b/crates/polars-core/src/chunked_array/from.rs @@ -118,6 +118,35 @@ where unsafe { Ok(Self::from_chunks(name, chunks?)) } } + pub(crate) fn from_chunk_iter_and_field(field: Arc, chunks: I) -> Self + where + I: IntoIterator, + T: PolarsDataType::Item>, + ::Item: Array, + { + assert_eq!( + std::mem::discriminant(&T::get_dtype()), + std::mem::discriminant(&field.dtype) + ); + + let mut length = 0; + let chunks = chunks + .into_iter() + .map(|x| { + length += x.len(); + Box::new(x) as Box + }) + .collect(); + + ChunkedArray { + field, + chunks, + phantom: PhantomData, + bit_settings: Default::default(), + length: length.try_into().unwrap(), + } + } + /// Create a new [`ChunkedArray`] from existing chunks. /// /// # Safety diff --git a/crates/polars-core/src/chunked_array/list/iterator.rs b/crates/polars-core/src/chunked_array/list/iterator.rs index 8bf43fcba21f..b30a339ba812 100644 --- a/crates/polars-core/src/chunked_array/list/iterator.rs +++ b/crates/polars-core/src/chunked_array/list/iterator.rs @@ -173,14 +173,11 @@ impl ListChunked { where V: PolarsDataType, F: FnMut(Option>) -> Option + Copy, - K: ArrayFromElementIter, + V::Array: ArrayFromIter>, { // TODO! make an amortized iter that does not flatten - // SAFETY: unstable series never lives longer than the iterator. - let element_iter = unsafe { self.amortized_iter().map(f) }; - let array = K::array_from_iter(element_iter); - ChunkedArray::from_chunk_iter(self.name(), std::iter::once(array)) + unsafe { self.amortized_iter().map(f).collect_ca(self.name()) } } /// Apply a closure `F` elementwise. diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index 60784cac93f1..a3e479b9f576 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -16,6 +16,7 @@ pub mod ops; pub mod arithmetic; pub mod builder; pub mod cast; +pub mod collect; pub mod comparison; pub mod float; pub mod iterator; diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index a6a096834830..0c6d8e8e885a 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -1,13 +1,9 @@ //! Implementations of the ChunkApply Trait. use std::borrow::Cow; use std::convert::TryFrom; -use std::error::Error; use arrow::array::{BooleanArray, PrimitiveArray}; use arrow::bitmap::utils::{get_bit_unchecked, set_bit_unchecked}; -use arrow::bitmap::Bitmap; -use arrow::trusted_len::TrustedLen; -use arrow::types::NativeType; use polars_arrow::bitmap::unary_mut; use crate::prelude::*; @@ -22,12 +18,11 @@ where where U: PolarsDataType, F: FnMut(T::Physical<'a>) -> K + Copy, - K: ArrayFromElementIter, + U::Array: ArrayFromIter, { let iter = self.downcast_iter().map(|arr| { - let element_iter = arr.values_iter().map(op); - let array = K::array_from_values_iter(element_iter); - array.with_validity_typed(arr.validity().cloned()) + let out: U::Array = arr.values_iter().map(op).collect_arr(); + out.with_validity_typed(arr.validity().cloned()) }); ChunkedArray::from_chunk_iter(self.name(), iter) @@ -37,28 +32,11 @@ where where U: PolarsDataType, F: FnMut(T::Physical<'a>) -> Result + Copy, - K: ArrayFromElementIter, - E: Error, + U::Array: ArrayFromIter, { let iter = self.downcast_iter().map(|arr| { let element_iter = arr.values_iter().map(op); - let array = K::try_array_from_values_iter(element_iter)?; - Ok(array.with_validity_typed(arr.validity().cloned())) - }); - - ChunkedArray::try_from_chunk_iter(self.name(), iter) - } - - pub fn try_apply_generic<'a, U, K, F, E>(&'a self, op: F) -> Result, E> - where - U: PolarsDataType, - F: FnMut(Option>) -> Result, E> + Copy, - K: ArrayFromElementIter, - E: Error, - { - let iter = self.downcast_iter().map(|arr| { - let element_iter = arr.iter().map(op); - let array = K::try_array_from_iter(element_iter)?; + let array: U::Array = element_iter.try_collect_arr()?; Ok(array.with_validity_typed(arr.validity().cloned())) }); @@ -69,55 +47,34 @@ where where U: PolarsDataType, F: FnMut(Option>) -> Option, - K: ArrayFromElementIter, + U::Array: ArrayFromIter>, { if self.null_count() == 0 { - let iter = self.downcast_iter().map(|arr| { - let element_iter = arr.values_iter().map(|x| op(Some(x))); - K::array_from_iter(element_iter) - }); + let iter = self + .downcast_iter() + .map(|arr| arr.values_iter().map(|x| op(Some(x))).collect_arr()); ChunkedArray::from_chunk_iter(self.name(), iter) } else { - let iter = self.downcast_iter().map(|arr| { - let element_iter = arr.iter().map(&mut op); - K::array_from_iter(element_iter) - }); + let iter = self + .downcast_iter() + .map(|arr| arr.iter().map(&mut op).collect_arr()); ChunkedArray::from_chunk_iter(self.name(), iter) } } -} - -fn collect_array>( - iter: I, - validity: Option, -) -> PrimitiveArray { - PrimitiveArray::from_trusted_len_values_iter(iter).with_validity(validity) -} -macro_rules! try_apply { - ($self:expr, $f:expr) => {{ - if !$self.has_validity() { - $self.into_no_null_iter().map($f).collect() - } else { - $self - .into_iter() - .map(|opt_v| opt_v.map($f).transpose()) - .collect() - } - }}; -} + pub fn try_apply_generic<'a, U, K, F, E>(&'a self, op: F) -> Result, E> + where + U: PolarsDataType, + F: FnMut(Option>) -> Result, E> + Copy, + U::Array: ArrayFromIter>, + { + let iter = self.downcast_iter().map(|arr| { + let array: U::Array = arr.iter().map(op).try_collect_arr()?; + Ok(array.with_validity_typed(arr.validity().cloned())) + }); -macro_rules! apply { - ($self:expr, $f:expr) => {{ - if !$self.has_validity() { - $self.into_no_null_iter().map($f).collect_trusted() - } else { - $self - .into_iter() - .map(|opt_v| opt_v.map($f)) - .collect_trusted() - } - }}; + ChunkedArray::try_from_chunk_iter(self.name(), iter) + } } fn apply_in_place_impl(name: &str, chunks: Vec, f: F) -> ChunkedArray @@ -215,7 +172,8 @@ where .data_views() .zip(self.iter_validities()) .map(|(slice, validity)| { - collect_array(slice.iter().copied().map(f), validity.cloned()) + let arr: T::Array = slice.iter().copied().map(f).collect_arr(); + arr.with_validity(validity.cloned()) }); ChunkedArray::from_chunk_iter(self.name(), chunks) } @@ -570,7 +528,17 @@ impl<'a> ChunkApply<'a, Series> for ListChunked { } out }; - let mut ca: ListChunked = apply!(self, &mut function); + let mut ca: ListChunked = { + if !self.has_validity() { + self.into_no_null_iter() + .map(&mut function) + .collect_trusted() + } else { + self.into_iter() + .map(|opt_v| opt_v.map(&mut function)) + .collect_trusted() + } + }; if fast_explode { ca.set_fast_explode() } @@ -595,7 +563,15 @@ impl<'a> ChunkApply<'a, Series> for ListChunked { } out }; - let ca: PolarsResult = try_apply!(self, &mut function); + let ca: PolarsResult = { + if !self.has_validity() { + self.into_no_null_iter().map(&mut function).collect() + } else { + self.into_iter() + .map(|opt_v| opt_v.map(&mut function).transpose()) + .collect() + } + }; let mut ca = ca?; if fast_explode { ca.set_fast_explode() diff --git a/crates/polars-core/src/chunked_array/ops/arity.rs b/crates/polars-core/src/chunked_array/ops/arity.rs index c911963abf2b..3df7c726df7a 100644 --- a/crates/polars-core/src/chunked_array/ops/arity.rs +++ b/crates/polars-core/src/chunked_array/ops/arity.rs @@ -3,7 +3,7 @@ use std::error::Error; use arrow::array::Array; use polars_arrow::utils::combine_validities_and; -use crate::datatypes::{ArrayFromElementIter, StaticArray}; +use crate::datatypes::{ArrayCollectIterExt, ArrayFromIter, StaticArray}; use crate::prelude::{ChunkedArray, PolarsDataType}; use crate::utils::align_chunks_binary; @@ -18,7 +18,7 @@ where U: PolarsDataType, V: PolarsDataType, F: for<'a> FnMut(Option>, Option>) -> Option, - K: ArrayFromElementIter, + V::Array: ArrayFromIter>, { let (lhs, rhs) = align_chunks_binary(lhs, rhs); let iter = lhs @@ -29,7 +29,7 @@ where .iter() .zip(rhs_arr.iter()) .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)); - K::array_from_iter(element_iter) + element_iter.collect_arr() }); ChunkedArray::from_chunk_iter(lhs.name(), iter) } @@ -45,8 +45,7 @@ where U: PolarsDataType, V: PolarsDataType, F: for<'a> FnMut(Option>, Option>) -> Result, E>, - K: ArrayFromElementIter, - E: Error, + V::Array: ArrayFromIter>, { let (lhs, rhs) = align_chunks_binary(lhs, rhs); let iter = lhs @@ -57,7 +56,7 @@ where .iter() .zip(rhs_arr.iter()) .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)); - K::try_array_from_iter(element_iter) + element_iter.try_collect_arr() }); ChunkedArray::try_from_chunk_iter(lhs.name(), iter) } @@ -73,7 +72,7 @@ where U: PolarsDataType, V: PolarsDataType, F: for<'a> FnMut(T::Physical<'a>, U::Physical<'a>) -> K, - K: ArrayFromElementIter, + V::Array: ArrayFromIter, { let (lhs, rhs) = align_chunks_binary(lhs, rhs); let iter = lhs @@ -87,7 +86,7 @@ where .zip(rhs_arr.values_iter()) .map(|(lhs_val, rhs_val)| op(lhs_val, rhs_val)); - let array = K::array_from_values_iter(element_iter); + let array: V::Array = element_iter.collect_arr(); array.with_validity_typed(validity) }); ChunkedArray::from_chunk_iter(lhs.name(), iter) @@ -104,8 +103,7 @@ where U: PolarsDataType, V: PolarsDataType, F: for<'a> FnMut(T::Physical<'a>, U::Physical<'a>) -> Result, - K: ArrayFromElementIter, - E: Error, + V::Array: ArrayFromIter, { let (lhs, rhs) = align_chunks_binary(lhs, rhs); let iter = lhs @@ -119,7 +117,7 @@ where .zip(rhs_arr.values_iter()) .map(|(lhs_val, rhs_val)| op(lhs_val, rhs_val)); - let array = K::try_array_from_values_iter(element_iter)?; + let array: V::Array = element_iter.try_collect_arr()?; Ok(array.with_validity_typed(validity)) }); ChunkedArray::try_from_chunk_iter(lhs.name(), iter) diff --git a/crates/polars-core/src/chunked_array/upstream_traits.rs b/crates/polars-core/src/chunked_array/upstream_traits.rs index 458b7fbf1298..af24444fdf14 100644 --- a/crates/polars-core/src/chunked_array/upstream_traits.rs +++ b/crates/polars-core/src/chunked_array/upstream_traits.rs @@ -40,31 +40,8 @@ where T: PolarsNumericType, { fn from_iter>>(iter: I) -> Self { - let iter = iter.into_iter(); - - let arr: PrimitiveArray = match iter.size_hint() { - (a, Some(b)) if a == b => { - // 2021-02-07: ~40% faster than builder. - // It is unsafe because we cannot be certain that the iterators length can be trusted. - // For most iterators that report the same upper bound as lower bound it is, but still - // somebody can create an iterator that incorrectly gives those bounds. - // This will not lead to UB, but will panic. - #[cfg(feature = "performant")] - unsafe { - let arr = PrimitiveArray::from_trusted_len_iter_unchecked(iter) - .to(T::get_dtype().to_arrow()); - assert_eq!(arr.len(), a); - arr - } - #[cfg(not(feature = "performant"))] - iter.collect::>() - .to(T::get_dtype().to_arrow()) - }, - _ => iter - .collect::>() - .to(T::get_dtype().to_arrow()), - }; - arr.into() + // TODO: eliminate this FromIterator implementation entirely. + iter.into_iter().collect_ca("") } } diff --git a/crates/polars-core/src/datatypes/from_values.rs b/crates/polars-core/src/datatypes/from_values.rs deleted file mode 100644 index 2283adc90445..000000000000 --- a/crates/polars-core/src/datatypes/from_values.rs +++ /dev/null @@ -1,216 +0,0 @@ -use std::borrow::Cow; -use std::error::Error; - -use arrow::array::{ - BinaryArray, BooleanArray, MutableBinaryArray, MutableBinaryValuesArray, MutablePrimitiveArray, - MutableUtf8Array, MutableUtf8ValuesArray, PrimitiveArray, Utf8Array, -}; -use arrow::bitmap::Bitmap; -use polars_arrow::array::utf8::{BinaryFromIter, Utf8FromIter}; -use polars_arrow::prelude::FromData; -use polars_arrow::trusted_len::TrustedLen; - -use crate::datatypes::NumericNative; -use crate::prelude::StaticArray; - -pub trait ArrayFromElementIter -where - Self: Sized, -{ - type ArrayType: StaticArray; - - fn array_from_iter>>(iter: I) -> Self::ArrayType; - - fn array_from_values_iter>(iter: I) -> Self::ArrayType; - - fn try_array_from_iter, E>>>( - iter: I, - ) -> Result; - - fn try_array_from_values_iter>>( - iter: I, - ) -> Result; -} - -impl ArrayFromElementIter for bool { - type ArrayType = BooleanArray; - - fn array_from_iter>>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { BooleanArray::from_trusted_len_iter_unchecked(iter) } - } - - fn array_from_values_iter>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { BooleanArray::from_trusted_len_values_iter_unchecked(iter) } - } - - fn try_array_from_iter, E>>>( - iter: I, - ) -> Result { - // SAFETY: guarded by `TrustedLen` trait - unsafe { BooleanArray::try_from_trusted_len_iter_unchecked(iter) } - } - fn try_array_from_values_iter>>( - iter: I, - ) -> Result { - // SAFETY: guarded by `TrustedLen` trait - let values = unsafe { Bitmap::try_from_trusted_len_iter_unchecked(iter) }?; - Ok(BooleanArray::from_data_default(values, None)) - } -} - -impl ArrayFromElementIter for T -where - T: NumericNative, -{ - type ArrayType = PrimitiveArray; - - fn array_from_iter>>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { PrimitiveArray::from_trusted_len_iter_unchecked(iter) } - } - - fn array_from_values_iter>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { PrimitiveArray::from_trusted_len_values_iter_unchecked(iter) } - } - fn try_array_from_iter, E>>>( - iter: I, - ) -> Result { - // SAFETY: guarded by `TrustedLen` trait - unsafe { Ok(MutablePrimitiveArray::try_from_trusted_len_iter_unchecked(iter)?.into()) } - } - fn try_array_from_values_iter>>( - iter: I, - ) -> Result { - let values: Vec<_> = iter.collect::, _>>()?; - Ok(PrimitiveArray::from_vec(values)) - } -} - -impl ArrayFromElementIter for &str { - type ArrayType = Utf8Array; - - fn array_from_iter>>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { Utf8Array::from_trusted_len_iter_unchecked(iter) } - } - - fn array_from_values_iter>(iter: I) -> Self::ArrayType { - let len = iter.size_hint().0; - Utf8Array::from_values_iter(iter, len, len * 24) - } - fn try_array_from_iter, E>>>( - iter: I, - ) -> Result { - let len = iter.size_hint().0; - let mut mutable = MutableUtf8Array::::with_capacities(len, len * 24); - mutable.extend_fallible(iter)?; - Ok(mutable.into()) - } - - fn try_array_from_values_iter>>( - iter: I, - ) -> Result { - let len = iter.size_hint().0; - let mut mutable = MutableUtf8ValuesArray::::with_capacities(len, len * 24); - mutable.extend_fallible(iter)?; - Ok(mutable.into()) - } -} - -impl ArrayFromElementIter for String { - type ArrayType = Utf8Array; - - fn array_from_iter>>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { Utf8Array::from_trusted_len_iter_unchecked(iter) } - } - - fn array_from_values_iter>(iter: I) -> Self::ArrayType { - let len = iter.size_hint().0; - Utf8Array::from_values_iter(iter, len, len * 24) - } - fn try_array_from_iter, E>>>( - iter: I, - ) -> Result { - let len = iter.size_hint().0; - let mut mutable = MutableUtf8Array::::with_capacities(len, len * 24); - mutable.extend_fallible(iter)?; - Ok(mutable.into()) - } - - fn try_array_from_values_iter>>( - iter: I, - ) -> Result { - let len = iter.size_hint().0; - let mut mutable = MutableUtf8ValuesArray::::with_capacities(len, len * 24); - mutable.extend_fallible(iter)?; - Ok(mutable.into()) - } -} - -impl ArrayFromElementIter for Cow<'_, str> { - type ArrayType = Utf8Array; - - fn array_from_iter>>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { Utf8Array::from_trusted_len_iter_unchecked(iter) } - } - - fn array_from_values_iter>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - let len = iter.size_hint().0; - Utf8Array::from_values_iter(iter, len, len * 24) - } - fn try_array_from_iter, E>>>( - iter: I, - ) -> Result { - let len = iter.size_hint().0; - let mut mutable = MutableUtf8Array::::with_capacities(len, len * 24); - mutable.extend_fallible(iter)?; - Ok(mutable.into()) - } - - fn try_array_from_values_iter>>( - iter: I, - ) -> Result { - let len = iter.size_hint().0; - let mut mutable = MutableUtf8ValuesArray::::with_capacities(len, len * 24); - mutable.extend_fallible(iter)?; - Ok(mutable.into()) - } -} - -impl ArrayFromElementIter for Cow<'_, [u8]> { - type ArrayType = BinaryArray; - - fn array_from_iter>>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { BinaryArray::from_trusted_len_iter_unchecked(iter) } - } - - fn array_from_values_iter>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - let len = iter.size_hint().0; - BinaryArray::from_values_iter(iter, len, len * 24) - } - fn try_array_from_iter, E>>>( - iter: I, - ) -> Result { - let len = iter.size_hint().0; - let mut mutable = MutableBinaryArray::::with_capacities(len, len * 24); - mutable.extend_fallible(iter)?; - Ok(mutable.into()) - } - - fn try_array_from_values_iter>>( - iter: I, - ) -> Result { - let len = iter.size_hint().0; - let mut mutable = MutableBinaryValuesArray::::with_capacities(len, len * 24); - mutable.extend_fallible(iter)?; - Ok(mutable.into()) - } -} diff --git a/crates/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs index 37d2d2d24a6a..f7792c0719bb 100644 --- a/crates/polars-core/src/datatypes/mod.rs +++ b/crates/polars-core/src/datatypes/mod.rs @@ -12,8 +12,8 @@ mod aliases; mod any_value; mod dtype; mod field; -mod from_values; mod static_array; +mod static_array_collect; mod time_unit; use std::cmp::Ordering; @@ -32,7 +32,6 @@ use arrow::types::simd::Simd; use arrow::types::NativeType; pub use dtype::*; pub use field::*; -pub use from_values::ArrayFromElementIter; use num_traits::{Bounded, FromPrimitive, Num, NumCast, One, Zero}; use polars_arrow::data_types::IsFloat; #[cfg(feature = "serde")] @@ -42,6 +41,7 @@ use serde::{Deserialize, Serialize}; #[cfg(any(feature = "serde", feature = "serde-lazy"))] use serde::{Deserializer, Serializer}; pub use static_array::StaticArray; +pub use static_array_collect::{ArrayCollectIterExt, ArrayFromIter, ArrayFromIterDtype}; pub use time_unit::*; use crate::chunked_array::arithmetic::ArrayArithmetics; diff --git a/crates/polars-core/src/datatypes/static_array.rs b/crates/polars-core/src/datatypes/static_array.rs index 162bbd88bfbd..d4cc0aa61960 100644 --- a/crates/polars-core/src/datatypes/static_array.rs +++ b/crates/polars-core/src/datatypes/static_array.rs @@ -3,9 +3,14 @@ use arrow::bitmap::Bitmap; #[cfg(feature = "object")] use crate::chunked_array::object::{ObjectArray, ObjectValueIter}; +use crate::datatypes::static_array_collect::ArrayFromIterDtype; use crate::prelude::*; -pub trait StaticArray: Array { +pub trait StaticArray: + Array + + for<'a> ArrayFromIterDtype> + + for<'a> ArrayFromIterDtype>> +{ type ValueT<'a> where Self: 'a; @@ -55,6 +60,10 @@ pub trait StaticArray: Array { fn with_validity_typed(self, validity: Option) -> Self; } +pub trait ParameterFreeDtypeStaticArray: StaticArray { + fn get_dtype() -> DataType; +} + impl StaticArray for PrimitiveArray { type ValueT<'a> = T; type ValueIterT<'a> = std::iter::Copied>; @@ -77,6 +86,12 @@ impl StaticArray for PrimitiveArray { } } +impl ParameterFreeDtypeStaticArray for PrimitiveArray { + fn get_dtype() -> DataType { + T::PolarsType::get_dtype() + } +} + impl StaticArray for BooleanArray { type ValueT<'a> = bool; type ValueIterT<'a> = BitmapIter<'a>; @@ -99,6 +114,12 @@ impl StaticArray for BooleanArray { } } +impl ParameterFreeDtypeStaticArray for BooleanArray { + fn get_dtype() -> DataType { + DataType::Boolean + } +} + impl StaticArray for Utf8Array { type ValueT<'a> = &'a str; type ValueIterT<'a> = Utf8ValuesIter<'a, i64>; @@ -121,6 +142,12 @@ impl StaticArray for Utf8Array { } } +impl ParameterFreeDtypeStaticArray for Utf8Array { + fn get_dtype() -> DataType { + DataType::Utf8 + } +} + impl StaticArray for BinaryArray { type ValueT<'a> = &'a [u8]; type ValueIterT<'a> = BinaryValueIter<'a, i64>; @@ -143,6 +170,12 @@ impl StaticArray for BinaryArray { } } +impl ParameterFreeDtypeStaticArray for BinaryArray { + fn get_dtype() -> DataType { + DataType::Binary + } +} + impl StaticArray for ListArray { type ValueT<'a> = Box; type ValueIterT<'a> = ListValuesIter<'a, i64>; diff --git a/crates/polars-core/src/datatypes/static_array_collect.rs b/crates/polars-core/src/datatypes/static_array_collect.rs new file mode 100644 index 000000000000..27992f0f8e67 --- /dev/null +++ b/crates/polars-core/src/datatypes/static_array_collect.rs @@ -0,0 +1,861 @@ +use std::borrow::Cow; +use std::sync::Arc; + +#[cfg(feature = "dtype-array")] +use arrow::array::FixedSizeListArray; +use arrow::array::{ + Array, BinaryArray, BooleanArray, ListArray, MutableBinaryArray, MutableBinaryValuesArray, + PrimitiveArray, Utf8Array, +}; +use arrow::bitmap::Bitmap; +#[cfg(feature = "dtype-array")] +use polars_arrow::prelude::fixed_size_list::AnonymousBuilder as AnonymousFixedSizeListArrayBuilder; +use polars_arrow::prelude::list::AnonymousBuilder as AnonymousListArrayBuilder; +use polars_arrow::trusted_len::{TrustedLen, TrustedLenPush}; + +#[cfg(feature = "object")] +use crate::chunked_array::object::{ObjectArray, PolarsObject}; +use crate::datatypes::static_array::ParameterFreeDtypeStaticArray; +use crate::datatypes::{DataType, NumericNative, PolarsDataType, StaticArray}; + +pub trait ArrayFromIterDtype: Sized { + fn arr_from_iter_with_dtype>(dtype: DataType, iter: I) -> Self; + + #[inline(always)] + fn arr_from_iter_trusted_with_dtype(dtype: DataType, iter: I) -> Self + where + I: IntoIterator, + I::IntoIter: TrustedLen, + { + Self::arr_from_iter_with_dtype(dtype, iter) + } + + fn try_arr_from_iter_with_dtype>>( + dtype: DataType, + iter: I, + ) -> Result; + + #[inline(always)] + fn try_arr_from_iter_trusted_with_dtype(dtype: DataType, iter: I) -> Result + where + I: IntoIterator>, + I::IntoIter: TrustedLen, + { + Self::try_arr_from_iter_with_dtype(dtype, iter) + } +} + +pub trait ArrayFromIter: Sized { + fn arr_from_iter>(iter: I) -> Self; + + #[inline(always)] + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator, + I::IntoIter: TrustedLen, + { + Self::arr_from_iter(iter) + } + + fn try_arr_from_iter>>(iter: I) -> Result; + + #[inline(always)] + fn try_arr_from_iter_trusted(iter: I) -> Result + where + I: IntoIterator>, + I::IntoIter: TrustedLen, + { + Self::try_arr_from_iter(iter) + } +} + +impl> ArrayFromIterDtype for A { + #[inline(always)] + fn arr_from_iter_with_dtype>(dtype: DataType, iter: I) -> Self { + debug_assert!(std::mem::discriminant(&dtype) == std::mem::discriminant(&A::get_dtype())); + Self::arr_from_iter(iter) + } + + #[inline(always)] + fn arr_from_iter_trusted_with_dtype(dtype: DataType, iter: I) -> Self + where + I: IntoIterator, + I::IntoIter: TrustedLen, + { + debug_assert!(std::mem::discriminant(&dtype) == std::mem::discriminant(&A::get_dtype())); + Self::arr_from_iter_with_dtype(dtype, iter) + } + + #[inline(always)] + fn try_arr_from_iter_with_dtype>>( + dtype: DataType, + iter: I, + ) -> Result { + debug_assert!(std::mem::discriminant(&dtype) == std::mem::discriminant(&A::get_dtype())); + Self::try_arr_from_iter(iter) + } + + #[inline(always)] + fn try_arr_from_iter_trusted_with_dtype(dtype: DataType, iter: I) -> Result + where + I: IntoIterator>, + I::IntoIter: TrustedLen, + { + debug_assert!(std::mem::discriminant(&dtype) == std::mem::discriminant(&A::get_dtype())); + Self::try_arr_from_iter_with_dtype(dtype, iter) + } +} + +pub trait ArrayCollectIterExt: Iterator + Sized { + #[inline(always)] + fn collect_arr(self) -> A + where + A: ArrayFromIter, + { + A::arr_from_iter(self) + } + + #[inline(always)] + fn collect_arr_trusted(self) -> A + where + A: ArrayFromIter, + Self: TrustedLen, + { + A::arr_from_iter_trusted(self) + } + + #[inline(always)] + fn try_collect_arr(self) -> Result + where + A: ArrayFromIter, + Self: Iterator>, + { + A::try_arr_from_iter(self) + } + + #[inline(always)] + fn try_collect_arr_trusted(self) -> Result + where + A: ArrayFromIter, + Self: Iterator> + TrustedLen, + { + A::try_arr_from_iter_trusted(self) + } + + #[inline(always)] + fn collect_arr_with_dtype(self, dtype: DataType) -> A + where + A: ArrayFromIterDtype, + { + A::arr_from_iter_with_dtype(dtype, self) + } + + #[inline(always)] + fn collect_arr_trusted_with_dtype(self, dtype: DataType) -> A + where + A: ArrayFromIterDtype, + Self: TrustedLen, + { + A::arr_from_iter_trusted_with_dtype(dtype, self) + } + + #[inline(always)] + fn try_collect_arr_with_dtype(self, dtype: DataType) -> Result + where + A: ArrayFromIterDtype, + Self: Iterator>, + { + A::try_arr_from_iter_with_dtype(dtype, self) + } + + #[inline(always)] + fn try_collect_arr_trusted_with_dtype(self, dtype: DataType) -> Result + where + A: ArrayFromIterDtype, + Self: Iterator> + TrustedLen, + { + A::try_arr_from_iter_trusted_with_dtype(dtype, self) + } +} + +impl ArrayCollectIterExt for I {} + +// --------------- +// Implementations +// --------------- +macro_rules! impl_collect_vec_validity { + ($iter: ident, $x:ident, $unpack:expr) => {{ + let mut iter = $iter.into_iter(); + let mut buf: Vec = Vec::new(); + let mut bitmap: Vec = Vec::new(); + let lo = iter.size_hint().0; + buf.reserve(8 + lo); + bitmap.reserve(8 + 8 * (lo / 64)); + + let mut nonnull_count = 0; + let mut mask = 0u8; + 'exhausted: loop { + unsafe { + // SAFETY: when we enter this loop we always have at least one + // capacity in bitmap, and at least 8 in buf. + for i in 0..8 { + let Some($x) = iter.next() else { + break 'exhausted; + }; + #[allow(clippy::all)] + // #[allow(clippy::redundant_locals)] Clippy lint too new + let x = $unpack; + let nonnull = x.is_some(); + mask |= (nonnull as u8) << i; + nonnull_count += nonnull as usize; + buf.push_unchecked(x.unwrap_or_default()); + } + + bitmap.push_unchecked(mask); + mask = 0; + } + + buf.reserve(8); + if bitmap.len() == bitmap.capacity() { + bitmap.reserve(8); // Waste some space to make branch more predictable. + } + } + + unsafe { + // SAFETY: when we broke to 'exhausted we had capacity by the loop invariant. + // It's also no problem if we make the mask bigger than strictly necessary. + bitmap.push_unchecked(mask); + } + + let null_count = buf.len() - nonnull_count; + let arrow_bitmap = if null_count > 0 { + unsafe { + // SAFETY: we made sure the null_count is correct. + Some(Bitmap::from_inner(Arc::new(bitmap.into()), 0, buf.len(), null_count).unwrap()) + } + } else { + None + }; + + (buf, arrow_bitmap) + }}; +} + +macro_rules! impl_trusted_collect_vec_validity { + ($iter: ident, $x:ident, $unpack:expr) => {{ + let mut iter = $iter.into_iter(); + let mut buf: Vec = Vec::new(); + let mut bitmap: Vec = Vec::new(); + let n = iter.size_hint().1.expect("must have an upper bound"); + buf.reserve(n); + bitmap.reserve(8 + 8 * (n / 64)); + + let mut nonnull_count = 0; + while buf.len() + 8 <= n { + unsafe { + let mut mask = 0u8; + for i in 0..8 { + let $x = iter.next().unwrap_unchecked(); + #[allow(clippy::all)] + // #[allow(clippy::redundant_locals)] Clippy lint too new + let x = $unpack; + let nonnull = x.is_some(); + mask |= (nonnull as u8) << i; + nonnull_count += nonnull as usize; + buf.push_unchecked(x.unwrap_or_default()); + } + bitmap.push_unchecked(mask); + } + } + + if buf.len() < n { + unsafe { + let mut mask = 0u8; + for i in 0..n - buf.len() { + let $x = iter.next().unwrap_unchecked(); + let x = $unpack; + let nonnull = x.is_some(); + mask |= (nonnull as u8) << i; + nonnull_count += nonnull as usize; + buf.push_unchecked(x.unwrap_or_default()); + } + bitmap.push_unchecked(mask); + } + } + + let null_count = buf.len() - nonnull_count; + let arrow_bitmap = if null_count > 0 { + unsafe { + // SAFETY: we made sure the null_count is correct. + Some(Bitmap::from_inner(Arc::new(bitmap.into()), 0, buf.len(), null_count).unwrap()) + } + } else { + None + }; + + (buf, arrow_bitmap) + }}; +} + +impl ArrayFromIter for PrimitiveArray { + fn arr_from_iter>(iter: I) -> Self { + PrimitiveArray::from_vec(iter.into_iter().collect()) + } + + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator, + I::IntoIter: TrustedLen, + { + PrimitiveArray::from_vec(Vec::from_trusted_len_iter(iter)) + } + + fn try_arr_from_iter>>(iter: I) -> Result { + let v: Result, E> = iter.into_iter().collect(); + Ok(PrimitiveArray::from_vec(v?)) + } + + fn try_arr_from_iter_trusted(iter: I) -> Result + where + I: IntoIterator>, + I::IntoIter: TrustedLen, + { + let v = Vec::try_from_trusted_len_iter(iter); + Ok(PrimitiveArray::from_vec(v?)) + } +} + +impl ArrayFromIter> for PrimitiveArray { + fn arr_from_iter>>(iter: I) -> Self { + let (buf, validity) = impl_collect_vec_validity!(iter, x, x); + PrimitiveArray::new(T::PolarsType::get_dtype().to_arrow(), buf.into(), validity) + } + + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator>, + I::IntoIter: TrustedLen, + { + let (buf, validity) = impl_trusted_collect_vec_validity!(iter, x, x); + PrimitiveArray::new(T::PolarsType::get_dtype().to_arrow(), buf.into(), validity) + } + + fn try_arr_from_iter, E>>>( + iter: I, + ) -> Result { + let (buf, validity) = impl_collect_vec_validity!(iter, x, x?); + Ok(PrimitiveArray::new( + T::PolarsType::get_dtype().to_arrow(), + buf.into(), + validity, + )) + } + + fn try_arr_from_iter_trusted(iter: I) -> Result + where + I: IntoIterator, E>>, + I::IntoIter: TrustedLen, + { + let (buf, validity) = impl_trusted_collect_vec_validity!(iter, x, x?); + Ok(PrimitiveArray::new( + T::PolarsType::get_dtype().to_arrow(), + buf.into(), + validity, + )) + } +} + +// We don't use AsRef here because it leads to problems with conflicting implementations, +// as Rust considers that AsRef<[u8]> for Option<&[u8]> could be implemented. +trait IntoBytes { + type AsRefT: AsRef<[u8]>; + fn into_bytes(self) -> Self::AsRefT; +} +trait TrivialIntoBytes: AsRef<[u8]> {} +impl IntoBytes for T { + type AsRefT = Self; + fn into_bytes(self) -> Self { + self + } +} +impl TrivialIntoBytes for Vec {} +impl<'a> TrivialIntoBytes for Cow<'a, [u8]> {} +impl<'a> TrivialIntoBytes for &'a [u8] {} +impl TrivialIntoBytes for String {} +impl<'a> TrivialIntoBytes for &'a str {} +impl<'a> IntoBytes for Cow<'a, str> { + type AsRefT = Cow<'a, [u8]>; + fn into_bytes(self) -> Cow<'a, [u8]> { + match self { + Cow::Borrowed(a) => Cow::Borrowed(a.as_bytes()), + Cow::Owned(s) => Cow::Owned(s.into_bytes()), + } + } +} + +impl ArrayFromIter for BinaryArray { + fn arr_from_iter>(iter: I) -> Self { + BinaryArray::from_iter_values(iter.into_iter().map(|s| s.into_bytes())) + } + + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator, + I::IntoIter: TrustedLen, + { + unsafe { + // SAFETY: our iterator is TrustedLen. + MutableBinaryArray::from_trusted_len_values_iter_unchecked( + iter.into_iter().map(|s| s.into_bytes()), + ) + .into() + } + } + + fn try_arr_from_iter>>(iter: I) -> Result { + // No built-in for this? + let mut arr = MutableBinaryValuesArray::new(); + let mut iter = iter.into_iter(); + arr.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| -> Result<(), E> { + arr.push(x?.into_bytes()); + Ok(()) + })?; + Ok(arr.into()) + } + + // No faster implementation than this available, fall back to default. + // fn try_arr_from_iter_trusted(iter: I) -> Result +} + +impl ArrayFromIter> for BinaryArray { + fn arr_from_iter>>(iter: I) -> Self { + BinaryArray::from_iter(iter.into_iter().map(|s| Some(s?.into_bytes()))) + } + + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator>, + I::IntoIter: TrustedLen, + { + unsafe { + // SAFETY: the iterator is TrustedLen. + BinaryArray::from_trusted_len_iter_unchecked( + iter.into_iter().map(|s| Some(s?.into_bytes())), + ) + } + } + + fn try_arr_from_iter, E>>>( + iter: I, + ) -> Result { + // No built-in for this? + let mut arr = MutableBinaryArray::new(); + let mut iter = iter.into_iter(); + arr.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| -> Result<(), E> { + arr.push(x?.map(|s| s.into_bytes())); + Ok(()) + })?; + Ok(arr.into()) + } + + fn try_arr_from_iter_trusted(iter: I) -> Result + where + I: IntoIterator, E>>, + I::IntoIter: TrustedLen, + { + unsafe { + // SAFETY: the iterator is TrustedLen. + BinaryArray::try_from_trusted_len_iter_unchecked( + iter.into_iter().map(|s| s.map(|s| Some(s?.into_bytes()))), + ) + } + } +} + +/// We use this to re-use the binary collect implementation for strings. +/// # Safety +/// The array must be valid UTF-8. +unsafe fn into_utf8array(arr: BinaryArray) -> Utf8Array { + unsafe { + let (_dt, offsets, values, validity) = arr.into_inner(); + let dt = arrow::datatypes::DataType::LargeUtf8; + Utf8Array::try_new_unchecked(dt, offsets, values, validity).unwrap_unchecked() + } +} + +trait StrIntoBytes: IntoBytes {} +impl StrIntoBytes for String {} +impl<'a> StrIntoBytes for &'a str {} +impl<'a> StrIntoBytes for Cow<'a, str> {} + +impl ArrayFromIter for Utf8Array { + #[inline(always)] + fn arr_from_iter>(iter: I) -> Self { + unsafe { into_utf8array(iter.into_iter().collect_arr()) } + } + + #[inline(always)] + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator, + I::IntoIter: TrustedLen, + { + unsafe { into_utf8array(iter.into_iter().collect_arr()) } + } + + #[inline(always)] + fn try_arr_from_iter>>(iter: I) -> Result { + let arr = iter.into_iter().try_collect_arr()?; + unsafe { Ok(into_utf8array(arr)) } + } + + #[inline(always)] + fn try_arr_from_iter_trusted>>( + iter: I, + ) -> Result { + let arr = iter.into_iter().try_collect_arr()?; + unsafe { Ok(into_utf8array(arr)) } + } +} + +impl ArrayFromIter> for Utf8Array { + #[inline(always)] + fn arr_from_iter>>(iter: I) -> Self { + unsafe { into_utf8array(iter.into_iter().collect_arr()) } + } + + #[inline(always)] + fn arr_from_iter_trusted(iter: I) -> Self + where + I: IntoIterator>, + I::IntoIter: TrustedLen, + { + unsafe { into_utf8array(iter.into_iter().collect_arr()) } + } + + #[inline(always)] + fn try_arr_from_iter, E>>>( + iter: I, + ) -> Result { + let arr = iter.into_iter().try_collect_arr()?; + unsafe { Ok(into_utf8array(arr)) } + } + + #[inline(always)] + fn try_arr_from_iter_trusted, E>>>( + iter: I, + ) -> Result { + let arr = iter.into_iter().try_collect_arr()?; + unsafe { Ok(into_utf8array(arr)) } + } +} + +macro_rules! impl_collect_bool_validity { + ($iter: ident, $x:ident, $unpack:expr, $truth:expr, $nullity:expr, $with_valid:literal) => {{ + let mut iter = $iter.into_iter(); + let mut buf: Vec = Vec::new(); + let mut validity: Vec = Vec::new(); + let lo = iter.size_hint().0; + buf.reserve(8 + 8 * (lo / 64)); + if $with_valid { + validity.reserve(8 + 8 * (lo / 64)); + } + + let mut len = 0; + let mut buf_mask = 0u8; + let mut true_count = 0; + let mut valid_mask = 0u8; + let mut nonnull_count = 0; + 'exhausted: loop { + unsafe { + for i in 0..8 { + let Some($x) = iter.next() else { + break 'exhausted; + }; + #[allow(clippy::all)] + // #[allow(clippy::redundant_locals)] Clippy lint too new + let $x = $unpack; + let is_true: bool = $truth; + buf_mask |= (is_true as u8) << i; + true_count += is_true as usize; + if $with_valid { + let nonnull: bool = $nullity; + valid_mask |= (nonnull as u8) << i; + nonnull_count += nonnull as usize; + } + len += 1; + } + + buf.push_unchecked(buf_mask); + buf_mask = 0; + if $with_valid { + validity.push_unchecked(valid_mask); + valid_mask = 0; + } + } + + if buf.len() == buf.capacity() { + buf.reserve(8); // Waste some space to make branch more predictable. + if $with_valid { + validity.reserve(8); + } + } + } + + unsafe { + // SAFETY: when we broke to 'exhausted we had capacity by the loop invariant. + // It's also no problem if we make the mask bigger than strictly necessary. + buf.push_unchecked(buf_mask); + if $with_valid { + validity.push_unchecked(valid_mask); + } + } + + let false_count = len - true_count; + let values = + unsafe { Bitmap::from_inner(Arc::new(buf.into()), 0, len, false_count).unwrap() }; + + let null_count = len - nonnull_count; + let validity_bitmap = if $with_valid && null_count > 0 { + unsafe { + // SAFETY: we made sure the null_count is correct. + Some(Bitmap::from_inner(Arc::new(validity.into()), 0, len, null_count).unwrap()) + } + } else { + None + }; + + (values, validity_bitmap) + }}; +} + +impl ArrayFromIter for BooleanArray { + fn arr_from_iter>(iter: I) -> Self { + let dt = arrow::datatypes::DataType::Boolean; + let (values, _valid) = impl_collect_bool_validity!(iter, x, x, x, false, false); + BooleanArray::new(dt, values, None) + } + + // TODO: are efficient trusted collects for booleans worth it? + // fn arr_from_iter_trusted(iter: I) -> Self + + fn try_arr_from_iter>>(iter: I) -> Result { + let dt = arrow::datatypes::DataType::Boolean; + let (values, _valid) = impl_collect_bool_validity!(iter, x, x?, x, false, false); + Ok(BooleanArray::new(dt, values, None)) + } + + // fn try_arr_from_iter_trusted>>( +} + +impl ArrayFromIter> for BooleanArray { + fn arr_from_iter>>(iter: I) -> Self { + let dt = arrow::datatypes::DataType::Boolean; + let (values, valid) = + impl_collect_bool_validity!(iter, x, x, x.unwrap_or(false), x.is_some(), true); + BooleanArray::new(dt, values, valid) + } + + // fn arr_from_iter_trusted(iter: I) -> Self + + fn try_arr_from_iter, E>>>( + iter: I, + ) -> Result { + let dt = arrow::datatypes::DataType::Boolean; + let (values, valid) = + impl_collect_bool_validity!(iter, x, x?, x.unwrap_or(false), x.is_some(), true); + Ok(BooleanArray::new(dt, values, valid)) + } + + // fn try_arr_from_iter_trusted, E>>>( +} + +// We don't use AsRef here because it leads to problems with conflicting implementations, +// as Rust considers that AsRef for Option<&dyn Array> could be implemented. +trait AsArray { + fn as_array(&self) -> &dyn Array; + fn into_boxed_array(self) -> Box; // Prevents unnecessary re-boxing. +} +impl AsArray for Box { + fn as_array(&self) -> &dyn Array { + self.as_ref() + } + fn into_boxed_array(self) -> Box { + self + } +} +impl<'a> AsArray for &'a dyn Array { + fn as_array(&self) -> &'a dyn Array { + *self + } + fn into_boxed_array(self) -> Box { + self.to_boxed() + } +} + +// TODO: more efficient (fixed size) list collect routines. +impl ArrayFromIterDtype for ListArray { + fn arr_from_iter_with_dtype>(dtype: DataType, iter: I) -> Self { + let iter_values: Vec = iter.into_iter().collect(); + let mut builder = AnonymousListArrayBuilder::new(iter_values.len()); + for arr in &iter_values { + builder.push(arr.as_array()); + } + builder.finish(Some(&dtype.to_arrow())).unwrap() + } + + fn try_arr_from_iter_with_dtype>>( + dtype: DataType, + iter: I, + ) -> Result { + let iter_values = iter.into_iter().collect::, E>>()?; + Ok(Self::arr_from_iter_with_dtype(dtype, iter_values)) + } +} + +impl ArrayFromIterDtype> for ListArray { + fn arr_from_iter_with_dtype>>( + dtype: DataType, + iter: I, + ) -> Self { + let iter_values: Vec> = iter.into_iter().collect(); + let mut builder = AnonymousListArrayBuilder::new(iter_values.len()); + for arr in &iter_values { + builder.push_opt(arr.as_ref().map(|a| a.as_array())); + } + builder.finish(Some(&dtype.to_arrow())).unwrap() + } + + fn try_arr_from_iter_with_dtype, E>>>( + dtype: DataType, + iter: I, + ) -> Result { + let iter_values = iter.into_iter().collect::, E>>()?; + Ok(Self::arr_from_iter_with_dtype(dtype, iter_values)) + } +} + +#[cfg(feature = "dtype-array")] +impl ArrayFromIterDtype> for FixedSizeListArray { + fn arr_from_iter_with_dtype>>( + dtype: DataType, + iter: I, + ) -> Self { + let DataType::Array(_, width) = &dtype else { + panic!("FixedSizeListArray::arr_from_iter_with_dtype called with non-Array dtype"); + }; + let iter_values: Vec<_> = iter.into_iter().collect(); + let mut builder = AnonymousFixedSizeListArrayBuilder::new(iter_values.len(), *width); + for arr in iter_values { + builder.push(arr.into_boxed_array()); + } + builder.finish(Some(&dtype.to_arrow())).unwrap() + } + + fn try_arr_from_iter_with_dtype, E>>>( + dtype: DataType, + iter: I, + ) -> Result { + let iter_values = iter.into_iter().collect::, E>>()?; + Ok(Self::arr_from_iter_with_dtype(dtype, iter_values)) + } +} + +#[cfg(feature = "dtype-array")] +impl ArrayFromIterDtype>> for FixedSizeListArray { + fn arr_from_iter_with_dtype>>>( + dtype: DataType, + iter: I, + ) -> Self { + let DataType::Array(_, width) = &dtype else { + panic!("FixedSizeListArray::arr_from_iter_with_dtype called with non-Array dtype"); + }; + let iter_values: Vec<_> = iter.into_iter().collect(); + let mut builder = AnonymousFixedSizeListArrayBuilder::new(iter_values.len(), *width); + for arr in iter_values { + match arr { + Some(a) => builder.push(a.into_boxed_array()), + None => builder.push_null(), + } + } + builder.finish(Some(&dtype.to_arrow())).unwrap() + } + + fn try_arr_from_iter_with_dtype< + E, + I: IntoIterator>, E>>, + >( + dtype: DataType, + iter: I, + ) -> Result { + let iter_values = iter.into_iter().collect::, E>>()?; + Ok(Self::arr_from_iter_with_dtype(dtype, iter_values)) + } +} + +// TODO: more efficient implementations, I really took the short path here. +#[cfg(feature = "object")] +impl<'a, T: PolarsObject> ArrayFromIterDtype<&'a T> for ObjectArray { + fn arr_from_iter_with_dtype>(dtype: DataType, iter: I) -> Self { + Self::try_arr_from_iter_with_dtype( + dtype, + iter.into_iter().map(|o| -> Result<_, ()> { Ok(Some(o)) }), + ) + .unwrap() + } + + fn try_arr_from_iter_with_dtype>>( + dtype: DataType, + iter: I, + ) -> Result { + Self::try_arr_from_iter_with_dtype(dtype, iter.into_iter().map(|o| Ok(Some(o?)))) + } +} + +#[cfg(feature = "object")] +impl<'a, T: PolarsObject> ArrayFromIterDtype> for ObjectArray { + fn arr_from_iter_with_dtype>>( + dtype: DataType, + iter: I, + ) -> Self { + Self::try_arr_from_iter_with_dtype( + dtype, + iter.into_iter().map(|o| -> Result<_, ()> { Ok(o) }), + ) + .unwrap() + } + + fn try_arr_from_iter_with_dtype, E>>>( + _dtype: DataType, + iter: I, + ) -> Result { + let iter = iter.into_iter(); + let size = iter.size_hint().0; + + let mut null_mask_builder = arrow::bitmap::MutableBitmap::with_capacity(size); + let values: Vec = iter + .map(|value| match value? { + Some(value) => { + null_mask_builder.push(true); + Ok(value.clone()) + }, + None => { + null_mask_builder.push(false); + Ok(T::default()) + }, + }) + .collect::, E>>()?; + + let null_bit_buffer: Option = null_mask_builder.into(); + let null_bitmap = null_bit_buffer; + let len = values.len(); + Ok(ObjectArray { + values: Arc::new(values), + null_bitmap, + offset: 0, + len, + }) + } +} diff --git a/crates/polars-core/src/prelude.rs b/crates/polars-core/src/prelude.rs index c28e462e533c..49a9ac4ba872 100644 --- a/crates/polars-core/src/prelude.rs +++ b/crates/polars-core/src/prelude.rs @@ -14,6 +14,7 @@ pub use crate::chunked_array::builder::{ ListBooleanChunkedBuilder, ListBuilderTrait, ListPrimitiveChunkedBuilder, ListUtf8ChunkedBuilder, NewChunkedArray, PrimitiveChunkedBuilder, Utf8ChunkedBuilder, }; +pub use crate::chunked_array::collect::{ChunkedCollectInferIterExt, ChunkedCollectIterExt}; pub use crate::chunked_array::iterator::PolarsIterator; #[cfg(feature = "dtype-categorical")] pub use crate::chunked_array::logical::categorical::*; @@ -31,7 +32,7 @@ pub use crate::chunked_array::ops::*; pub use crate::chunked_array::temporal::conversion::*; pub(crate) use crate::chunked_array::ChunkIdIter; pub use crate::chunked_array::ChunkedArray; -pub use crate::datatypes::*; +pub use crate::datatypes::{ArrayCollectIterExt, *}; pub use crate::error::{ polars_bail, polars_ensure, polars_err, polars_warn, PolarsError, PolarsResult, };