diff --git a/.github/workflows/lint-global.yml b/.github/workflows/lint-global.yml index 95f9957b5309..031bac502548 100644 --- a/.github/workflows/lint-global.yml +++ b/.github/workflows/lint-global.yml @@ -15,4 +15,4 @@ jobs: - name: Lint Markdown and TOML uses: dprint/check@v2.2 - name: Spell Check with Typos - uses: crate-ci/typos@v1.26.8 + uses: crate-ci/typos@v1.27.2 diff --git a/Cargo.lock b/Cargo.lock index 51d28defc357..5176bd831139 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3191,7 +3191,6 @@ dependencies = [ "polars-ops", "polars-parquet", "polars-plan", - "polars-stream", "polars-time", "polars-utils", "pyo3", diff --git a/crates/polars-arrow/src/array/boolean/mod.rs b/crates/polars-arrow/src/array/boolean/mod.rs index c1a17c0f27f3..1e7efae00d49 100644 --- a/crates/polars-arrow/src/array/boolean/mod.rs +++ b/crates/polars-arrow/src/array/boolean/mod.rs @@ -357,8 +357,8 @@ impl BooleanArray { (dtype, values, validity) } - /// Creates a `[BooleanArray]` from its internal representation. - /// This is the inverted from `[BooleanArray::into_inner]` + /// Creates a [`BooleanArray`] from its internal representation. + /// This is the inverted from [`BooleanArray::into_inner`] /// /// # Safety /// Callers must ensure all invariants of this struct are upheld. diff --git a/crates/polars-arrow/src/array/primitive/mod.rs b/crates/polars-arrow/src/array/primitive/mod.rs index 6915a97a442b..ec4062fc5288 100644 --- a/crates/polars-arrow/src/array/primitive/mod.rs +++ b/crates/polars-arrow/src/array/primitive/mod.rs @@ -311,8 +311,8 @@ impl PrimitiveArray { (dtype, values, validity) } - /// Creates a `[PrimitiveArray]` from its internal representation. - /// This is the inverted from `[PrimitiveArray::into_inner]` + /// Creates a [`PrimitiveArray`] from its internal representation. 
+ /// This is the inverted from [`PrimitiveArray::into_inner`] pub fn from_inner( dtype: ArrowDataType, values: Buffer, @@ -322,8 +322,8 @@ impl PrimitiveArray { Ok(unsafe { Self::from_inner_unchecked(dtype, values, validity) }) } - /// Creates a `[PrimitiveArray]` from its internal representation. - /// This is the inverted from `[PrimitiveArray::into_inner]` + /// Creates a [`PrimitiveArray`] from its internal representation. + /// This is the inverted from [`PrimitiveArray::into_inner`] /// /// # Safety /// Callers must ensure all invariants of this struct are upheld. diff --git a/crates/polars-arrow/src/bitmap/immutable.rs b/crates/polars-arrow/src/bitmap/immutable.rs index 5b8d510dfe6c..3cb2851f56b8 100644 --- a/crates/polars-arrow/src/bitmap/immutable.rs +++ b/crates/polars-arrow/src/bitmap/immutable.rs @@ -472,8 +472,8 @@ impl Bitmap { } } - /// Creates a `[Bitmap]` from its internal representation. - /// This is the inverted from `[Bitmap::into_inner]` + /// Creates a [`Bitmap`] from its internal representation. + /// This is the inverted from [`Bitmap::into_inner`] /// /// # Safety /// Callers must ensure all invariants of this struct are upheld. diff --git a/crates/polars-compute/src/distinct_count.rs b/crates/polars-compute/src/distinct_count.rs deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/crates/polars-core/src/chunked_array/array/mod.rs b/crates/polars-core/src/chunked_array/array/mod.rs index 3e0e47a7e86a..49f0bf7ce1bf 100644 --- a/crates/polars-core/src/chunked_array/array/mod.rs +++ b/crates/polars-core/src/chunked_array/array/mod.rs @@ -81,4 +81,13 @@ impl ArrayChunked { ArrayChunked::try_from_chunk_iter(self.name().clone(), chunks) } + + /// Recurse nested types until we are at the leaf array. 
+ pub fn get_leaf_array(&self) -> Series { + let mut current = self.get_inner(); + while let Some(child_array) = current.try_array() { + current = child_array.get_inner(); + } + current + } } diff --git a/crates/polars-core/src/chunked_array/from_iterator.rs b/crates/polars-core/src/chunked_array/from_iterator.rs index ba9e8d1e6ccc..de5c3f89ee44 100644 --- a/crates/polars-core/src/chunked_array/from_iterator.rs +++ b/crates/polars-core/src/chunked_array/from_iterator.rs @@ -152,6 +152,15 @@ where } } +impl FromIterator> for ListChunked { + fn from_iter>>(iter: T) -> Self { + ListChunked::from_iter( + iter.into_iter() + .map(|c| c.map(|c| c.take_materialized_series())), + ) + } +} + impl FromIterator> for ListChunked { #[inline] fn from_iter>>(iter: I) -> Self { diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index 8ccd455e4bd0..2429d918e2ff 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -317,7 +317,7 @@ impl CategoricalChunked { } } - /// Create an `[Iterator]` that iterates over the `&str` values of the `[CategoricalChunked]`. + /// Create an [`Iterator`] that iterates over the `&str` values of the [`CategoricalChunked`]. 
pub fn iter_str(&self) -> CatIter<'_> { let iter = self.physical().into_iter(); CatIter { diff --git a/crates/polars-core/src/chunked_array/object/extension/mod.rs b/crates/polars-core/src/chunked_array/object/extension/mod.rs index f9167b200211..846ebfa5c16b 100644 --- a/crates/polars-core/src/chunked_array/object/extension/mod.rs +++ b/crates/polars-core/src/chunked_array/object/extension/mod.rs @@ -58,7 +58,7 @@ unsafe fn any_as_u8_slice(p: &T) -> &[u8] { std::slice::from_raw_parts((p as *const T) as *const u8, size_of::()) } -/// Create an extension Array that can be sent to arrow and (once wrapped in `[PolarsExtension]` will +/// Create an extension Array that can be sent to arrow and (once wrapped in [`PolarsExtension`] will /// also call drop on `T`, when the array is dropped. pub(crate) fn create_extension> + TrustedLen, T: Sized + Default>( iter: I, diff --git a/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs b/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs index f9a931a7846a..4c83426ca676 100644 --- a/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs +++ b/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs @@ -23,7 +23,7 @@ impl PolarsExtension { Self { array: Some(array) } } - /// Take the Array hold by `[PolarsExtension]` and forget polars extension, + /// Take the Array hold by [`PolarsExtension`] and forget polars extension, /// so that drop is not called pub(crate) fn take_and_forget(self) -> FixedSizeBinaryArray { let mut md = ManuallyDrop::new(self); @@ -57,15 +57,15 @@ impl PolarsExtension { } } - /// Calls the heap allocated function in the `[ExtensionSentinel]` that knows - /// how to convert the `[FixedSizeBinaryArray]` to a `Series` of type `[ObjectChunked]` + /// Calls the heap allocated function in the [`ExtensionSentinel`] that knows + /// how to convert the [`FixedSizeBinaryArray`] to a `Series` of type [`ObjectChunked`] pub(crate) unsafe fn 
get_series(&self, name: &PlSmallStr) -> Series { self.with_sentinel(|sent| { (sent.to_series_fn.as_ref().unwrap())(self.array.as_ref().unwrap(), name) }) } - // heap allocates a function that converts the binary array to a Series of `[ObjectChunked]` + // heap allocates a function that converts the binary array to a Series of [`ObjectChunked`] // the `name` will be the `name` of the output `Series` when this function is called (later). pub(crate) unsafe fn set_to_series_fn(&mut self) { let f = Box::new(move |arr: &FixedSizeBinaryArray, name: &PlSmallStr| { diff --git a/crates/polars-core/src/chunked_array/ops/fill_null.rs b/crates/polars-core/src/chunked_array/ops/fill_null.rs index 377b51afe134..c2bc3e35d364 100644 --- a/crates/polars-core/src/chunked_array/ops/fill_null.rs +++ b/crates/polars-core/src/chunked_array/ops/fill_null.rs @@ -233,7 +233,7 @@ fn fill_with_gather Vec>( let idx = bits_to_idx(validity); - Ok(unsafe { s.take_unchecked_from_slice(&idx) }) + Ok(unsafe { s.take_slice_unchecked(&idx) }) } fn fill_forward_gather(s: &Series) -> PolarsResult { diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs index 625da8881117..7e45b6ad11ff 100644 --- a/crates/polars-core/src/chunked_array/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/struct_/mod.rs @@ -380,7 +380,7 @@ impl StructChunked { unsafe { DataFrame::new_no_checks(self.len(), columns) } } - /// Get access to one of this `[StructChunked]`'s fields + /// Get access to one of this [`StructChunked`]'s fields pub fn field_by_name(&self, name: &str) -> PolarsResult { self.fields_as_series() .into_iter() diff --git a/crates/polars-core/src/frame/column/arithmetic.rs b/crates/polars-core/src/frame/column/arithmetic.rs index 97907f3457b9..8018ee4527e6 100644 --- a/crates/polars-core/src/frame/column/arithmetic.rs +++ b/crates/polars-core/src/frame/column/arithmetic.rs @@ -1,70 +1,7 @@ use num_traits::{Num, NumCast}; -use 
polars_error::{polars_bail, PolarsResult}; +use polars_error::PolarsResult; use super::{Column, ScalarColumn, Series}; -use crate::utils::Container; - -fn output_length(a: &Column, b: &Column) -> PolarsResult { - match (a.len(), b.len()) { - // broadcasting - (1, o) | (o, 1) => Ok(o), - // equal - (a, b) if a == b => Ok(a), - // unequal - (a, b) => { - polars_bail!(InvalidOperation: "cannot do arithmetic operation on series of different lengths: got {} and {}", a, b) - }, - } -} - -fn unit_series_op PolarsResult>( - l: &Series, - r: &Series, - op: F, - length: usize, -) -> PolarsResult { - debug_assert!(l.len() <= 1); - debug_assert!(r.len() <= 1); - - op(l, r) - .map(|s| ScalarColumn::from_single_value_series(s, length)) - .map(Column::from) -} - -fn op_with_broadcast PolarsResult>( - l: &Column, - r: &Column, - op: F, -) -> PolarsResult { - // Here we rely on the underlying broadcast operations. - - let length = output_length(l, r)?; - match (l, r) { - (Column::Series(l), Column::Scalar(r)) => { - let r = r.as_single_value_series(); - if l.len() == 1 { - unit_series_op(l, &r, op, length) - } else { - op(l, &r).map(Column::from) - } - }, - (Column::Scalar(l), Column::Series(r)) => { - let l = l.as_single_value_series(); - if r.len() == 1 { - unit_series_op(&l, r, op, length) - } else { - op(&l, r).map(Column::from) - } - }, - (Column::Scalar(l), Column::Scalar(r)) => unit_series_op( - &l.as_single_value_series(), - &r.as_single_value_series(), - op, - length, - ), - (l, r) => op(l.as_materialized_series(), r.as_materialized_series()).map(Column::from), - } -} fn num_op_with_broadcast Series>( c: &'_ Column, @@ -90,7 +27,7 @@ macro_rules! broadcastable_ops { #[inline] fn $op(self, rhs: Self) -> Self::Output { - op_with_broadcast(&self, &rhs, |l, r| l.$op(r)) + self.try_apply_broadcasting_binary_elementwise(&rhs, |l, r| l.$op(r)) } } @@ -99,7 +36,7 @@ macro_rules! 
broadcastable_ops { #[inline] fn $op(self, rhs: Self) -> Self::Output { - op_with_broadcast(self, rhs, |l, r| l.$op(r)) + self.try_apply_broadcasting_binary_elementwise(rhs, |l, r| l.$op(r)) } } )+ diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index d21ec3ac8536..d2eec86c1b15 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -531,9 +531,38 @@ impl Column { match self { Self::Series(s) => unsafe { s.take_unchecked(indices) }.into(), Self::Partitioned(s) => { - unsafe { s.as_materialized_series().take_unchecked(indices) }.into() + let s = s.as_materialized_series(); + unsafe { s.take_unchecked(indices) }.into() + }, + Self::Scalar(s) => { + let idxs_length = indices.len(); + let idxs_null_count = indices.null_count(); + + let scalar = ScalarColumn::from_single_value_series( + s.as_single_value_series().take_unchecked(&IdxCa::new( + indices.name().clone(), + &[0][..s.len().min(1)], + )), + idxs_length, + ); + + // We need to make sure that null values in `idx` become null values in the result + if idxs_null_count == 0 { + scalar.into_column() + } else if idxs_null_count == idxs_length { + scalar.into_nulls().into_column() + } else { + let validity = indices.rechunk_validity(); + let series = scalar.take_materialized_series(); + let name = series.name().clone(); + let dtype = series.dtype().clone(); + let mut chunks = series.into_chunks(); + assert_eq!(chunks.len(), 1); + chunks[0] = chunks[0].with_validity(validity); + unsafe { Series::from_chunks_and_dtype_unchecked(name, chunks, &dtype) } + .into_column() + } }, - Self::Scalar(s) => s.resize(indices.len()).into(), } } /// # Safety @@ -543,13 +572,17 @@ impl Column { debug_assert!(check_bounds(indices, self.len() as IdxSize).is_ok()); match self { - Self::Series(s) => unsafe { s.take_unchecked_from_slice(indices) }.into(), - Self::Partitioned(s) => unsafe { - s.as_materialized_series() - 
.take_unchecked_from_slice(indices) - } + Self::Series(s) => unsafe { s.take_slice_unchecked(indices) }.into(), + Self::Partitioned(s) => { + let s = s.as_materialized_series(); + unsafe { s.take_slice_unchecked(indices) }.into() + }, + Self::Scalar(s) => ScalarColumn::from_single_value_series( + s.as_single_value_series() + .take_slice_unchecked(&[0][..s.len().min(1)]), + indices.len(), + ) .into(), - Self::Scalar(s) => s.resize(indices.len()).into(), } } @@ -670,15 +703,22 @@ impl Column { unsafe { self.as_materialized_series().agg_list(groups) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] + pub fn agg_valid_count(&self, groups: &GroupsProxy) -> Self { + // @partition-opt + // @scalar-opt + unsafe { self.as_materialized_series().agg_valid_count(groups) }.into() + } + pub fn full_null(name: PlSmallStr, size: usize, dtype: &DataType) -> Self { - Series::full_null(name, size, dtype).into() - // @TODO: This causes failures - // Self::new_scalar(name, Scalar::new(dtype.clone(), AnyValue::Null), size) + Self::new_scalar(name, Scalar::new(dtype.clone(), AnyValue::Null), size) } pub fn is_empty(&self) -> bool { - // @scalar-opt - self.as_materialized_series().is_empty() + self.len() == 0 } pub fn reverse(&self) -> Column { @@ -689,16 +729,16 @@ impl Column { } } - pub fn equals(&self, right: &Column) -> bool { + pub fn equals(&self, other: &Column) -> bool { // @scalar-opt self.as_materialized_series() - .equals(right.as_materialized_series()) + .equals(other.as_materialized_series()) } - pub fn equals_missing(&self, right: &Column) -> bool { + pub fn equals_missing(&self, other: &Column) -> bool { // @scalar-opt self.as_materialized_series() - .equals_missing(right.as_materialized_series()) + .equals_missing(other.as_materialized_series()) } pub fn set_sorted_flag(&mut self, sorted: IsSorted) { @@ -730,11 +770,6 @@ impl Column { } } - pub fn get_data_ptr(&self) -> usize { - // @scalar-opt - 
self.as_materialized_series().get_data_ptr() - } - pub fn vec_hash(&self, build_hasher: PlRandomState, buf: &mut Vec) -> PolarsResult<()> { // @scalar-opt? self.as_materialized_series().vec_hash(build_hasher, buf) @@ -772,13 +807,6 @@ impl Column { unsafe { DataFrame::new_no_checks(self.len(), vec![self]) } } - pub fn unique_stable(&self) -> PolarsResult { - // @scalar-opt? - self.as_materialized_series() - .unique_stable() - .map(Column::from) - } - pub fn extend(&mut self, other: &Column) -> PolarsResult<&mut Self> { // @scalar-opt self.into_materialized_series() @@ -795,9 +823,11 @@ impl Column { } pub fn explode(&self) -> PolarsResult { - // @scalar-opt self.as_materialized_series().explode().map(Column::from) } + pub fn implode(&self) -> PolarsResult { + self.as_materialized_series().implode() + } pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { // @scalar-opt @@ -839,8 +869,12 @@ impl Column { } pub fn drop_nulls(&self) -> Column { - // @scalar-opt - self.as_materialized_series().drop_nulls().into() + match self { + Column::Series(s) => s.drop_nulls().into_column(), + // @partition-opt + Column::Partitioned(s) => s.as_materialized_series().drop_nulls().into_column(), + Column::Scalar(s) => s.drop_nulls().into_column(), + } } pub fn is_sorted_flag(&self) -> IsSorted { @@ -849,8 +883,34 @@ impl Column { } pub fn unique(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().unique().map(Column::from) + match self { + Column::Series(s) => s.unique().map(Column::from), + // @partition-opt + Column::Partitioned(s) => s.as_materialized_series().unique().map(Column::from), + Column::Scalar(s) => { + _ = s.as_single_value_series().unique()?; + if s.is_empty() { + return Ok(s.clone().into_column()); + } + + Ok(s.resize(1).into_column()) + }, + } + } + pub fn unique_stable(&self) -> PolarsResult { + match self { + Column::Series(s) => s.unique_stable().map(Column::from), + // @partition-opt + Column::Partitioned(s) => 
s.as_materialized_series().unique_stable().map(Column::from), + Column::Scalar(s) => { + _ = s.as_single_value_series().unique_stable()?; + if s.is_empty() { + return Ok(s.clone().into_column()); + } + + Ok(s.resize(1).into_column()) + }, + } } pub fn reshape_list(&self, dimensions: &[ReshapeDimension]) -> PolarsResult { @@ -875,9 +935,26 @@ impl Column { .map(Self::from) } - pub fn filter(&self, filter: &ChunkedArray) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().filter(filter).map(Self::from) + pub fn filter(&self, filter: &BooleanChunked) -> PolarsResult { + match self { + Column::Series(s) => s.filter(filter).map(Column::from), + Column::Partitioned(s) => s.as_materialized_series().filter(filter).map(Column::from), + Column::Scalar(s) => { + if s.is_empty() { + return Ok(s.clone().into_column()); + } + + // Broadcasting + if filter.len() == 1 { + return match filter.get(0) { + Some(true) => Ok(s.clone().into_column()), + _ => Ok(s.resize(0).into_column()), + }; + } + + Ok(s.resize(filter.sum().unwrap() as usize).into_column()) + }, + } } #[cfg(feature = "random")] @@ -949,23 +1026,16 @@ impl Column { } pub fn is_finite(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_finite() + self.try_map_unary_elementwise_to_bool(|s| s.is_finite()) } - pub fn is_infinite(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_infinite() + self.try_map_unary_elementwise_to_bool(|s| s.is_infinite()) } - pub fn is_nan(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_nan() + self.try_map_unary_elementwise_to_bool(|s| s.is_nan()) } - pub fn is_not_nan(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_not_nan() + self.try_map_unary_elementwise_to_bool(|s| s.is_not_nan()) } pub fn wrapping_trunc_div_scalar(&self, rhs: T) -> Self @@ -1018,15 +1088,55 @@ impl Column { } pub fn bitand(&self, rhs: &Self) -> PolarsResult { - self.as_materialized_series() - 
.bitand(rhs.as_materialized_series()) - .map(Column::from) + // @partition-opt + // @scalar-opt + (self.as_materialized_series() & rhs.as_materialized_series()).map(Column::from) + } + pub fn bitor(&self, rhs: &Self) -> PolarsResult { + // @partition-opt + // @scalar-opt + (self.as_materialized_series() | rhs.as_materialized_series()).map(Column::from) + } + pub fn bitxor(&self, rhs: &Self) -> PolarsResult { + // @partition-opt + // @scalar-opt + (self.as_materialized_series() ^ rhs.as_materialized_series()).map(Column::from) + } + + pub fn try_add_owned(self, other: Self) -> PolarsResult { + match (self, other) { + (Column::Series(lhs), Column::Series(rhs)) => lhs.try_add_owned(rhs).map(Column::from), + (lhs, rhs) => lhs + rhs, + } + } + pub fn try_sub_owned(self, other: Self) -> PolarsResult { + match (self, other) { + (Column::Series(lhs), Column::Series(rhs)) => lhs.try_sub_owned(rhs).map(Column::from), + (lhs, rhs) => lhs - rhs, + } + } + pub fn try_mul_owned(self, other: Self) -> PolarsResult { + match (self, other) { + (Column::Series(lhs), Column::Series(rhs)) => lhs.try_mul_owned(rhs).map(Column::from), + (lhs, rhs) => lhs * rhs, + } } pub(crate) fn str_value(&self, index: usize) -> PolarsResult> { Ok(self.get(index)?.str_value()) } + pub fn min_reduce(&self) -> PolarsResult { + match self { + Column::Series(s) => s.min_reduce(), + Column::Partitioned(s) => s.min_reduce(), + Column::Scalar(s) => { + // We don't really want to deal with handling the full semantics here so we just + // cast to a single value series. This is a tiny bit wasteful, but probably fine. 
+ s.as_single_value_series().min_reduce() + }, + } + } pub fn max_reduce(&self) -> PolarsResult { match self { Column::Series(s) => s.max_reduce(), @@ -1038,25 +1148,108 @@ impl Column { }, } } - - pub fn min_reduce(&self) -> PolarsResult { + pub fn median_reduce(&self) -> PolarsResult { match self { - Column::Series(s) => s.min_reduce(), - Column::Partitioned(s) => s.min_reduce(), + Column::Series(s) => s.median_reduce(), + Column::Partitioned(s) => s.as_materialized_series().median_reduce(), Column::Scalar(s) => { // We don't really want to deal with handling the full semantics here so we just // cast to a single value series. This is a tiny bit wasteful, but probably fine. - s.as_single_value_series().min_reduce() + s.as_single_value_series().median_reduce() + }, + } + } + pub fn mean_reduce(&self) -> Scalar { + match self { + Column::Series(s) => s.mean_reduce(), + Column::Partitioned(s) => s.as_materialized_series().mean_reduce(), + Column::Scalar(s) => { + // We don't really want to deal with handling the full semantics here so we just + // cast to a single value series. This is a tiny bit wasteful, but probably fine. + s.as_single_value_series().mean_reduce() + }, + } + } + pub fn std_reduce(&self, ddof: u8) -> PolarsResult { + match self { + Column::Series(s) => s.std_reduce(ddof), + Column::Partitioned(s) => s.as_materialized_series().std_reduce(ddof), + Column::Scalar(s) => { + // We don't really want to deal with handling the full semantics here so we just + // cast to a single value series. This is a tiny bit wasteful, but probably fine. + s.as_single_value_series().std_reduce(ddof) + }, + } + } + pub fn var_reduce(&self, ddof: u8) -> PolarsResult { + match self { + Column::Series(s) => s.var_reduce(ddof), + Column::Partitioned(s) => s.as_materialized_series().var_reduce(ddof), + Column::Scalar(s) => { + // We don't really want to deal with handling the full semantics here so we just + // cast to a single value series. 
This is a tiny bit wasteful, but probably fine. + s.as_single_value_series().var_reduce(ddof) + }, + } + } + pub fn sum_reduce(&self) -> PolarsResult { + // @partition-opt + // @scalar-opt + self.as_materialized_series().sum_reduce() + } + pub fn and_reduce(&self) -> PolarsResult { + match self { + Column::Series(s) => s.and_reduce(), + Column::Partitioned(s) => s.and_reduce(), + Column::Scalar(s) => { + // We don't really want to deal with handling the full semantics here so we just + // cast to a single value series. This is a tiny bit wasteful, but probably fine. + s.as_single_value_series().and_reduce() }, } } + pub fn or_reduce(&self) -> PolarsResult { + match self { + Column::Series(s) => s.or_reduce(), + Column::Partitioned(s) => s.or_reduce(), + Column::Scalar(s) => { + // We don't really want to deal with handling the full semantics here so we just + // cast to a single value series. This is a tiny bit wasteful, but probably fine. + s.as_single_value_series().or_reduce() + }, + } + } + pub fn xor_reduce(&self) -> PolarsResult { + match self { + Column::Series(s) => s.xor_reduce(), + // @partition-opt + Column::Partitioned(s) => s.as_materialized_series().xor_reduce(), + Column::Scalar(s) => { + // We don't really want to deal with handling the full semantics here so we just + // cast to a single value series. This is a tiny bit wasteful, but probably fine. 
+ s.as_single_value_series().xor_reduce() + }, + } + } + pub fn n_unique(&self) -> PolarsResult { + match self { + Column::Series(s) => s.n_unique(), + Column::Partitioned(s) => s.partitions().n_unique(), + // @scalar-opt + Column::Scalar(s) => s.as_single_value_series().n_unique(), + } + } + pub fn quantile_reduce(&self, quantile: f64, method: QuantileMethod) -> PolarsResult { + self.as_materialized_series() + .quantile_reduce(quantile, method) + } pub(crate) fn estimated_size(&self) -> usize { // @scalar-opt self.as_materialized_series().estimated_size() } - pub(crate) fn sort_with(&self, options: SortOptions) -> PolarsResult { + pub fn sort_with(&self, options: SortOptions) -> PolarsResult { match self { Column::Series(s) => s.sort_with(options).map(Self::from), // @partition-opt @@ -1073,17 +1266,27 @@ impl Column { } } - pub fn apply_unary_elementwise(&self, f: impl Fn(&Series) -> Series) -> Column { + pub fn map_unary_elementwise_to_bool( + &self, + f: impl Fn(&Series) -> BooleanChunked, + ) -> BooleanChunked { + self.try_map_unary_elementwise_to_bool(|s| Ok(f(s))) + .unwrap() + } + pub fn try_map_unary_elementwise_to_bool( + &self, + f: impl Fn(&Series) -> PolarsResult, + ) -> PolarsResult { match self { - Column::Series(s) => f(s).into(), - Column::Partitioned(s) => s.apply_unary_elementwise(f).into(), - Column::Scalar(s) => { - ScalarColumn::from_single_value_series(f(&s.as_single_value_series()), s.len()) - .into() - }, + Column::Series(s) => f(s), + Column::Partitioned(s) => f(s.as_materialized_series()), + Column::Scalar(s) => Ok(f(&s.as_single_value_series())?.new_from_index(0, s.len())), } } + pub fn apply_unary_elementwise(&self, f: impl Fn(&Series) -> Series) -> Column { + self.try_apply_unary_elementwise(|s| Ok(f(s))).unwrap() + } pub fn try_apply_unary_elementwise( &self, f: impl Fn(&Series) -> PolarsResult, @@ -1099,6 +1302,98 @@ impl Column { } } + pub fn apply_broadcasting_binary_elementwise( + &self, + other: &Self, + op: impl Fn(&Series, 
&Series) -> Series, + ) -> PolarsResult { + self.try_apply_broadcasting_binary_elementwise(other, |lhs, rhs| Ok(op(lhs, rhs))) + } + pub fn try_apply_broadcasting_binary_elementwise( + &self, + other: &Self, + op: impl Fn(&Series, &Series) -> PolarsResult, + ) -> PolarsResult { + fn output_length(a: &Column, b: &Column) -> PolarsResult { + match (a.len(), b.len()) { + // broadcasting + (1, o) | (o, 1) => Ok(o), + // equal + (a, b) if a == b => Ok(a), + // unequal + (a, b) => { + polars_bail!(InvalidOperation: "cannot do a binary operation on columns of different lengths: got {} and {}", a, b) + }, + } + } + + // Here we rely on the underlying broadcast operations. + let length = output_length(self, other)?; + match (self, other) { + (Column::Series(lhs), Column::Series(rhs)) => op(lhs, rhs).map(Column::from), + (Column::Series(lhs), Column::Scalar(rhs)) => { + op(lhs, &rhs.as_single_value_series()).map(Column::from) + }, + (Column::Scalar(lhs), Column::Series(rhs)) => { + op(&lhs.as_single_value_series(), rhs).map(Column::from) + }, + (Column::Scalar(lhs), Column::Scalar(rhs)) => { + let lhs = lhs.as_single_value_series(); + let rhs = rhs.as_single_value_series(); + + Ok(ScalarColumn::from_single_value_series(op(&lhs, &rhs)?, length).into_column()) + }, + // @partition-opt + (lhs, rhs) => { + op(lhs.as_materialized_series(), rhs.as_materialized_series()).map(Column::from) + }, + } + } + + pub fn apply_binary_elementwise( + &self, + other: &Self, + f: impl Fn(&Series, &Series) -> Series, + f_lb: impl Fn(&Scalar, &Series) -> Series, + f_rb: impl Fn(&Series, &Scalar) -> Series, + ) -> Column { + self.try_apply_binary_elementwise( + other, + |lhs, rhs| Ok(f(lhs, rhs)), + |lhs, rhs| Ok(f_lb(lhs, rhs)), + |lhs, rhs| Ok(f_rb(lhs, rhs)), + ) + .unwrap() + } + pub fn try_apply_binary_elementwise( + &self, + other: &Self, + f: impl Fn(&Series, &Series) -> PolarsResult, + f_lb: impl Fn(&Scalar, &Series) -> PolarsResult, + f_rb: impl Fn(&Series, &Scalar) -> PolarsResult, + ) 
-> PolarsResult { + debug_assert_eq!(self.len(), other.len()); + + match (self, other) { + (Column::Series(lhs), Column::Series(rhs)) => f(lhs, rhs).map(Column::from), + (Column::Series(lhs), Column::Scalar(rhs)) => f_rb(lhs, rhs.scalar()).map(Column::from), + (Column::Scalar(lhs), Column::Series(rhs)) => f_lb(lhs.scalar(), rhs).map(Column::from), + (Column::Scalar(lhs), Column::Scalar(rhs)) => { + let lhs = lhs.as_single_value_series(); + let rhs = rhs.as_single_value_series(); + + Ok( + ScalarColumn::from_single_value_series(f(&lhs, &rhs)?, self.len()) + .into_column(), + ) + }, + // @partition-opt + (lhs, rhs) => { + f(lhs.as_materialized_series(), rhs.as_materialized_series()).map(Column::from) + }, + } + } + #[cfg(feature = "approx_unique")] pub fn approx_n_unique(&self) -> PolarsResult { match self { diff --git a/crates/polars-core/src/frame/column/partitioned.rs b/crates/polars-core/src/frame/column/partitioned.rs index a22e697290ec..16d4e9538634 100644 --- a/crates/polars-core/src/frame/column/partitioned.rs +++ b/crates/polars-core/src/frame/column/partitioned.rs @@ -274,4 +274,19 @@ impl PartitionedColumn { pub fn clear(&self) -> Self { Self::new_empty(self.name.clone(), self.values.dtype().clone()) } + + pub fn partitions(&self) -> &Series { + &self.values + } + pub fn partition_ends(&self) -> &[IdxSize] { + &self.ends + } + + pub fn or_reduce(&self) -> PolarsResult { + self.values.or_reduce() + } + + pub fn and_reduce(&self) -> PolarsResult { + self.values.and_reduce() + } } diff --git a/crates/polars-core/src/frame/column/scalar.rs b/crates/polars-core/src/frame/column/scalar.rs index 18e53c469960..e3d8105362c4 100644 --- a/crates/polars-core/src/frame/column/scalar.rs +++ b/crates/polars-core/src/frame/column/scalar.rs @@ -137,9 +137,10 @@ impl ScalarColumn { /// /// This will panic if the value cannot be made static or if the series has length `0`. 
pub fn from_single_value_series(series: Series, length: usize) -> Self { - debug_assert_eq!(series.len(), 1); - let value = series.get(0).unwrap(); - let value = value.into_static(); + debug_assert!(series.len() <= 1); + debug_assert!(!series.is_empty() || length == 0); + + let value = series.get(0).map_or(AnyValue::Null, |av| av.into_static()); let value = Scalar::new(series.dtype().clone(), value); ScalarColumn::new(series.name().clone(), value, length) } @@ -270,6 +271,19 @@ impl ScalarColumn { pub fn has_nulls(&self) -> bool { self.length != 0 && self.scalar.is_null() } + + pub fn drop_nulls(&self) -> Self { + if self.scalar.is_null() { + self.resize(0) + } else { + self.clone() + } + } + + pub fn into_nulls(mut self) -> Self { + self.scalar.update(AnyValue::Null); + self + } } impl IntoColumn for ScalarColumn { diff --git a/crates/polars-core/src/frame/explode.rs b/crates/polars-core/src/frame/explode.rs index c12086def533..bedb8eccb060 100644 --- a/crates/polars-core/src/frame/explode.rs +++ b/crates/polars-core/src/frame/explode.rs @@ -18,7 +18,7 @@ fn get_exploded(series: &Series) -> PolarsResult<(Series, OffsetsBuffer)> { } } -/// Arguments for `[DataFrame::unpivot]` function +/// Arguments for `LazyFrame::unpivot` function #[derive(Clone, Default, Debug, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct UnpivotArgsIR { diff --git a/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs b/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs index aaf24a470969..8f01ce3f291a 100644 --- a/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs +++ b/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs @@ -15,7 +15,7 @@ impl Series { } #[doc(hidden)] - pub fn agg_valid_count(&self, groups: &GroupsProxy) -> Series { + pub unsafe fn agg_valid_count(&self, groups: &GroupsProxy) -> Series { // Prevent a rechunk for every individual group. 
let s = if groups.len() > 1 && self.null_count() > 0 { self.rechunk() diff --git a/crates/polars-core/src/frame/group_by/mod.rs b/crates/polars-core/src/frame/group_by/mod.rs index 9dee1e1f411a..17a36dc4ddfd 100644 --- a/crates/polars-core/src/frame/group_by/mod.rs +++ b/crates/polars-core/src/frame/group_by/mod.rs @@ -233,7 +233,7 @@ impl<'df> GroupBy<'df> { /// Where second value in the tuple is a vector with all matching indexes. /// /// # Safety - /// Groups should always be in bounds of the `DataFrame` hold by this `[GroupBy]`. + /// Groups should always be in bounds of the `DataFrame` hold by this [`GroupBy`]. /// If you mutate it, you must hold that invariant. pub unsafe fn get_groups_mut(&mut self) -> &mut GroupsProxy { &mut self.groups diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index ab74c7c1d2d1..aa434fb07df7 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -206,48 +206,29 @@ impl DataFrame { } // Reduce monomorphization. - pub fn _apply_columns(&self, func: &(dyn Fn(&Series) -> Series)) -> Vec { - self.materialized_column_iter() - .map(func) - .map(Column::from) - .collect() + fn try_apply_columns( + &self, + func: &(dyn Fn(&Column) -> PolarsResult + Send + Sync), + ) -> PolarsResult> { + self.columns.iter().map(func).collect() } - // Reduce monomorphization. - pub fn _apply_columns_par( - &self, - func: &(dyn Fn(&Series) -> Series + Send + Sync), - ) -> Vec { - POOL.install(|| { - self.par_materialized_column_iter() - .map(func) - .map(Column::from) - .collect() - }) + pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec { + self.columns.iter().map(func).collect() } - // Reduce monomorphization. 
fn try_apply_columns_par( &self, - func: &(dyn Fn(&Series) -> PolarsResult + Send + Sync), + func: &(dyn Fn(&Column) -> PolarsResult + Send + Sync), ) -> PolarsResult> { - POOL.install(|| { - self.par_materialized_column_iter() - .map(func) - .map(|s| s.map(Column::from)) - .collect() - }) + POOL.install(|| self.columns.par_iter().map(func).collect()) } - // Reduce monomorphization. - fn try_apply_columns( + pub fn _apply_columns_par( &self, - func: &(dyn Fn(&Series) -> PolarsResult + Send + Sync), - ) -> PolarsResult> { - self.materialized_column_iter() - .map(func) - .map(|s| s.map(Column::from)) - .collect() + func: &(dyn Fn(&Column) -> Column + Send + Sync), + ) -> Vec { + POOL.install(|| self.columns.par_iter().map(func).collect()) } /// Get the index of the column. @@ -565,13 +546,7 @@ impl DataFrame { /// Aggregate all the chunks in the DataFrame to a single chunk in parallel. /// This may lead to more peak memory consumption. pub fn as_single_chunk_par(&mut self) -> &mut Self { - if self.columns.iter().any(|c| { - if let Column::Series(s) = c { - s.n_chunks() > 1 - } else { - false - } - }) { + if self.columns.iter().any(|c| c.n_chunks() > 1) { self.columns = self._apply_columns_par(&|s| s.rechunk()); } self @@ -1896,12 +1871,9 @@ impl DataFrame { /// The indices must be in-bounds. 
pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self { let cols = if allow_threads { - POOL.install(|| self._apply_columns_par(&|s| s.take_unchecked(idx))) + POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx))) } else { - self.materialized_column_iter() - .map(|s| s.take_unchecked(idx)) - .map(Column::from) - .collect() + self._apply_columns(&|s| s.take_unchecked(idx)) }; unsafe { DataFrame::new_no_checks(idx.len(), cols) } } @@ -1914,10 +1886,7 @@ impl DataFrame { let cols = if allow_threads { POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx))) } else { - self.materialized_column_iter() - .map(|s| s.take_slice_unchecked(idx)) - .map(Column::from) - .collect() + self._apply_columns(&|s| s.take_slice_unchecked(idx)) }; unsafe { DataFrame::new_no_checks(idx.len(), cols) } } @@ -2567,7 +2536,6 @@ impl DataFrame { if offset == 0 && length == self.height() { return self.clone(); } - // @scalar-opt let columns = self._apply_columns_par(&|s| s.slice(offset, length)); unsafe { DataFrame::new_no_checks(length, columns) } } diff --git a/crates/polars-core/src/scalar/from.rs b/crates/polars-core/src/scalar/from.rs index 35345b2a6527..3af8671dadd1 100644 --- a/crates/polars-core/src/scalar/from.rs +++ b/crates/polars-core/src/scalar/from.rs @@ -14,6 +14,7 @@ macro_rules! impl_from { } impl_from! { + (bool, Boolean, Boolean) (i8, Int8, Int8) (i16, Int16, Int16) (i32, Int32, Int32) diff --git a/crates/polars-core/src/series/amortized_iter.rs b/crates/polars-core/src/series/amortized_iter.rs index e56a950578e0..167fbf82612c 100644 --- a/crates/polars-core/src/series/amortized_iter.rs +++ b/crates/polars-core/src/series/amortized_iter.rs @@ -3,7 +3,7 @@ use std::rc::Rc; use crate::prelude::*; -/// A `[Series]` that amortizes a few allocations during iteration. +/// A [`Series`] that amortizes a few allocations during iteration. 
#[derive(Clone)] pub struct AmortSeries { container: Rc, @@ -31,7 +31,7 @@ impl AmortSeries { } } - /// Creates a new `[UnsafeSeries]` + /// Creates a new [`UnsafeSeries`] /// /// # Safety /// Inner chunks must be from `Series` otherwise the dtype may be incorrect and lead to UB. diff --git a/crates/polars-core/src/series/arithmetic/bitops.rs b/crates/polars-core/src/series/arithmetic/bitops.rs new file mode 100644 index 000000000000..cd00e8de18db --- /dev/null +++ b/crates/polars-core/src/series/arithmetic/bitops.rs @@ -0,0 +1,65 @@ +use std::borrow::Cow; + +use polars_error::PolarsResult; + +use super::{polars_bail, BooleanChunked, ChunkedArray, DataType, IntoSeries, Series}; + +macro_rules! impl_bitop { + ($(($trait:ident, $f:ident))+) => { + $( + impl std::ops::$trait for &Series { + type Output = PolarsResult; + fn $f(self, rhs: Self) -> Self::Output { + use DataType as DT; + match self.dtype() { + DT::Boolean => { + let lhs: &BooleanChunked = self.as_ref().as_ref().as_ref(); + let rhs = lhs.unpack_series_matching_type(rhs)?; + Ok(lhs.$f(rhs).into_series()) + }, + dt if dt.is_integer() => with_match_physical_integer_polars_type!(dt, |$T| { + let lhs: &ChunkedArray<$T> = self.as_ref().as_ref().as_ref(); + + let rhs = if rhs.len() == 1 { + Cow::Owned(rhs.cast(self.dtype())?) 
+ } else { + Cow::Borrowed(rhs) + }; + + let rhs = lhs.unpack_series_matching_type(&rhs)?; + Ok(lhs.$f(&rhs).into_series()) + }), + _ => polars_bail!(opq = $f, self.dtype()), + } + } + } + impl std::ops::$trait for Series { + type Output = PolarsResult; + #[inline(always)] + fn $f(self, rhs: Self) -> Self::Output { + <&Series as std::ops::$trait>::$f(&self, &rhs) + } + } + impl std::ops::$trait<&Series> for Series { + type Output = PolarsResult; + #[inline(always)] + fn $f(self, rhs: &Series) -> Self::Output { + <&Series as std::ops::$trait>::$f(&self, rhs) + } + } + impl std::ops::$trait for &Series { + type Output = PolarsResult; + #[inline(always)] + fn $f(self, rhs: Series) -> Self::Output { + <&Series as std::ops::$trait>::$f(self, &rhs) + } + } + )+ + }; +} + +impl_bitop! { + (BitAnd, bitand) + (BitOr, bitor) + (BitXor, bitxor) +} diff --git a/crates/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs index f9e5ff42139b..01065c66c1d1 100644 --- a/crates/polars-core/src/series/arithmetic/borrowed.rs +++ b/crates/polars-core/src/series/arithmetic/borrowed.rs @@ -51,6 +51,18 @@ where ChunkedArray: IntoSeries, { fn subtract(lhs: &ChunkedArray, rhs: &Series) -> PolarsResult { + #[cfg(feature = "dtype-array")] + if let Some(rhs) = rhs.try_array() { + return rhs.arithm_helper_scalar_lhs(lhs.clone().into_series(), &|l, r| l.subtract(&r)); + } + + polars_ensure!( + lhs.dtype() == rhs.dtype(), + opq = add, + rhs.dtype(), + rhs.dtype() + ); + // SAFETY: // There will be UB if a ChunkedArray is alive with the wrong datatype. // we now only create the potentially wrong dtype for a short time. 
@@ -61,6 +73,18 @@ where Ok(out.into_series()) } fn add_to(lhs: &ChunkedArray, rhs: &Series) -> PolarsResult { + #[cfg(feature = "dtype-array")] + if let Some(rhs) = rhs.try_array() { + return rhs.arithm_helper_scalar_lhs(lhs.clone().into_series(), &|l, r| l.add_to(&r)); + } + + polars_ensure!( + lhs.dtype() == rhs.dtype(), + opq = add, + rhs.dtype(), + rhs.dtype() + ); + // SAFETY: // see subtract let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; @@ -68,6 +92,18 @@ where Ok(out.into_series()) } fn multiply(lhs: &ChunkedArray, rhs: &Series) -> PolarsResult { + #[cfg(feature = "dtype-array")] + if let Some(rhs) = rhs.try_array() { + return rhs.arithm_helper_scalar_lhs(lhs.clone().into_series(), &|l, r| l.multiply(&r)); + } + + polars_ensure!( + lhs.dtype() == rhs.dtype(), + opq = add, + rhs.dtype(), + rhs.dtype() + ); + // SAFETY: // see subtract let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; @@ -75,6 +111,18 @@ where Ok(out.into_series()) } fn divide(lhs: &ChunkedArray, rhs: &Series) -> PolarsResult { + #[cfg(feature = "dtype-array")] + if let Some(rhs) = rhs.try_array() { + return rhs.arithm_helper_scalar_lhs(lhs.clone().into_series(), &|l, r| l.divide(&r)); + } + + polars_ensure!( + lhs.dtype() == rhs.dtype(), + opq = add, + rhs.dtype(), + rhs.dtype() + ); + // SAFETY: // see subtract let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; @@ -82,6 +130,19 @@ where Ok(out.into_series()) } fn remainder(lhs: &ChunkedArray, rhs: &Series) -> PolarsResult { + #[cfg(feature = "dtype-array")] + if let Some(rhs) = rhs.try_array() { + return rhs + .arithm_helper_scalar_lhs(lhs.clone().into_series(), &|l, r| l.remainder(&r)); + } + + polars_ensure!( + lhs.dtype() == rhs.dtype(), + opq = add, + rhs.dtype(), + rhs.dtype() + ); + // SAFETY: // see subtract let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; @@ -114,24 +175,6 @@ impl NumOpsDispatchInner for BooleanType { } } -#[cfg(feature = "dtype-array")] -fn 
array_shape(dt: &DataType, infer: bool) -> Vec { - fn inner(dt: &DataType, buf: &mut Vec) { - if let DataType::Array(_, size) = dt { - buf.push(ReshapeDimension::Specified( - Dimension::try_from(*size as i64).unwrap(), - )) - } - } - - let mut buf = vec![]; - if infer { - buf.push(ReshapeDimension::Infer) - } - inner(dt, &mut buf); - buf -} - #[cfg(feature = "dtype-array")] fn broadcast_array(lhs: &ArrayChunked, rhs: &Series) -> PolarsResult<(ArrayChunked, Series)> { let out = match (lhs.len(), rhs.len()) { @@ -165,19 +208,56 @@ impl ArrayChunked { ) -> PolarsResult { let (lhs, rhs) = broadcast_array(self, rhs)?; - let l_leaf_array = lhs.clone().into_series().get_leaf_array(); - let shape = array_shape(lhs.dtype(), true); + polars_ensure!( + lhs.dtype() == rhs.dtype() - let r_leaf_array = if rhs.dtype().is_numeric() && rhs.len() == 1 { - rhs.clone() - } else { - polars_ensure!(lhs.dtype() == rhs.dtype(), InvalidOperation: "can only do arithmetic of arrays of the same type and shape; got {} and {}", self.dtype(), rhs.dtype()); - rhs.get_leaf_array() - }; + // @NOTE: we allow the arithmetic operations with a scalar of the leaf array + || rhs.dtype().is_numeric() && rhs.len() == 1, + InvalidOperation: "can only do arithmetic of arrays of the same type and shape; got {} and {}", + lhs.dtype(), rhs.dtype() + ); + + let l_leaf_array = lhs.get_leaf_array(); + let r_leaf_array = rhs.get_leaf_array(); + + let mut dt = lhs.dtype(); + let mut shape = vec![ReshapeDimension::Specified( + Dimension::new(lhs.len() as u64), + )]; + while let DataType::Array(child, size) = dt { + shape.push(ReshapeDimension::Specified(Dimension::new(*size as u64))); + dt = child; + } let out = op(l_leaf_array, r_leaf_array)?; out.reshape_array(&shape) } + + fn arithm_helper_scalar_lhs( + &self, + lhs: Series, + op: &dyn Fn(Series, Series) -> PolarsResult, + ) -> PolarsResult { + polars_ensure!( + lhs.len() == 1, + InvalidOperation: "can only do arithmetic of between arrays and a scalar the leaf 
type; got {} and {}", + lhs.dtype(), self.dtype() + ); + + let r_leaf_array = self.get_leaf_array(); + let out = op(lhs, r_leaf_array)?; + + let mut dt = self.dtype(); + let mut shape = vec![ReshapeDimension::Specified(Dimension::new( + self.len() as u64 + ))]; + while let DataType::Array(child, size) = dt { + shape.push(ReshapeDimension::Specified(Dimension::new(*size as u64))); + dt = child; + } + + out.reshape_array(&shape) + } } #[cfg(feature = "dtype-array")] diff --git a/crates/polars-core/src/series/arithmetic/mod.rs b/crates/polars-core/src/series/arithmetic/mod.rs index 0a5550b7b0f3..713bd4fbece3 100644 --- a/crates/polars-core/src/series/arithmetic/mod.rs +++ b/crates/polars-core/src/series/arithmetic/mod.rs @@ -1,3 +1,4 @@ +mod bitops; mod borrowed; mod list_borrowed; mod owned; diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index 83bbacd12c00..b4cd48295c4c 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -123,21 +123,6 @@ impl SeriesTrait for SeriesWrap { Some(self.0.boxed_metadata_dyn()) } - fn bitxor(&self, other: &Series) -> PolarsResult { - let other = self.0.unpack_series_matching_type(other)?; - Ok((&self.0).bitxor(other).into_series()) - } - - fn bitand(&self, other: &Series) -> PolarsResult { - let other = self.0.unpack_series_matching_type(other)?; - Ok((&self.0).bitand(other).into_series()) - } - - fn bitor(&self, other: &Series) -> PolarsResult { - let other = self.0.unpack_series_matching_type(other)?; - Ok((&self.0).bitor(other).into_series()) - } - fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index 846e326d35b2..85c0d87cf0f1 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ 
b/crates/polars-core/src/series/implementations/floats.rs @@ -110,48 +110,18 @@ macro_rules! impl_dyn_series { } fn subtract(&self, rhs: &Series) -> PolarsResult { - polars_ensure!( - self.dtype() == rhs.dtype(), - opq = sub, - self.dtype(), - rhs.dtype() - ); NumOpsDispatch::subtract(&self.0, rhs) } fn add_to(&self, rhs: &Series) -> PolarsResult { - polars_ensure!( - self.dtype() == rhs.dtype(), - opq = add, - self.dtype(), - rhs.dtype() - ); NumOpsDispatch::add_to(&self.0, rhs) } fn multiply(&self, rhs: &Series) -> PolarsResult { - polars_ensure!( - self.dtype() == rhs.dtype(), - opq = mul, - self.dtype(), - rhs.dtype() - ); NumOpsDispatch::multiply(&self.0, rhs) } fn divide(&self, rhs: &Series) -> PolarsResult { - polars_ensure!( - self.dtype() == rhs.dtype(), - opq = div, - self.dtype(), - rhs.dtype() - ); NumOpsDispatch::divide(&self.0, rhs) } fn remainder(&self, rhs: &Series) -> PolarsResult { - polars_ensure!( - self.dtype() == rhs.dtype(), - opq = rem, - self.dtype(), - rhs.dtype() - ); NumOpsDispatch::remainder(&self.0, rhs) } #[cfg(feature = "algorithm_group_by")] diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index b2cb97e39b69..d4b9626d2bfc 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -26,7 +26,6 @@ mod time; use std::any::Any; use std::borrow::Cow; -use std::ops::{BitAnd, BitOr, BitXor}; use std::sync::RwLockReadGuard; use super::*; @@ -183,48 +182,18 @@ macro_rules! 
impl_dyn_series { } fn subtract(&self, rhs: &Series) -> PolarsResult { - polars_ensure!( - self.dtype() == rhs.dtype(), - opq = sub, - self.dtype(), - rhs.dtype() - ); NumOpsDispatch::subtract(&self.0, rhs) } fn add_to(&self, rhs: &Series) -> PolarsResult { - polars_ensure!( - self.dtype() == rhs.dtype(), - opq = add, - self.dtype(), - rhs.dtype() - ); NumOpsDispatch::add_to(&self.0, rhs) } fn multiply(&self, rhs: &Series) -> PolarsResult { - polars_ensure!( - self.dtype() == rhs.dtype(), - opq = mul, - self.dtype(), - rhs.dtype() - ); NumOpsDispatch::multiply(&self.0, rhs) } fn divide(&self, rhs: &Series) -> PolarsResult { - polars_ensure!( - self.dtype() == rhs.dtype(), - opq = div, - self.dtype(), - rhs.dtype() - ); NumOpsDispatch::divide(&self.0, rhs) } fn remainder(&self, rhs: &Series) -> PolarsResult { - polars_ensure!( - self.dtype() == rhs.dtype(), - opq = rem, - self.dtype(), - rhs.dtype() - ); NumOpsDispatch::remainder(&self.0, rhs) } #[cfg(feature = "algorithm_group_by")] @@ -259,36 +228,6 @@ macro_rules! impl_dyn_series { Some(self.0.boxed_metadata_dyn()) } - fn bitand(&self, other: &Series) -> PolarsResult { - let other = if other.len() == 1 { - Cow::Owned(other.cast(self.dtype())?) - } else { - Cow::Borrowed(other) - }; - let other = self.0.unpack_series_matching_type(&other)?; - Ok(self.0.bitand(&other).into_series()) - } - - fn bitor(&self, other: &Series) -> PolarsResult { - let other = if other.len() == 1 { - Cow::Owned(other.cast(self.dtype())?) - } else { - Cow::Borrowed(other) - }; - let other = self.0.unpack_series_matching_type(&other)?; - Ok(self.0.bitor(&other).into_series()) - } - - fn bitxor(&self, other: &Series) -> PolarsResult { - let other = if other.len() == 1 { - Cow::Owned(other.cast(self.dtype())?) 
- } else { - Cow::Borrowed(other) - }; - let other = self.0.unpack_series_matching_type(&other)?; - Ok(self.0.bitxor(&other).into_series()) - } - fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 82d50b99827b..81754abafa19 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -372,7 +372,7 @@ impl Series { self.cast_with_options(dtype, CastOptions::NonStrict) } - /// Cast `[Series]` to another `[DataType]`. + /// Cast [`Series`] to another [`DataType`]. pub fn cast_with_options(&self, dtype: &DataType, options: CastOptions) -> PolarsResult { use DataType as D; @@ -723,14 +723,6 @@ impl Series { } } - /// Take by index if ChunkedArray contains a single chunk. - /// - /// # Safety - /// This doesn't check any bounds. Null validity is checked. - pub unsafe fn take_unchecked_from_slice(&self, idx: &[IdxSize]) -> Series { - self.take_slice_unchecked(idx) - } - /// Traverse and collect every nth element in a new array. pub fn gather_every(&self, n: usize, offset: usize) -> Series { let idx = ((offset as IdxSize)..self.len() as IdxSize) diff --git a/crates/polars-core/src/series/ops/downcast.rs b/crates/polars-core/src/series/ops/downcast.rs index 2189fc319b5e..55450ee9aa2b 100644 --- a/crates/polars-core/src/series/ops/downcast.rs +++ b/crates/polars-core/src/series/ops/downcast.rs @@ -27,12 +27,12 @@ macro_rules! 
try_unpack_chunked { } impl Series { - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Int8]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int8`] pub fn try_i8(&self) -> Option<&Int8Chunked> { try_unpack_chunked!(self, DataType::Int8 => Int8Chunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Int16]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int16`] pub fn try_i16(&self) -> Option<&Int16Chunked> { try_unpack_chunked!(self, DataType::Int16 => Int16Chunked) } @@ -51,91 +51,91 @@ impl Series { /// } /// }).collect(); /// ``` - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Int32]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int32`] pub fn try_i32(&self) -> Option<&Int32Chunked> { try_unpack_chunked!(self, DataType::Int32 => Int32Chunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Int64]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int64`] pub fn try_i64(&self) -> Option<&Int64Chunked> { try_unpack_chunked!(self, DataType::Int64 => Int64Chunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Float32]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Float32`] pub fn try_f32(&self) -> Option<&Float32Chunked> { try_unpack_chunked!(self, DataType::Float32 => Float32Chunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Float64]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Float64`] pub fn try_f64(&self) -> Option<&Float64Chunked> { try_unpack_chunked!(self, DataType::Float64 => Float64Chunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::UInt8]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::UInt8`] pub fn try_u8(&self) -> Option<&UInt8Chunked> { try_unpack_chunked!(self, DataType::UInt8 => UInt8Chunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::UInt16]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::UInt16`] pub fn try_u16(&self) -> Option<&UInt16Chunked> { try_unpack_chunked!(self, DataType::UInt16 => UInt16Chunked) } - /// 
Unpack to [`ChunkedArray`] of dtype `[DataType::UInt32]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::UInt32`] pub fn try_u32(&self) -> Option<&UInt32Chunked> { try_unpack_chunked!(self, DataType::UInt32 => UInt32Chunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::UInt64]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::UInt64`] pub fn try_u64(&self) -> Option<&UInt64Chunked> { try_unpack_chunked!(self, DataType::UInt64 => UInt64Chunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Boolean]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Boolean`] pub fn try_bool(&self) -> Option<&BooleanChunked> { try_unpack_chunked!(self, DataType::Boolean => BooleanChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::String]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::String`] pub fn try_str(&self) -> Option<&StringChunked> { try_unpack_chunked!(self, DataType::String => StringChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Binary]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Binary`] pub fn try_binary(&self) -> Option<&BinaryChunked> { try_unpack_chunked!(self, DataType::Binary => BinaryChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Binary]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Binary`] pub fn try_binary_offset(&self) -> Option<&BinaryOffsetChunked> { try_unpack_chunked!(self, DataType::BinaryOffset => BinaryOffsetChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Time]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Time`] #[cfg(feature = "dtype-time")] pub fn try_time(&self) -> Option<&TimeChunked> { try_unpack_chunked!(self, DataType::Time => TimeChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Date]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Date`] #[cfg(feature = "dtype-date")] pub fn try_date(&self) -> Option<&DateChunked> { try_unpack_chunked!(self, DataType::Date => DateChunked) } - /// Unpack 
to [`ChunkedArray`] of dtype `[DataType::Datetime]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Datetime`] #[cfg(feature = "dtype-datetime")] pub fn try_datetime(&self) -> Option<&DatetimeChunked> { try_unpack_chunked!(self, DataType::Datetime(_, _) => DatetimeChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Duration]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Duration`] #[cfg(feature = "dtype-duration")] pub fn try_duration(&self) -> Option<&DurationChunked> { try_unpack_chunked!(self, DataType::Duration(_) => DurationChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Decimal]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Decimal`] #[cfg(feature = "dtype-decimal")] pub fn try_decimal(&self) -> Option<&DecimalChunked> { try_unpack_chunked!(self, DataType::Decimal(_, _) => DecimalChunked) @@ -146,19 +146,19 @@ impl Series { try_unpack_chunked!(self, DataType::List(_) => ListChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Array]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Array`] #[cfg(feature = "dtype-array")] pub fn try_array(&self) -> Option<&ArrayChunked> { try_unpack_chunked!(self, DataType::Array(_, _) => ArrayChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Categorical]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Categorical`] #[cfg(feature = "dtype-categorical")] pub fn try_categorical(&self) -> Option<&CategoricalChunked> { try_unpack_chunked!(self, DataType::Categorical(_, _) | DataType::Enum(_, _) => CategoricalChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Struct]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Struct`] #[cfg(feature = "dtype-struct")] pub fn try_struct(&self) -> Option<&StructChunked> { #[cfg(debug_assertions)] @@ -171,17 +171,17 @@ impl Series { try_unpack_chunked!(self, DataType::Struct(_) => StructChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Null]` + /// Unpack to 
[`ChunkedArray`] of dtype [`DataType::Null`] pub fn try_null(&self) -> Option<&NullChunked> { try_unpack_chunked!(self, DataType::Null => NullChunked) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Int8]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int8`] pub fn i8(&self) -> PolarsResult<&Int8Chunked> { self.try_i8() .ok_or_else(|| unpack_chunked_err!(self => "Int8")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Int16]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int16`] pub fn i16(&self) -> PolarsResult<&Int16Chunked> { self.try_i16() .ok_or_else(|| unpack_chunked_err!(self => "Int16")) @@ -201,107 +201,107 @@ impl Series { /// } /// }).collect(); /// ``` - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Int32]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int32`] pub fn i32(&self) -> PolarsResult<&Int32Chunked> { self.try_i32() .ok_or_else(|| unpack_chunked_err!(self => "Int32")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Int64]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int64`] pub fn i64(&self) -> PolarsResult<&Int64Chunked> { self.try_i64() .ok_or_else(|| unpack_chunked_err!(self => "Int64")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Float32]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Float32`] pub fn f32(&self) -> PolarsResult<&Float32Chunked> { self.try_f32() .ok_or_else(|| unpack_chunked_err!(self => "Float32")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Float64]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Float64`] pub fn f64(&self) -> PolarsResult<&Float64Chunked> { self.try_f64() .ok_or_else(|| unpack_chunked_err!(self => "Float64")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::UInt8]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::UInt8`] pub fn u8(&self) -> PolarsResult<&UInt8Chunked> { self.try_u8() .ok_or_else(|| unpack_chunked_err!(self => "UInt8")) } - /// Unpack to [`ChunkedArray`] of dtype 
`[DataType::UInt16]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::UInt16`] pub fn u16(&self) -> PolarsResult<&UInt16Chunked> { self.try_u16() .ok_or_else(|| unpack_chunked_err!(self => "UInt16")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::UInt32]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::UInt32`] pub fn u32(&self) -> PolarsResult<&UInt32Chunked> { self.try_u32() .ok_or_else(|| unpack_chunked_err!(self => "UInt32")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::UInt64]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::UInt64`] pub fn u64(&self) -> PolarsResult<&UInt64Chunked> { self.try_u64() .ok_or_else(|| unpack_chunked_err!(self => "UInt64")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Boolean]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Boolean`] pub fn bool(&self) -> PolarsResult<&BooleanChunked> { self.try_bool() .ok_or_else(|| unpack_chunked_err!(self => "Boolean")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::String]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::String`] pub fn str(&self) -> PolarsResult<&StringChunked> { self.try_str() .ok_or_else(|| unpack_chunked_err!(self => "String")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Binary]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Binary`] pub fn binary(&self) -> PolarsResult<&BinaryChunked> { self.try_binary() .ok_or_else(|| unpack_chunked_err!(self => "Binary")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Binary]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Binary`] pub fn binary_offset(&self) -> PolarsResult<&BinaryOffsetChunked> { self.try_binary_offset() .ok_or_else(|| unpack_chunked_err!(self => "BinaryOffset")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Time]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Time`] #[cfg(feature = "dtype-time")] pub fn time(&self) -> PolarsResult<&TimeChunked> { self.try_time() .ok_or_else(|| 
unpack_chunked_err!(self => "Time")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Date]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Date`] #[cfg(feature = "dtype-date")] pub fn date(&self) -> PolarsResult<&DateChunked> { self.try_date() .ok_or_else(|| unpack_chunked_err!(self => "Date")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Datetime]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Datetime`] #[cfg(feature = "dtype-datetime")] pub fn datetime(&self) -> PolarsResult<&DatetimeChunked> { self.try_datetime() .ok_or_else(|| unpack_chunked_err!(self => "Datetime")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Duration]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Duration`] #[cfg(feature = "dtype-duration")] pub fn duration(&self) -> PolarsResult<&DurationChunked> { self.try_duration() .ok_or_else(|| unpack_chunked_err!(self => "Duration")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Decimal]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Decimal`] #[cfg(feature = "dtype-decimal")] pub fn decimal(&self) -> PolarsResult<&DecimalChunked> { self.try_decimal() @@ -314,21 +314,21 @@ impl Series { .ok_or_else(|| unpack_chunked_err!(self => "List")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Array]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Array`] #[cfg(feature = "dtype-array")] pub fn array(&self) -> PolarsResult<&ArrayChunked> { self.try_array() .ok_or_else(|| unpack_chunked_err!(self => "FixedSizeList")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Categorical]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Categorical`] #[cfg(feature = "dtype-categorical")] pub fn categorical(&self) -> PolarsResult<&CategoricalChunked> { self.try_categorical() .ok_or_else(|| unpack_chunked_err!(self => "Enum | Categorical")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Struct]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Struct`] 
#[cfg(feature = "dtype-struct")] pub fn struct_(&self) -> PolarsResult<&StructChunked> { #[cfg(debug_assertions)] @@ -343,7 +343,7 @@ impl Series { .ok_or_else(|| unpack_chunked_err!(self => "Struct")) } - /// Unpack to [`ChunkedArray`] of dtype `[DataType::Null]` + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Null`] pub fn null(&self) -> PolarsResult<&NullChunked> { self.try_null() .ok_or_else(|| unpack_chunked_err!(self => "Null")) diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 0352343baa82..c77a9de0f7ad 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -218,18 +218,6 @@ pub trait SeriesTrait: /// Rename the Series. fn rename(&mut self, name: PlSmallStr); - fn bitand(&self, _other: &Series) -> PolarsResult { - polars_bail!(opq = bitand, self._dtype()); - } - - fn bitor(&self, _other: &Series) -> PolarsResult { - polars_bail!(opq = bitor, self._dtype()); - } - - fn bitxor(&self, _other: &Series) -> PolarsResult { - polars_bail!(opq = bitxor, self._dtype()); - } - fn get_metadata(&self) -> Option> { None } @@ -299,19 +287,27 @@ pub trait SeriesTrait: /// Filter by boolean mask. This operation clones data. fn filter(&self, _filter: &BooleanChunked) -> PolarsResult; - /// Take by index. This operation is clone. + /// Take from `self` at the indexes given by `idx`. + /// + /// Null values in `idx` because null values in the output array. + /// + /// This operation is clone. fn take(&self, _indices: &IdxCa) -> PolarsResult; - /// Take by index. + /// Take from `self` at the indexes given by `idx`. + /// + /// Null values in `idx` because null values in the output array. /// /// # Safety /// This doesn't check any bounds. unsafe fn take_unchecked(&self, _idx: &IdxCa) -> Series; - /// Take by index. This operation is clone. + /// Take from `self` at the indexes given by `idx`. + /// + /// This operation is clone. 
fn take_slice(&self, _indices: &[IdxSize]) -> PolarsResult; - /// Take by index. + /// Take from `self` at the indexes given by `idx`. /// /// # Safety /// This doesn't check any bounds. @@ -428,7 +424,7 @@ pub trait SeriesTrait: /// Count the null values. fn null_count(&self) -> usize; - /// Return if any the chunks in this `[ChunkedArray]` have nulls. + /// Return whether any of the chunks in this [`ChunkedArray`] have nulls. fn has_nulls(&self) -> bool; /// Get unique values in the Series. diff --git a/crates/polars-core/src/testing.rs b/crates/polars-core/src/testing.rs index f227f2bfe861..ed7c3d4fbd3e 100644 --- a/crates/polars-core/src/testing.rs +++ b/crates/polars-core/src/testing.rs @@ -1,5 +1,4 @@ //! Testing utilities. -use std::ops::Deref; use crate::prelude::*; @@ -36,21 +35,6 @@ impl Series { } } } - - /// Get a pointer to the underlying data of this [`Series`]. - /// Can be useful for fast comparisons. - pub fn get_data_ptr(&self) -> usize { - let object = self.0.deref(); - - // SAFETY: - // A fat pointer consists of a data ptr and a ptr to the vtable. - // we specifically check that we only transmute &dyn SeriesTrait e.g. - // a trait object, therefore this is sound. 
- #[allow(clippy::transmute_undefined_repr)] - let (data_ptr, _vtable_ptr) = - unsafe { std::mem::transmute::<&dyn SeriesTrait, (usize, usize)>(object) }; - data_ptr - } } impl PartialEq for Series { @@ -128,26 +112,6 @@ impl DataFrame { } true } - - /// Checks if the Arc ptrs of the [`Series`] are equal - /// - /// # Example - /// - /// ```rust - /// # use polars_core::prelude::*; - /// let df1: DataFrame = df!("Atomic number" => &[1, 51, 300], - /// "Element" => &[Some("Hydrogen"), Some("Antimony"), None])?; - /// let df2: &DataFrame = &df1; - /// - /// assert!(df1.ptr_equal(df2)); - /// # Ok::<(), PolarsError>(()) - /// ``` - pub fn ptr_equal(&self, other: &DataFrame) -> bool { - self.columns - .iter() - .zip(other.columns.iter()) - .all(|(a, b)| a.get_data_ptr() == b.get_data_ptr()) - } } impl PartialEq for DataFrame { diff --git a/crates/polars-expr/src/expressions/aggregation.rs b/crates/polars-expr/src/expressions/aggregation.rs index f1cfa5251899..fb691d746715 100644 --- a/crates/polars-expr/src/expressions/aggregation.rs +++ b/crates/polars-expr/src/expressions/aggregation.rs @@ -50,7 +50,7 @@ impl PhysicalExpr for AggregationExpr { None } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let s = self.input.evaluate(df, state)?; let AggregationType { @@ -69,28 +69,29 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::Min => { if MetadataEnv::experimental_enabled() { if let Some(sc) = s.get_metadata().and_then(|v| v.min_value()) { - return Ok(sc.into_series(s.name().clone())); + return Ok(sc.into_column(s.name().clone())); } } match s.is_sorted_flag() { IsSorted::Ascending | IsSorted::Descending => { - s.min_reduce().map(|sc| sc.into_series(s.name().clone())) + s.min_reduce().map(|sc| sc.into_column(s.name().clone())) }, - IsSorted::Not => parallel_op_series( - |s| s.min_reduce().map(|sc| sc.into_series(s.name().clone())), + IsSorted::Not => 
parallel_op_columns( + |s| s.min_reduce().map(|sc| sc.into_column(s.name().clone())), s, allow_threading, ), } }, #[cfg(feature = "propagate_nans")] - GroupByMethod::NanMin => parallel_op_series( + GroupByMethod::NanMin => parallel_op_columns( |s| { Ok(polars_ops::prelude::nan_propagating_aggregate::nan_min_s( - &s, + s.as_materialized_series(), s.name().clone(), - )) + ) + .into_column()) }, s, allow_threading, @@ -102,28 +103,29 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::Max => { if MetadataEnv::experimental_enabled() { if let Some(sc) = s.get_metadata().and_then(|v| v.max_value()) { - return Ok(sc.into_series(s.name().clone())); + return Ok(sc.into_column(s.name().clone())); } } match s.is_sorted_flag() { IsSorted::Ascending | IsSorted::Descending => { - s.max_reduce().map(|sc| sc.into_series(s.name().clone())) + s.max_reduce().map(|sc| sc.into_column(s.name().clone())) }, - IsSorted::Not => parallel_op_series( - |s| s.max_reduce().map(|sc| sc.into_series(s.name().clone())), + IsSorted::Not => parallel_op_columns( + |s| s.max_reduce().map(|sc| sc.into_column(s.name().clone())), s, allow_threading, ), } }, #[cfg(feature = "propagate_nans")] - GroupByMethod::NanMax => parallel_op_series( + GroupByMethod::NanMax => parallel_op_columns( |s| { Ok(polars_ops::prelude::nan_propagating_aggregate::nan_max_s( - &s, + s.as_materialized_series(), s.name().clone(), - )) + ) + .into_column()) }, s, allow_threading, @@ -132,20 +134,20 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::NanMax => { panic!("activate 'propagate_nans' feature") }, - GroupByMethod::Median => s.median_reduce().map(|sc| sc.into_series(s.name().clone())), - GroupByMethod::Mean => Ok(s.mean_reduce().into_series(s.name().clone())), + GroupByMethod::Median => s.median_reduce().map(|sc| sc.into_column(s.name().clone())), + GroupByMethod::Mean => Ok(s.mean_reduce().into_column(s.name().clone())), GroupByMethod::First => Ok(if s.is_empty() { - Series::full_null(s.name().clone(), 1, 
s.dtype()) + Column::full_null(s.name().clone(), 1, s.dtype()) } else { s.head(Some(1)) }), GroupByMethod::Last => Ok(if s.is_empty() { - Series::full_null(s.name().clone(), 1, s.dtype()) + Column::full_null(s.name().clone(), 1, s.dtype()) } else { s.tail(Some(1)) }), - GroupByMethod::Sum => parallel_op_series( - |s| s.sum_reduce().map(|sc| sc.into_series(s.name().clone())), + GroupByMethod::Sum => parallel_op_columns( + |s| s.sum_reduce().map(|sc| sc.into_column(s.name().clone())), s, allow_threading, ), @@ -154,41 +156,41 @@ impl PhysicalExpr for AggregationExpr { if MetadataEnv::experimental_enabled() { if let Some(count) = s.get_metadata().and_then(|v| v.distinct_count()) { let count = count + IdxSize::from(s.null_count() > 0); - return Ok(IdxCa::from_slice(s.name().clone(), &[count]).into_series()); + return Ok(IdxCa::from_slice(s.name().clone(), &[count]).into_column()); } } s.n_unique().map(|count| { - IdxCa::from_slice(s.name().clone(), &[count as IdxSize]).into_series() + IdxCa::from_slice(s.name().clone(), &[count as IdxSize]).into_column() }) }, GroupByMethod::Count { include_nulls } => { let count = s.len() - s.null_count() * !include_nulls as usize; - Ok(IdxCa::from_slice(s.name().clone(), &[count as IdxSize]).into_series()) + Ok(IdxCa::from_slice(s.name().clone(), &[count as IdxSize]).into_column()) }, - GroupByMethod::Implode => s.implode().map(|ca| ca.into_series()), + GroupByMethod::Implode => s.implode().map(|ca| ca.into_column()), GroupByMethod::Std(ddof) => s .std_reduce(ddof) - .map(|sc| sc.into_series(s.name().clone())), + .map(|sc| sc.into_column(s.name().clone())), GroupByMethod::Var(ddof) => s .var_reduce(ddof) - .map(|sc| sc.into_series(s.name().clone())), + .map(|sc| sc.into_column(s.name().clone())), GroupByMethod::Quantile(_, _) => unimplemented!(), #[cfg(feature = "bitwise")] GroupByMethod::Bitwise(f) => match f { - GroupByBitwiseMethod::And => parallel_op_series( - |s| s.and_reduce().map(|sc| sc.into_series(s.name().clone())), + 
GroupByBitwiseMethod::And => parallel_op_columns( + |s| s.and_reduce().map(|sc| sc.into_column(s.name().clone())), s, allow_threading, ), - GroupByBitwiseMethod::Or => parallel_op_series( - |s| s.or_reduce().map(|sc| sc.into_series(s.name().clone())), + GroupByBitwiseMethod::Or => parallel_op_columns( + |s| s.or_reduce().map(|sc| sc.into_column(s.name().clone())), s, allow_threading, ), - GroupByBitwiseMethod::Xor => parallel_op_series( - |s| s.xor_reduce().map(|sc| sc.into_series(s.name().clone())), + GroupByBitwiseMethod::Xor => parallel_op_columns( + |s| s.xor_reduce().map(|sc| sc.into_column(s.name().clone())), s, allow_threading, ), @@ -223,27 +225,27 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::Min => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_min(&groups); - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::Max => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_max(&groups); - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::Median => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_median(&groups); - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::Mean => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_mean(&groups); - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::Sum => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_sum(&groups); - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::Count { include_nulls } => { if include_nulls || ac.series().null_count() == 0 { @@ -321,7 +323,7 @@ impl PhysicalExpr for AggregationExpr { .map(|s| s.len() as IdxSize - s.null_count() as IdxSize) }) .collect(); - 
AggregatedScalar(rename_series(out.into_series(), keep_name)) + AggregatedScalar(out.into_series().with_name(keep_name)) }, AggState::NotAggregated(s) => { let s = s.clone(); @@ -371,17 +373,17 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::First => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_first(&groups); - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::Last => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_last(&groups); - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::NUnique => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_n_unique(&groups); - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::Implode => { // if the aggregation is already @@ -404,7 +406,7 @@ impl PhysicalExpr for AggregationExpr { agg.as_list().into_series() }, }; - AggregatedList(rename_series(s, keep_name)) + AggregatedList(s.with_name(keep_name)) }, GroupByMethod::Groups => { let mut column: ListChunked = ac.groups().as_list_chunked(); @@ -414,12 +416,12 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::Std(ddof) => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_std(&groups, ddof); - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::Var(ddof) => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_var(&groups, ddof); - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::Quantile(_, _) => { // implemented explicitly in AggQuantile struct @@ -433,7 +435,7 @@ impl PhysicalExpr for AggregationExpr { GroupByBitwiseMethod::Or => s.agg_or(&groups), GroupByBitwiseMethod::Xor => s.agg_xor(&groups), }; - AggregatedScalar(rename_series(agg_s, keep_name)) + 
AggregatedScalar(agg_s.with_name(keep_name)) }, GroupByMethod::NanMin => { #[cfg(feature = "propagate_nans")] @@ -444,7 +446,7 @@ impl PhysicalExpr for AggregationExpr { } else { s.agg_min(&groups) }; - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) } #[cfg(not(feature = "propagate_nans"))] { @@ -460,7 +462,7 @@ impl PhysicalExpr for AggregationExpr { } else { s.agg_max(&groups) }; - AggregatedScalar(rename_series(agg_s, keep_name)) + AggregatedScalar(agg_s.with_name(keep_name)) } #[cfg(not(feature = "propagate_nans"))] { @@ -493,20 +495,15 @@ impl PhysicalExpr for AggregationExpr { } } -fn rename_series(mut s: Series, name: PlSmallStr) -> Series { - s.rename(name); - s -} - impl PartitionedAggregation for AggregationExpr { fn evaluate_partitioned( &self, df: &DataFrame, groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { let expr = self.input.as_partitioned_aggregator().unwrap(); - let series = expr.evaluate_partitioned(df, groups, state)?; + let column = expr.evaluate_partitioned(df, groups, state)?; // SAFETY: // groups are in bounds @@ -514,15 +511,15 @@ impl PartitionedAggregation for AggregationExpr { match self.agg_type.groupby { #[cfg(feature = "dtype-struct")] GroupByMethod::Mean => { - let new_name = series.name().clone(); + let new_name = column.name().clone(); // ensure we don't overflow // the all 8 and 16 bits integers are already upcasted to int16 on `agg_sum` - let mut agg_s = if matches!(series.dtype(), DataType::Int32 | DataType::UInt32) + let mut agg_s = if matches!(column.dtype(), DataType::Int32 | DataType::UInt32) { - series.cast(&DataType::Int64).unwrap().agg_sum(groups) + column.cast(&DataType::Int64).unwrap().agg_sum(groups) } else { - series.agg_sum(groups) + column.agg_sum(groups) }; agg_s.rename(new_name.clone()); @@ -533,54 +530,52 @@ impl PartitionedAggregation for AggregationExpr { DataType::Float32 => agg_s, _ => 
agg_s.cast(&DataType::Float64).unwrap(), }; - let mut count_s = series.agg_valid_count(groups); + let mut count_s = column.agg_valid_count(groups); count_s.rename(PlSmallStr::from_static("__POLARS_COUNT")); - Ok(StructChunked::from_series( - new_name, - agg_s.len(), - [agg_s, count_s].iter(), + Ok( + StructChunked::from_columns(new_name, agg_s.len(), &[agg_s, count_s]) + .unwrap() + .into_column(), ) - .unwrap() - .into_series()) } }, GroupByMethod::Implode => { - let new_name = series.name().clone(); - let mut agg = series.agg_list(groups); + let new_name = column.name().clone(); + let mut agg = column.agg_list(groups); agg.rename(new_name); Ok(agg) }, GroupByMethod::First => { - let mut agg = series.agg_first(groups); - agg.rename(series.name().clone()); + let mut agg = column.agg_first(groups); + agg.rename(column.name().clone()); Ok(agg) }, GroupByMethod::Last => { - let mut agg = series.agg_last(groups); - agg.rename(series.name().clone()); + let mut agg = column.agg_last(groups); + agg.rename(column.name().clone()); Ok(agg) }, GroupByMethod::Max => { - let mut agg = series.agg_max(groups); - agg.rename(series.name().clone()); + let mut agg = column.agg_max(groups); + agg.rename(column.name().clone()); Ok(agg) }, GroupByMethod::Min => { - let mut agg = series.agg_min(groups); - agg.rename(series.name().clone()); + let mut agg = column.agg_min(groups); + agg.rename(column.name().clone()); Ok(agg) }, GroupByMethod::Sum => { - let mut agg = series.agg_sum(groups); - agg.rename(series.name().clone()); + let mut agg = column.agg_sum(groups); + agg.rename(column.name().clone()); Ok(agg) }, GroupByMethod::Count { include_nulls: true, } => { let mut ca = groups.group_count(); - ca.rename(series.name().clone()); - Ok(ca.into_series()) + ca.rename(column.name().clone()); + Ok(ca.into_column()) }, _ => { unimplemented!() @@ -591,10 +586,10 @@ impl PartitionedAggregation for AggregationExpr { fn finalize( &self, - partitioned: Series, + partitioned: Column, groups: 
&GroupsProxy, _state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { match self.agg_type.groupby { GroupByMethod::Count { include_nulls: true, @@ -616,9 +611,9 @@ impl PartitionedAggregation for AggregationExpr { let (agg_count, agg_s) = unsafe { POOL.join(|| count.agg_sum(groups), || sum.agg_sum(groups)) }; let agg_s = &agg_s / &agg_count; - Ok(rename_series(agg_s?, new_name)) + Ok(agg_s?.with_name(new_name).into_column()) }, - _ => Ok(Series::full_null( + _ => Ok(Column::full_null( new_name, groups.len(), partitioned.dtype(), @@ -685,7 +680,7 @@ impl PartitionedAggregation for AggregationExpr { if can_fast_explode { ca.set_fast_explode() } - Ok(ca.into_series().as_list().into_series()) + Ok(ca.into_series().as_list().into_column()) }, GroupByMethod::First => { let mut agg = unsafe { partitioned.agg_first(groups) }; @@ -746,12 +741,12 @@ impl PhysicalExpr for AggQuantileExpr { None } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let input = self.input.evaluate(df, state)?; let quantile = self.get_quantile(df, state)?; input .quantile_reduce(quantile, self.method) - .map(|sc| sc.into_series(input.name().clone())) + .map(|sc| sc.into_column(input.name().clone())) } #[allow(clippy::ptr_arg)] fn evaluate_on_groups<'a>( @@ -791,9 +786,9 @@ impl PhysicalExpr for AggQuantileExpr { /// Simple wrapper to parallelize functions that can be divided over threads aggregated and /// finally aggregated in the main thread. This can be done for sum, min, max, etc. 
-fn parallel_op_series(f: F, s: Series, allow_threading: bool) -> PolarsResult +fn parallel_op_columns(f: F, s: Column, allow_threading: bool) -> PolarsResult where - F: Fn(Series) -> PolarsResult + Send + Sync, + F: Fn(Column) -> PolarsResult + Send + Sync, { // set during debug low so // we mimic production size data behavior @@ -826,7 +821,7 @@ where let mut iter = chunks.into_iter(); let first = iter.next().unwrap(); let dtype = first.dtype(); - let out = iter.fold(first.to_physical_repr().into_owned(), |mut acc, s| { + let out = iter.fold(first.to_physical_repr(), |mut acc, s| { acc.append(&s.to_physical_repr()).unwrap(); acc }); diff --git a/crates/polars-expr/src/expressions/alias.rs b/crates/polars-expr/src/expressions/alias.rs index 8d321263a3f5..6144a1418de2 100644 --- a/crates/polars-expr/src/expressions/alias.rs +++ b/crates/polars-expr/src/expressions/alias.rs @@ -18,7 +18,7 @@ impl AliasExpr { } } - fn finish(&self, input: Series) -> Series { + fn finish(&self, input: Column) -> Column { input.with_name(self.name.clone()) } } @@ -28,7 +28,7 @@ impl PhysicalExpr for AliasExpr { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let series = self.physical_expr.evaluate(df, state)?; Ok(self.finish(series)) } @@ -42,12 +42,16 @@ impl PhysicalExpr for AliasExpr { ) -> PolarsResult> { let mut ac = self.physical_expr.evaluate_on_groups(df, groups, state)?; let s = ac.take(); - let s = self.finish(s); + let s = self.finish(s.into()); if ac.is_literal() { - ac.with_literal(s); + ac.with_literal(s.take_materialized_series()); } else { - ac.with_series(s, ac.is_aggregated(), Some(&self.expr))?; + ac.with_series( + s.take_materialized_series(), + ac.is_aggregated(), + Some(&self.expr), + )?; } Ok(ac) } @@ -78,7 +82,7 @@ impl PartitionedAggregation for AliasExpr { df: &DataFrame, groups: &GroupsProxy, state: &ExecutionState, - ) -> 
PolarsResult { + ) -> PolarsResult { let agg = self.physical_expr.as_partitioned_aggregator().unwrap(); let s = agg.evaluate_partitioned(df, groups, state)?; Ok(s.with_name(self.name.clone())) @@ -86,10 +90,10 @@ impl PartitionedAggregation for AliasExpr { fn finalize( &self, - partitioned: Series, + partitioned: Column, groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { let agg = self.physical_expr.as_partitioned_aggregator().unwrap(); let s = agg.finalize(partitioned, groups, state)?; Ok(s.with_name(self.name.clone())) diff --git a/crates/polars-expr/src/expressions/apply.rs b/crates/polars-expr/src/expressions/apply.rs index d6c37a5a004f..ddb4c37fac5d 100644 --- a/crates/polars-expr/src/expressions/apply.rs +++ b/crates/polars-expr/src/expressions/apply.rs @@ -321,7 +321,7 @@ impl PhysicalExpr for ApplyExpr { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let f = |e: &Arc| e.evaluate(df, state); let mut inputs = if self.allow_threading && self.inputs.len() > 1 { POOL.install(|| { @@ -341,14 +341,9 @@ impl PhysicalExpr for ApplyExpr { if self.allow_rename { self.eval_and_flatten(&mut inputs) - .map(|c| c.as_materialized_series().clone()) } else { let in_name = inputs[0].name().clone(); - Ok(self - .eval_and_flatten(&mut inputs)? 
- .as_materialized_series() - .clone() - .with_name(in_name)) + Ok(self.eval_and_flatten(&mut inputs)?.with_name(in_name)) } } @@ -681,29 +676,24 @@ impl PartitionedAggregation for ApplyExpr { df: &DataFrame, groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { let a = self.inputs[0].as_partitioned_aggregator().unwrap(); - let s = a.evaluate_partitioned(df, groups, state)?.into(); + let s = a.evaluate_partitioned(df, groups, state)?; if self.allow_rename { self.eval_and_flatten(&mut [s]) - .map(|c| c.as_materialized_series().clone()) } else { let in_name = s.name().clone(); - Ok(self - .eval_and_flatten(&mut [s])? - .as_materialized_series() - .clone() - .with_name(in_name)) + Ok(self.eval_and_flatten(&mut [s])?.with_name(in_name)) } } fn finalize( &self, - partitioned: Series, + partitioned: Column, _groups: &GroupsProxy, _state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { Ok(partitioned) } } diff --git a/crates/polars-expr/src/expressions/binary.rs b/crates/polars-expr/src/expressions/binary.rs index 23f50af45273..7754c2b6633e 100644 --- a/crates/polars-expr/src/expressions/binary.rs +++ b/crates/polars-expr/src/expressions/binary.rs @@ -41,7 +41,7 @@ impl BinaryExpr { } /// Can partially do operations in place. 
-fn apply_operator_owned(left: Series, right: Series, op: Operator) -> PolarsResult { +fn apply_operator_owned(left: Column, right: Column, op: Operator) -> PolarsResult { match op { Operator::Plus => left.try_add_owned(right), Operator::Minus => left.try_sub_owned(right), @@ -52,15 +52,15 @@ fn apply_operator_owned(left: Series, right: Series, op: Operator) -> PolarsResu } } -pub fn apply_operator(left: &Series, right: &Series, op: Operator) -> PolarsResult { +pub fn apply_operator(left: &Column, right: &Column, op: Operator) -> PolarsResult { use DataType::*; match op { - Operator::Gt => ChunkCompareIneq::gt(left, right).map(|ca| ca.into_series()), - Operator::GtEq => ChunkCompareIneq::gt_eq(left, right).map(|ca| ca.into_series()), - Operator::Lt => ChunkCompareIneq::lt(left, right).map(|ca| ca.into_series()), - Operator::LtEq => ChunkCompareIneq::lt_eq(left, right).map(|ca| ca.into_series()), - Operator::Eq => ChunkCompareEq::equal(left, right).map(|ca| ca.into_series()), - Operator::NotEq => ChunkCompareEq::not_equal(left, right).map(|ca| ca.into_series()), + Operator::Gt => ChunkCompareIneq::gt(left, right).map(|ca| ca.into_column()), + Operator::GtEq => ChunkCompareIneq::gt_eq(left, right).map(|ca| ca.into_column()), + Operator::Lt => ChunkCompareIneq::lt(left, right).map(|ca| ca.into_column()), + Operator::LtEq => ChunkCompareIneq::lt_eq(left, right).map(|ca| ca.into_column()), + Operator::Eq => ChunkCompareEq::equal(left, right).map(|ca| ca.into_column()), + Operator::NotEq => ChunkCompareEq::not_equal(left, right).map(|ca| ca.into_column()), Operator::Plus => left + right, Operator::Minus => left - right, Operator::Multiply => left * right, @@ -87,7 +87,11 @@ pub fn apply_operator(left: &Series, right: &Series, op: Operator) -> PolarsResu Operator::FloorDivide => { #[cfg(feature = "round_series")] { - floor_div_series(left, right) + floor_div_series( + left.as_materialized_series(), + right.as_materialized_series(), + ) + .map(Column::from) } 
#[cfg(not(feature = "round_series"))] { @@ -104,8 +108,8 @@ pub fn apply_operator(left: &Series, right: &Series, op: Operator) -> PolarsResu .bitand(&right.cast(&DataType::Boolean)?), Operator::Xor => left.bitxor(right), Operator::Modulus => left % right, - Operator::EqValidity => left.equal_missing(right).map(|ca| ca.into_series()), - Operator::NotEqValidity => left.not_equal_missing(right).map(|ca| ca.into_series()), + Operator::EqValidity => left.equal_missing(right).map(|ca| ca.into_column()), + Operator::NotEqValidity => left.not_equal_missing(right).map(|ca| ca.into_column()), } } @@ -123,8 +127,8 @@ impl BinaryExpr { // Drop lhs so that we might operate in place. drop(ac_l.take()); - let out = apply_operator_owned(lhs, rhs, self.op)?; - ac_l.with_series(out, aggregated, Some(&self.expr))?; + let out = apply_operator_owned(lhs.into_column(), rhs.into_column(), self.op)?; + ac_l.with_series(out.take_materialized_series(), aggregated, Some(&self.expr))?; Ok(ac_l) } @@ -137,16 +141,16 @@ impl BinaryExpr { ac_l.groups(); ac_r.groups(); polars_ensure!(ac_l.groups.len() == ac_r.groups.len(), ComputeError: "lhs and rhs should have same group length"); - let left_s = ac_l.series().rechunk(); - let right_s = ac_r.series().rechunk(); + let left_s = ac_l.series().rechunk().into_column(); + let right_s = ac_r.series().rechunk().into_column(); let res_s = apply_operator(&left_s, &right_s, self.op)?; ac_l.with_update_groups(UpdateGroups::WithSeriesLen); let res_s = if res_s.len() == 1 { res_s.new_from_index(0, ac_l.groups.len()) } else { - ListChunked::full(name, &res_s, ac_l.groups.len()).into_series() + ListChunked::full(name, res_s.as_materialized_series(), ac_l.groups.len()).into_column() }; - ac_l.with_series(res_s, true, Some(&self.expr))?; + ac_l.with_series(res_s.take_materialized_series(), true, Some(&self.expr))?; Ok(ac_l) } @@ -159,7 +163,13 @@ impl BinaryExpr { let ca = ac_l .iter_groups(false) .zip(ac_r.iter_groups(false)) - .map(|(l, r)| 
Some(apply_operator(l?.as_ref(), r?.as_ref(), self.op))) + .map(|(l, r)| { + Some(apply_operator( + &l?.as_ref().clone().into_column(), + &r?.as_ref().clone().into_column(), + self.op, + )) + }) .map(|opt_res| opt_res.transpose()) .collect::>()? .with_name(name); @@ -175,7 +185,7 @@ impl PhysicalExpr for BinaryExpr { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { // Window functions may set a global state that determine their output // state, so we don't let them run in parallel as they race // they also saturate the thread pool by themselves, so that's fine. @@ -246,8 +256,10 @@ impl PhysicalExpr for BinaryExpr { (AggState::AggregatedList(lhs), AggState::AggregatedList(rhs)) => { let lhs = lhs.list().unwrap(); let rhs = rhs.list().unwrap(); - let out = - lhs.apply_to_inner(&|lhs| apply_operator(&lhs, &rhs.get_inner(), self.op))?; + let out = lhs.apply_to_inner(&|lhs| { + apply_operator(&lhs.into_column(), &rhs.get_inner().into_column(), self.op) + .map(|c| c.take_materialized_series()) + })?; ac_l.with_series(out.into_series(), true, Some(&self.expr))?; Ok(ac_l) }, @@ -279,7 +291,7 @@ mod stats { use super::*; - fn apply_operator_stats_eq(min_max: &Series, literal: &Series) -> bool { + fn apply_operator_stats_eq(min_max: &Column, literal: &Column) -> bool { use ChunkCompareIneq as C; // Literal is greater than max, don't need to read. 
if C::gt(literal, min_max).map(|s| s.all()).unwrap_or(false) { @@ -294,7 +306,7 @@ mod stats { true } - fn apply_operator_stats_neq(min_max: &Series, literal: &Series) -> bool { + fn apply_operator_stats_neq(min_max: &Column, literal: &Column) -> bool { if min_max.len() < 2 || min_max.null_count() > 0 { return true; } @@ -311,7 +323,7 @@ mod stats { true } - fn apply_operator_stats_rhs_lit(min_max: &Series, literal: &Series, op: Operator) -> bool { + fn apply_operator_stats_rhs_lit(min_max: &Column, literal: &Column, op: Operator) -> bool { use ChunkCompareIneq as C; match op { Operator::Eq => apply_operator_stats_eq(min_max, literal), @@ -347,7 +359,7 @@ mod stats { } } - fn apply_operator_stats_lhs_lit(literal: &Series, min_max: &Series, op: Operator) -> bool { + fn apply_operator_stats_lhs_lit(literal: &Column, min_max: &Column, op: Operator) -> bool { use ChunkCompareIneq as C; match op { Operator::Eq => apply_operator_stats_eq(min_max, literal), @@ -423,7 +435,11 @@ mod stats { // will be incorrect if not debug_assert_eq!(min_max_s.null_count(), 0); let lit_s = self.right.evaluate(&dummy, &state).unwrap(); - Ok(apply_operator_stats_rhs_lit(&min_max_s, &lit_s, self.op)) + Ok(apply_operator_stats_rhs_lit( + &min_max_s.into_column(), + &lit_s, + self.op, + )) }, } }, @@ -435,7 +451,11 @@ mod stats { // will be incorrect if not debug_assert_eq!(min_max_s.null_count(), 0); let lit_s = self.left.evaluate(&dummy, &state).unwrap(); - Ok(apply_operator_stats_lhs_lit(&lit_s, &min_max_s, self.op)) + Ok(apply_operator_stats_lhs_lit( + &lit_s, + &min_max_s.into_column(), + self.op, + )) }, } }, @@ -476,7 +496,7 @@ impl PartitionedAggregation for BinaryExpr { df: &DataFrame, groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { let left = self.left.as_partitioned_aggregator().unwrap(); let right = self.right.as_partitioned_aggregator().unwrap(); let left = left.evaluate_partitioned(df, groups, state)?; @@ -486,10 +506,10 @@ impl 
PartitionedAggregation for BinaryExpr { fn finalize( &self, - partitioned: Series, + partitioned: Column, _groups: &GroupsProxy, _state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { Ok(partitioned) } } diff --git a/crates/polars-expr/src/expressions/cast.rs b/crates/polars-expr/src/expressions/cast.rs index ebfd50311918..dcbd67d36a7e 100644 --- a/crates/polars-expr/src/expressions/cast.rs +++ b/crates/polars-expr/src/expressions/cast.rs @@ -12,7 +12,7 @@ pub struct CastExpr { } impl CastExpr { - fn finish(&self, input: &Series) -> PolarsResult { + fn finish(&self, input: &Column) -> PolarsResult { input.cast_with_options(&self.dtype, self.options) } } @@ -22,9 +22,9 @@ impl PhysicalExpr for CastExpr { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { - let series = self.input.evaluate(df, state)?; - self.finish(&series) + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + let column = self.input.evaluate(df, state)?; + self.finish(&column) } #[allow(clippy::ptr_arg)] @@ -40,15 +40,18 @@ impl PhysicalExpr for CastExpr { // this will not explode and potentially increase memory due to overlapping groups AggState::AggregatedList(s) => { let ca = s.list().unwrap(); - let casted = ca.apply_to_inner(&|s| self.finish(&s))?; + let casted = ca.apply_to_inner(&|s| { + self.finish(&s.into_column()) + .map(|c| c.take_materialized_series()) + })?; ac.with_series(casted.into_series(), true, None)?; }, AggState::AggregatedScalar(s) => { - let s = self.finish(s)?; + let s = self.finish(&s.clone().into_column())?; if ac.is_literal() { - ac.with_literal(s); + ac.with_literal(s.take_materialized_series()); } else { - ac.with_series(s, true, None)?; + ac.with_series(s.take_materialized_series(), true, None)?; } }, _ => { @@ -56,12 +59,12 @@ impl PhysicalExpr for CastExpr { ac.groups(); let s = ac.flat_naive(); - let s = self.finish(s.as_ref())?; + let s = 
self.finish(&s.as_ref().clone().into_column())?; if ac.is_literal() { - ac.with_literal(s); + ac.with_literal(s.take_materialized_series()); } else { - ac.with_series(s, false, None)?; + ac.with_series(s.take_materialized_series(), false, None)?; } }, } @@ -91,17 +94,17 @@ impl PartitionedAggregation for CastExpr { df: &DataFrame, groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { let e = self.input.as_partitioned_aggregator().unwrap(); self.finish(&e.evaluate_partitioned(df, groups, state)?) } fn finalize( &self, - partitioned: Series, + partitioned: Column, groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { let agg = self.input.as_partitioned_aggregator().unwrap(); agg.finalize(partitioned, groups, state) } diff --git a/crates/polars-expr/src/expressions/column.rs b/crates/polars-expr/src/expressions/column.rs index 8a59d6c25ddb..2142d22df6d9 100644 --- a/crates/polars-expr/src/expressions/column.rs +++ b/crates/polars-expr/src/expressions/column.rs @@ -140,7 +140,7 @@ impl PhysicalExpr for ColumnExpr { fn as_expression(&self) -> Option<&Expr> { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let out = match self.schema.get_full(&self.name) { Some((idx, _, _)) => { // check if the schema was correct @@ -168,12 +168,12 @@ impl PhysicalExpr for ColumnExpr { // in debug builds we panic so that it can be fixed when occurring None => { if self.name.starts_with(CSE_REPLACED) { - return self.process_cse(df, &self.schema); + return self.process_cse(df, &self.schema).map(Column::from); } self.process_by_linear_search(df, state, true) }, }; - self.check_external_context(out, state) + self.check_external_context(out, state).map(Column::from) } #[allow(clippy::ptr_arg)] @@ -184,7 +184,11 @@ impl PhysicalExpr for ColumnExpr { state: &ExecutionState, ) -> PolarsResult> { 
let s = self.evaluate(df, state)?; - Ok(AggregationContext::new(s, Cow::Borrowed(groups), false)) + Ok(AggregationContext::new( + s.take_materialized_series(), + Cow::Borrowed(groups), + false, + )) } fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { @@ -209,16 +213,16 @@ impl PartitionedAggregation for ColumnExpr { df: &DataFrame, _groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { self.evaluate(df, state) } fn finalize( &self, - partitioned: Series, + partitioned: Column, _groups: &GroupsProxy, _state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { Ok(partitioned) } } diff --git a/crates/polars-expr/src/expressions/count.rs b/crates/polars-expr/src/expressions/count.rs index 5e8b4c75e376..6102caf5a354 100644 --- a/crates/polars-expr/src/expressions/count.rs +++ b/crates/polars-expr/src/expressions/count.rs @@ -21,11 +21,8 @@ impl PhysicalExpr for CountExpr { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, _state: &ExecutionState) -> PolarsResult { - Ok(Series::new( - PlSmallStr::from_static("len"), - [df.height() as IdxSize], - )) + fn evaluate(&self, df: &DataFrame, _state: &ExecutionState) -> PolarsResult { + Ok(Series::new(PlSmallStr::from_static("len"), [df.height() as IdxSize]).into_column()) } fn evaluate_on_groups<'a>( @@ -59,19 +56,19 @@ impl PartitionedAggregation for CountExpr { df: &DataFrame, groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { self.evaluate_on_groups(df, groups, state) - .map(|mut ac| ac.aggregated()) + .map(|mut ac| ac.aggregated().into_column()) } /// Called to merge all the partitioned results in a final aggregate. #[allow(clippy::ptr_arg)] fn finalize( &self, - partitioned: Series, + partitioned: Column, groups: &GroupsProxy, _state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { // SAFETY: groups are in bounds. 
let agg = unsafe { partitioned.agg_sum(groups) }; Ok(agg.with_name(PlSmallStr::from_static(LEN))) diff --git a/crates/polars-expr/src/expressions/filter.rs b/crates/polars-expr/src/expressions/filter.rs index b11d0dda6129..6f847a7fa8ed 100644 --- a/crates/polars-expr/src/expressions/filter.rs +++ b/crates/polars-expr/src/expressions/filter.rs @@ -24,7 +24,7 @@ impl PhysicalExpr for FilterExpr { fn as_expression(&self) -> Option<&Expr> { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let s_f = || self.input.evaluate(df, state); let predicate_f = || self.by.evaluate(df, state); diff --git a/crates/polars-expr/src/expressions/gather.rs b/crates/polars-expr/src/expressions/gather.rs index a6450bcb531b..19a0e35ff315 100644 --- a/crates/polars-expr/src/expressions/gather.rs +++ b/crates/polars-expr/src/expressions/gather.rs @@ -18,7 +18,7 @@ impl PhysicalExpr for GatherExpr { fn as_expression(&self) -> Option<&Expr> { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let series = self.phys_expr.evaluate(df, state)?; self.finish(df, state, series) } @@ -102,10 +102,10 @@ impl GatherExpr { &self, df: &DataFrame, state: &ExecutionState, - series: Series, - ) -> PolarsResult { + series: Column, + ) -> PolarsResult { let idx = self.idx.evaluate(df, state)?; - let idx = convert_to_unsigned_index(&idx, series.len())?; + let idx = convert_to_unsigned_index(idx.as_materialized_series(), series.len())?; series.take(&idx) } diff --git a/crates/polars-expr/src/expressions/literal.rs b/crates/polars-expr/src/expressions/literal.rs index 2089e4cf5bb4..0c6900d4356b 100644 --- a/crates/polars-expr/src/expressions/literal.rs +++ b/crates/polars-expr/src/expressions/literal.rs @@ -21,29 +21,31 @@ impl PhysicalExpr for LiteralExpr { fn 
as_expression(&self) -> Option<&Expr> { Some(&self.1) } - fn evaluate(&self, _df: &DataFrame, _state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, _df: &DataFrame, _state: &ExecutionState) -> PolarsResult { use LiteralValue::*; let s = match &self.0 { #[cfg(feature = "dtype-i8")] - Int8(v) => Int8Chunked::full(get_literal_name().clone(), *v, 1).into_series(), + Int8(v) => Int8Chunked::full(get_literal_name().clone(), *v, 1).into_column(), #[cfg(feature = "dtype-i16")] - Int16(v) => Int16Chunked::full(get_literal_name().clone(), *v, 1).into_series(), - Int32(v) => Int32Chunked::full(get_literal_name().clone(), *v, 1).into_series(), - Int64(v) => Int64Chunked::full(get_literal_name().clone(), *v, 1).into_series(), + Int16(v) => Int16Chunked::full(get_literal_name().clone(), *v, 1).into_column(), + Int32(v) => Int32Chunked::full(get_literal_name().clone(), *v, 1).into_column(), + Int64(v) => Int64Chunked::full(get_literal_name().clone(), *v, 1).into_column(), #[cfg(feature = "dtype-u8")] - UInt8(v) => UInt8Chunked::full(get_literal_name().clone(), *v, 1).into_series(), + UInt8(v) => UInt8Chunked::full(get_literal_name().clone(), *v, 1).into_column(), #[cfg(feature = "dtype-u16")] - UInt16(v) => UInt16Chunked::full(get_literal_name().clone(), *v, 1).into_series(), - UInt32(v) => UInt32Chunked::full(get_literal_name().clone(), *v, 1).into_series(), - UInt64(v) => UInt64Chunked::full(get_literal_name().clone(), *v, 1).into_series(), - Float32(v) => Float32Chunked::full(get_literal_name().clone(), *v, 1).into_series(), - Float64(v) => Float64Chunked::full(get_literal_name().clone(), *v, 1).into_series(), + UInt16(v) => UInt16Chunked::full(get_literal_name().clone(), *v, 1).into_column(), + UInt32(v) => UInt32Chunked::full(get_literal_name().clone(), *v, 1).into_column(), + UInt64(v) => UInt64Chunked::full(get_literal_name().clone(), *v, 1).into_column(), + Float32(v) => Float32Chunked::full(get_literal_name().clone(), *v, 1).into_column(), + Float64(v) => 
Float64Chunked::full(get_literal_name().clone(), *v, 1).into_column(), #[cfg(feature = "dtype-decimal")] Decimal(v, scale) => Int128Chunked::full(get_literal_name().clone(), *v, 1) .into_decimal_unchecked(None, *scale) - .into_series(), - Boolean(v) => BooleanChunked::full(get_literal_name().clone(), *v, 1).into_series(), - Null => polars_core::prelude::Series::new_null(get_literal_name().clone(), 1), + .into_column(), + Boolean(v) => BooleanChunked::full(get_literal_name().clone(), *v, 1).into_column(), + Null => { + polars_core::prelude::Series::new_null(get_literal_name().clone(), 1).into_column() + }, Range { low, high, dtype } => match dtype { DataType::Int32 => { polars_ensure!( @@ -53,13 +55,13 @@ impl PhysicalExpr for LiteralExpr { let low = *low as i32; let high = *high as i32; let ca: NoNull = (low..high).collect(); - ca.into_inner().into_series() + ca.into_inner().into_column() }, DataType::Int64 => { let low = *low; let high = *high; let ca: NoNull = (low..high).collect(); - ca.into_inner().into_series() + ca.into_inner().into_column() }, DataType::UInt32 => { polars_ensure!( @@ -69,28 +71,28 @@ impl PhysicalExpr for LiteralExpr { let low = *low as u32; let high = *high as u32; let ca: NoNull = (low..high).collect(); - ca.into_inner().into_series() + ca.into_inner().into_column() }, dt => polars_bail!( InvalidOperation: "datatype `{}` is not supported as range", dt ), }, - String(v) => StringChunked::full(get_literal_name().clone(), v, 1).into_series(), - Binary(v) => BinaryChunked::full(get_literal_name().clone(), v, 1).into_series(), + String(v) => StringChunked::full(get_literal_name().clone(), v, 1).into_column(), + Binary(v) => BinaryChunked::full(get_literal_name().clone(), v, 1).into_column(), #[cfg(feature = "dtype-datetime")] DateTime(timestamp, tu, tz) => { Int64Chunked::full(get_literal_name().clone(), *timestamp, 1) .into_datetime(*tu, tz.clone()) - .into_series() + .into_column() }, #[cfg(feature = "dtype-duration")] Duration(v, tu) => 
Int64Chunked::full(get_literal_name().clone(), *v, 1) .into_duration(*tu) - .into_series(), + .into_column(), #[cfg(feature = "dtype-date")] Date(v) => Int32Chunked::full(get_literal_name().clone(), *v, 1) .into_date() - .into_series(), + .into_column(), #[cfg(feature = "dtype-time")] Time(v) => { if !(0..NANOSECONDS_IN_DAY).contains(v) { @@ -102,16 +104,17 @@ impl PhysicalExpr for LiteralExpr { Int64Chunked::full(get_literal_name().clone(), *v, 1) .into_time() - .into_series() + .into_column() }, - Series(series) => series.deref().clone(), - OtherScalar(s) => s.clone().into_series(get_literal_name().clone()), + Series(series) => series.deref().clone().into_column(), + OtherScalar(s) => s.clone().into_column(get_literal_name().clone()), lv @ (Int(_) | Float(_) | StrCat(_)) => polars_core::prelude::Series::from_any_values( get_literal_name().clone(), &[lv.to_any_value().unwrap()], false, ) - .unwrap(), + .unwrap() + .into_column(), }; Ok(s) } @@ -124,7 +127,10 @@ impl PhysicalExpr for LiteralExpr { state: &ExecutionState, ) -> PolarsResult> { let s = self.evaluate(df, state)?; - Ok(AggregationContext::from_literal(s, Cow::Borrowed(groups))) + Ok(AggregationContext::from_literal( + s.take_materialized_series(), + Cow::Borrowed(groups), + )) } fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { @@ -150,16 +156,16 @@ impl PartitionedAggregation for LiteralExpr { df: &DataFrame, _groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { self.evaluate(df, state) } fn finalize( &self, - partitioned: Series, + partitioned: Column, _groups: &GroupsProxy, _state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { Ok(partitioned) } } diff --git a/crates/polars-expr/src/expressions/mod.rs b/crates/polars-expr/src/expressions/mod.rs index 15550c517fe7..8ccc5349b733 100644 --- a/crates/polars-expr/src/expressions/mod.rs +++ b/crates/polars-expr/src/expressions/mod.rs @@ -536,7 +536,7 @@ pub trait PhysicalExpr: 
Send + Sync { } /// Take a DataFrame and evaluate the expression. - fn evaluate(&self, df: &DataFrame, _state: &ExecutionState) -> PolarsResult; + fn evaluate(&self, df: &DataFrame, _state: &ExecutionState) -> PolarsResult; /// Some expression that are not aggregations can be done per group /// Think of sort, slice, filter, shift, etc. @@ -611,7 +611,9 @@ impl PhysicalIoExpr for PhysicalIoHelper { if self.has_window_function { state.insert_has_window_function_flag(); } - self.expr.evaluate(df, &state) + self.expr + .evaluate(df, &state) + .map(|c| c.take_materialized_series()) } fn live_variables(&self) -> Option> { @@ -651,14 +653,14 @@ pub trait PartitionedAggregation: Send + Sync + PhysicalExpr { df: &DataFrame, groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult; + ) -> PolarsResult; /// Called to merge all the partitioned results in a final aggregate. #[allow(clippy::ptr_arg)] fn finalize( &self, - partitioned: Series, + partitioned: Column, groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult; + ) -> PolarsResult; } diff --git a/crates/polars-expr/src/expressions/rolling.rs b/crates/polars-expr/src/expressions/rolling.rs index 806e3d5b0398..7e9897d7328c 100644 --- a/crates/polars-expr/src/expressions/rolling.rs +++ b/crates/polars-expr/src/expressions/rolling.rs @@ -19,7 +19,7 @@ pub(crate) struct RollingExpr { } impl PhysicalExpr for RollingExpr { - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let groups_key = format!("{:?}", &self.options); let groups_map = state.group_tuples.read().unwrap(); @@ -47,7 +47,7 @@ impl PhysicalExpr for RollingExpr { if let Some(name) = &self.out_name { out.rename(name.clone()); } - Ok(out) + Ok(out.into_column()) } fn evaluate_on_groups<'a>( diff --git a/crates/polars-expr/src/expressions/slice.rs b/crates/polars-expr/src/expressions/slice.rs index d0e187120939..2b805edd1bb0 100644 --- 
a/crates/polars-expr/src/expressions/slice.rs +++ b/crates/polars-expr/src/expressions/slice.rs @@ -82,7 +82,7 @@ impl PhysicalExpr for SliceExpr { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let results = POOL.install(|| { [&self.offset, &self.length, &self.input] .par_iter() @@ -92,7 +92,11 @@ impl PhysicalExpr for SliceExpr { let offset = &results[0]; let length = &results[1]; let series = &results[2]; - let (offset, length) = extract_args(offset, length, &self.expr)?; + let (offset, length) = extract_args( + offset.as_materialized_series(), + length.as_materialized_series(), + &self.expr, + )?; Ok(series.slice(offset, length)) } diff --git a/crates/polars-expr/src/expressions/sort.rs b/crates/polars-expr/src/expressions/sort.rs index 751b09b07475..be9fe57e29ce 100644 --- a/crates/polars-expr/src/expressions/sort.rs +++ b/crates/polars-expr/src/expressions/sort.rs @@ -46,7 +46,7 @@ impl PhysicalExpr for SortExpr { fn as_expression(&self) -> Option<&Expr> { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let series = self.physical_expr.evaluate(df, state)?; series.sort_with(self.options) } diff --git a/crates/polars-expr/src/expressions/sortby.rs b/crates/polars-expr/src/expressions/sortby.rs index f966e4cbb544..1624d7c9bcd6 100644 --- a/crates/polars-expr/src/expressions/sortby.rs +++ b/crates/polars-expr/src/expressions/sortby.rs @@ -199,7 +199,7 @@ impl PhysicalExpr for SortByExpr { fn as_expression(&self) -> Option<&Expr> { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let series_f = || self.input.evaluate(df, state); if self.by.is_empty() { // Sorting by 0 columns returns 
input unchanged. @@ -220,13 +220,11 @@ impl PhysicalExpr for SortByExpr { .by .iter() .map(|e| { - e.evaluate(df, state) - .map(|s| match s.dtype() { - #[cfg(feature = "dtype-categorical")] - DataType::Categorical(_, _) | DataType::Enum(_, _) => s, - _ => s.to_physical_repr().into_owned(), - }) - .map(Column::from) + e.evaluate(df, state).map(|s| match s.dtype() { + #[cfg(feature = "dtype-categorical")] + DataType::Categorical(_, _) | DataType::Enum(_, _) => s, + _ => s.to_physical_repr(), + }) }) .collect::>>()?; diff --git a/crates/polars-expr/src/expressions/ternary.rs b/crates/polars-expr/src/expressions/ternary.rs index 37600c71f06a..2d1035c22eb7 100644 --- a/crates/polars-expr/src/expressions/ternary.rs +++ b/crates/polars-expr/src/expressions/ternary.rs @@ -79,7 +79,7 @@ impl PhysicalExpr for TernaryExpr { Some(&self.expr) } - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { let mut state = state.split(); // Don't cache window functions as they run in parallel. 
state.remove_cache_window_flag(); @@ -337,7 +337,7 @@ impl PartitionedAggregation for TernaryExpr { df: &DataFrame, groups: &GroupsProxy, state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { let truthy = self.truthy.as_partitioned_aggregator().unwrap(); let falsy = self.falsy.as_partitioned_aggregator().unwrap(); let mask = self.predicate.as_partitioned_aggregator().unwrap(); @@ -352,10 +352,10 @@ impl PartitionedAggregation for TernaryExpr { fn finalize( &self, - partitioned: Series, + partitioned: Column, _groups: &GroupsProxy, _state: &ExecutionState, - ) -> PolarsResult { + ) -> PolarsResult { Ok(partitioned) } } diff --git a/crates/polars-expr/src/expressions/window.rs b/crates/polars-expr/src/expressions/window.rs index f843c0e83d95..e15a301f68b4 100644 --- a/crates/polars-expr/src/expressions/window.rs +++ b/crates/polars-expr/src/expressions/window.rs @@ -371,7 +371,7 @@ impl PhysicalExpr for WindowExpr { // This first cached the group_by and the join tuples, but rayon under a mutex leads to deadlocks: // https://github.com/rayon-rs/rayon/issues/592 - fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { // This method does the following: // 1. determine group_by tuples based on the group_column // 2. 
apply an aggregation function @@ -400,7 +400,7 @@ impl PhysicalExpr for WindowExpr { if df.is_empty() { let field = self.phys_function.to_field(&df.schema())?; - return Ok(Series::full_null(field.name().clone(), 0, field.dtype())); + return Ok(Column::full_null(field.name().clone(), 0, field.dtype())); } let group_by_columns = self @@ -443,7 +443,7 @@ impl PhysicalExpr for WindowExpr { if let Some((order_by, options)) = &self.order_by { let order_by = order_by.evaluate(df, state)?; polars_ensure!(order_by.len() == df.height(), ShapeMismatch: "the order by expression evaluated to a length: {} that doesn't match the input DataFrame: {}", order_by.len(), df.height()); - groups = update_groups_sort_by(&groups, &order_by, options)? + groups = update_groups_sort_by(&groups, order_by.as_materialized_series(), options)? } let out: PolarsResult = Ok(groups); @@ -521,7 +521,7 @@ impl PhysicalExpr for WindowExpr { if let Some(name) = &self.out_name { out.rename(name.clone()); } - Ok(out) + Ok(out.into_column()) }, Explode => { let mut out = ac.aggregated().explode()?; @@ -529,7 +529,7 @@ impl PhysicalExpr for WindowExpr { if let Some(name) = &self.out_name { out.rename(name.clone()); } - Ok(out) + Ok(out.into_column()) }, Map => { // TODO! 
@@ -551,6 +551,7 @@ impl PhysicalExpr for WindowExpr { state, &cache_key, ) + .map(Column::from) }, Join => { let out_column = ac.aggregated(); @@ -566,7 +567,7 @@ impl PhysicalExpr for WindowExpr { // we take the group locations to directly map them to the right place (UpdateGroups::No, Some(out)) => { cache_gb(gb, state, &cache_key); - Ok(out) + Ok(out.into_column()) }, (_, _) => { let keys = gb.keys(); @@ -625,7 +626,7 @@ impl PhysicalExpr for WindowExpr { jt_map.insert(cache_key, join_opt_ids); } - Ok(out) + Ok(out.into_column()) }, } }, diff --git a/crates/polars-io/src/csv/read/options.rs b/crates/polars-io/src/csv/read/options.rs index 8c840e137d7a..cbbf796d45d7 100644 --- a/crates/polars-io/src/csv/read/options.rs +++ b/crates/polars-io/src/csv/read/options.rs @@ -298,7 +298,7 @@ impl CsvParseOptions { } /// Automatically try to parse dates/datetimes and time. If parsing fails, - /// columns remain of dtype `[DataType::String]`. + /// columns remain of dtype [`DataType::String`]. 
pub fn with_try_parse_dates(mut self, try_parse_dates: bool) -> Self { self.try_parse_dates = try_parse_dates; self diff --git a/crates/polars-io/src/shared.rs b/crates/polars-io/src/shared.rs index 4babd4f65bd5..1eea338f4788 100644 --- a/crates/polars-io/src/shared.rs +++ b/crates/polars-io/src/shared.rs @@ -13,7 +13,7 @@ pub trait SerReader where R: Read, { - /// Create a new instance of the `[SerReader]` + /// Create a new instance of the [`SerReader`] fn new(reader: R) -> Self; /// Make sure that all columns are contiguous in memory by diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index 78f8274fb079..3f8c64dd1970 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -47,6 +47,7 @@ parquet = [ "polars-pipe?/parquet", "polars-expr/parquet", "polars-mem-engine/parquet", + "polars-stream?/parquet", ] async = [ "polars-plan/async", @@ -54,11 +55,26 @@ async = [ "polars-pipe?/async", "polars-mem-engine/async", ] -cloud = ["async", "polars-pipe?/cloud", "polars-plan/cloud", "tokio", "futures", "polars-mem-engine/cloud"] +cloud = [ + "async", + "polars-pipe?/cloud", + "polars-plan/cloud", + "tokio", + "futures", + "polars-mem-engine/cloud", + "polars-stream?/cloud", +] cloud_write = ["cloud"] -ipc = ["polars-io/ipc", "polars-plan/ipc", "polars-pipe?/ipc", "polars-mem-engine/ipc"] -json = ["polars-io/json", "polars-plan/json", "polars-json", "polars-pipe?/json", "polars-mem-engine/json"] -csv = ["polars-io/csv", "polars-plan/csv", "polars-pipe?/csv", "polars-mem-engine/csv"] +ipc = ["polars-io/ipc", "polars-plan/ipc", "polars-pipe?/ipc", "polars-mem-engine/ipc", "polars-stream?/ipc"] +json = [ + "polars-io/json", + "polars-plan/json", + "polars-json", + "polars-pipe?/json", + "polars-mem-engine/json", + "polars-stream?/json", +] +csv = ["polars-io/csv", "polars-plan/csv", "polars-pipe?/csv", "polars-mem-engine/csv", "polars-stream?/csv"] temporal = [ "dtype-datetime", "dtype-date", @@ -223,7 +239,7 @@ 
string_reverse = ["polars-plan/string_reverse"] string_to_integer = ["polars-plan/string_to_integer"] arg_where = ["polars-plan/arg_where"] search_sorted = ["polars-plan/search_sorted"] -merge_sorted = ["polars-plan/merge_sorted"] +merge_sorted = ["polars-plan/merge_sorted", "polars-stream?/merge_sorted"] meta = ["polars-plan/meta"] pivot = ["polars-core/rows", "polars-ops/pivot", "polars-plan/pivot"] top_k = ["polars-plan/top_k"] diff --git a/crates/polars-lazy/src/dsl/list.rs b/crates/polars-lazy/src/dsl/list.rs index d73e4be5d13e..c706ee9b6957 100644 --- a/crates/polars-lazy/src/dsl/list.rs +++ b/crates/polars-lazy/src/dsl/list.rs @@ -69,7 +69,7 @@ fn run_per_sublist( let df = s.into_frame(); let out = phys_expr.evaluate(&df, &state); match out { - Ok(s) => Some(s), + Ok(s) => Some(s.take_materialized_series()), Err(e) => { *m_err.lock().unwrap() = Some(e); None @@ -90,7 +90,7 @@ fn run_per_sublist( let out = phys_expr.evaluate(&df_container, &state); df_container.clear_columns(); match out { - Ok(s) => Some(s), + Ok(s) => Some(s.take_materialized_series()), Err(e) => { err = Some(e); None diff --git a/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs b/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs index ad4d8cd1fb48..0700f5f767e7 100644 --- a/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs +++ b/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs @@ -36,7 +36,9 @@ impl PhysicalIoExpr for Wrap { } impl PhysicalPipedExpr for Wrap { fn evaluate(&self, chunk: &DataChunk, state: &ExecutionState) -> PolarsResult { - self.0.evaluate(&chunk.data, state) + self.0 + .evaluate(&chunk.data, state) + .map(|c| c.take_materialized_series()) } fn field(&self, input_schema: &Schema) -> PolarsResult { self.0.to_field(input_schema) diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 7127d64e87cb..dbfa989b5d05 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ 
b/crates/polars-lazy/src/scan/csv.rs @@ -137,7 +137,7 @@ impl LazyCsvReader { }) } - /// Set the `char` used as quote char. The default is `b'"'`. If set to `[None]` quoting is disabled. + /// Set the `char` used as quote char. The default is `b'"'`. If set to [`None`] quoting is disabled. #[must_use] pub fn with_quote_char(self, quote_char: Option) -> Self { self.map_parse_options(|opts| opts.with_quote_char(quote_char)) @@ -181,7 +181,7 @@ impl LazyCsvReader { } /// Automatically try to parse dates/datetimes and time. - /// If parsing fails, columns remain of dtype `[DataType::String]`. + /// If parsing fails, columns remain of dtype [`DataType::String`]. #[cfg(feature = "temporal")] pub fn with_try_parse_dates(self, try_parse_dates: bool) -> Self { self.map_parse_options(|opts| opts.with_try_parse_dates(try_parse_dates)) diff --git a/crates/polars-mem-engine/src/executors/filter.rs b/crates/polars-mem-engine/src/executors/filter.rs index 689674345760..417a7ecf766e 100644 --- a/crates/polars-mem-engine/src/executors/filter.rs +++ b/crates/polars-mem-engine/src/executors/filter.rs @@ -45,7 +45,7 @@ impl FilterExec { if self.has_window { state.clear_window_expr_cache() } - df.filter(series_to_mask(&s)?) + df.filter(series_to_mask(s.as_materialized_series())?) } fn execute_chunks( @@ -55,7 +55,7 @@ impl FilterExec { ) -> PolarsResult { let iter = chunks.into_par_iter().map(|df| { let s = self.predicate.evaluate(&df, state)?; - df.filter(series_to_mask(&s)?) + df.filter(series_to_mask(s.as_materialized_series())?) 
}); let df = POOL.install(|| iter.collect::>>())?; Ok(accumulate_dataframes_vertical_unchecked(df)) diff --git a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs index ad41378b3086..61cb9b10bc52 100644 --- a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs +++ b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs @@ -332,11 +332,7 @@ impl PartitionGroupByExec { .map(|(expr, partitioned_s)| { let agg_expr = expr.as_partitioned_aggregator().unwrap(); agg_expr - .finalize( - partitioned_s.as_materialized_series().clone(), - groups, - state, - ) + .finalize(partitioned_s.clone(), groups, state) .map(Column::from) }) .collect(); diff --git a/crates/polars-mem-engine/src/executors/join.rs b/crates/polars-mem-engine/src/executors/join.rs index 5edab8551ece..4fed3cb7a3ff 100644 --- a/crates/polars-mem-engine/src/executors/join.rs +++ b/crates/polars-mem-engine/src/executors/join.rs @@ -139,8 +139,8 @@ impl Executor for JoinExec { let df = df_left._join_impl( &df_right, - left_on_series, - right_on_series, + left_on_series.into_iter().map(|c| c.take_materialized_series()).collect(), + right_on_series.into_iter().map(|c| c.take_materialized_series()).collect(), self.args.clone(), true, state.verbose(), diff --git a/crates/polars-mem-engine/src/executors/projection_utils.rs b/crates/polars-mem-engine/src/executors/projection_utils.rs index 47464849582e..01dc5f362fd9 100644 --- a/crates/polars-mem-engine/src/executors/projection_utils.rs +++ b/crates/polars-mem-engine/src/executors/projection_utils.rs @@ -20,7 +20,7 @@ fn rolling_evaluate( df: &DataFrame, state: &ExecutionState, rolling: PlHashMap<&RollingGroupOptions, Vec>, -) -> PolarsResult>> { +) -> PolarsResult>> { POOL.install(|| { rolling .par_iter() @@ -51,7 +51,7 @@ fn window_evaluate( df: &DataFrame, state: &ExecutionState, window: PlHashMap>, -) -> PolarsResult>> { +) -> PolarsResult>> { POOL.install(|| { window 
.par_iter() @@ -99,7 +99,7 @@ fn execute_projection_cached_window_fns( df: &DataFrame, exprs: &[Arc], state: &ExecutionState, -) -> PolarsResult> { +) -> PolarsResult> { // We partition by normal expression and window expression // - the normal expressions can run in parallel // - the window expression take more memory and often use the same group_by keys and join tuples @@ -202,7 +202,7 @@ fn run_exprs_par( df: &DataFrame, exprs: &[Arc], state: &ExecutionState, -) -> PolarsResult> { +) -> PolarsResult> { POOL.install(|| { exprs .par_iter() @@ -215,7 +215,7 @@ fn run_exprs_seq( df: &DataFrame, exprs: &[Arc], state: &ExecutionState, -) -> PolarsResult> { +) -> PolarsResult> { exprs.iter().map(|expr| expr.evaluate(df, state)).collect() } @@ -225,7 +225,7 @@ pub(super) fn evaluate_physical_expressions( state: &ExecutionState, has_windows: bool, run_parallel: bool, -) -> PolarsResult> { +) -> PolarsResult> { let expr_runner = if has_windows { execute_projection_cached_window_fns } else if run_parallel && exprs.len() > 1 { @@ -246,7 +246,7 @@ pub(super) fn evaluate_physical_expressions( pub(super) fn check_expand_literals( df: &DataFrame, phys_expr: &[Arc], - mut selected_columns: Vec, + mut selected_columns: Vec, zero_length: bool, options: ProjectionOptions, ) -> PolarsResult { diff --git a/crates/polars-mem-engine/src/executors/stack.rs b/crates/polars-mem-engine/src/executors/stack.rs index ddeed0e8996b..ba6fa8111402 100644 --- a/crates/polars-mem-engine/src/executors/stack.rs +++ b/crates/polars-mem-engine/src/executors/stack.rs @@ -37,7 +37,12 @@ impl StackExec { self.options.run_parallel, )?; // We don't have to do a broadcast check as cse is not allowed to hit this. 
- df._add_series(res, schema)?; + df._add_series( + res.into_iter() + .map(|c| c.take_materialized_series()) + .collect(), + schema, + )?; Ok(df) }); @@ -94,7 +99,12 @@ impl StackExec { } } } - df._add_series(res, schema)?; + df._add_series( + res.into_iter() + .map(|v| v.take_materialized_series()) + .collect(), + schema, + )?; } df }; diff --git a/crates/polars-ops/src/chunked_array/gather/chunked.rs b/crates/polars-ops/src/chunked_array/gather/chunked.rs index 391837d52947..249e8dc1730a 100644 --- a/crates/polars-ops/src/chunked_array/gather/chunked.rs +++ b/crates/polars-ops/src/chunked_array/gather/chunked.rs @@ -88,6 +88,22 @@ fn prepare_series(s: &Series) -> Cow { phys } +impl TakeChunked for Column { + unsafe fn take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Self { + // @scalar-opt + let s = self.as_materialized_series(); + let s = unsafe { s.take_chunked_unchecked(by, sorted) }; + s.into_column() + } + + unsafe fn take_opt_chunked_unchecked(&self, by: &[ChunkId]) -> Self { + // @scalar-opt + let s = self.as_materialized_series(); + let s = unsafe { s.take_opt_chunked_unchecked(by) }; + s.into_column() + } +} + impl TakeChunked for Series { unsafe fn take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Self { let phys = prepare_series(self); diff --git a/crates/polars-ops/src/series/ops/idx.rs b/crates/polars-ops/src/series/ops/idx.rs deleted file mode 100644 index 07d6381435e2..000000000000 --- a/crates/polars-ops/src/series/ops/idx.rs +++ /dev/null @@ -1,22 +0,0 @@ -use num_traits::{FromPrimitive, Zero}; -use polars_core::prelude::*; -use polars_utils::index::ToIdx; - -fn prepare_gather_index_impl(ca: &ChunkedArray, length: usize) -> IdxCa -where T: PolarsNumericType, -T::Native: ToIdx -{ - T::Native::from_usize() - - ca.apply_generic(|v| { - v.and_then(|v|{ - if v < T::Native::zero() { - - } - - v.to_idx_size() - }) - }) -} - -pub fn convert_to_index(s: &Series, length: usize) diff --git 
a/crates/polars-ops/src/series/ops/replace.rs b/crates/polars-ops/src/series/ops/replace.rs index 7c5697429372..4aa84910239c 100644 --- a/crates/polars-ops/src/series/ops/replace.rs +++ b/crates/polars-ops/src/series/ops/replace.rs @@ -239,9 +239,11 @@ fn create_replacer(mut old: Series, mut new: Series, add_mask: bool) -> PolarsRe let len = old.len(); let cols = if add_mask { - // @scalar-opt - let mask = Column::new(PlSmallStr::from_static("__POLARS_REPLACE_MASK"), &[true]) - .new_from_index(0, new.len()); + let mask = Column::new_scalar( + PlSmallStr::from_static("__POLARS_REPLACE_MASK"), + true.into(), + new.len(), + ); vec![old.into(), new.into(), mask] } else { vec![old.into(), new.into()] diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index a88ff858e6ee..3d6b92aeba67 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -1822,6 +1822,7 @@ impl Expr { #[cfg(feature = "dtype-struct")] /// Count all unique values and create a struct mapping value to count. /// (Note that it is better to turn parallel off in the aggregation context). + /// The name of the struct field with the counts is given by the parameter `name`. pub fn value_counts(self, sort: bool, parallel: bool, name: &str, normalize: bool) -> Self { self.apply_private(FunctionExpr::ValueCounts { sort, @@ -1837,7 +1838,7 @@ impl Expr { #[cfg(feature = "unique_counts")] /// Returns a count of the unique values in the order of appearance. - /// This method differs from [`Expr::value_counts]` in that it does not return the + /// This method differs from [`Expr::value_counts`] in that it does not return the /// values, only the counts and might be faster. pub fn unique_counts(self) -> Self { self.apply_private(FunctionExpr::UniqueCounts) @@ -1967,10 +1968,10 @@ impl Expr { /// Apply a function/closure over multiple columns once the logical plan get executed. 
/// -/// This function is very similar to `[apply_mul]`, but differs in how it handles aggregations. +/// This function is very similar to [`apply_multiple`], but differs in how it handles aggregations. /// -/// * `map_mul` should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power` -/// * `apply_mul` should be used for operations that work on a group of data. e.g. `sum`, `count`, etc. +/// * [`map_multiple`] should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power` +/// * [`apply_multiple`] should be used for operations that work on a group of data. e.g. `sum`, `count`, etc. /// /// It is the responsibility of the caller that the schema is correct by giving /// the correct output_type. If None given the output type of the input expr is used. @@ -1995,11 +1996,11 @@ where /// Apply a function/closure over multiple columns once the logical plan get executed. /// -/// This function is very similar to `[apply_mul]`, but differs in how it handles aggregations. +/// This function is very similar to [`apply_multiple`], but differs in how it handles aggregations. /// -/// * `map_mul` should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power` -/// * `apply_mul` should be used for operations that work on a group of data. e.g. `sum`, `count`, etc. -/// * `map_list_mul` should be used when the function expects a list aggregated series. +/// * [`map_multiple`] should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power` +/// * [`apply_multiple`] should be used for operations that work on a group of data. e.g. `sum`, `count`, etc. +/// * [`map_list_multiple`] should be used when the function expects a list aggregated series. 
pub fn map_list_multiple(function: F, expr: E, output_type: GetOutput) -> Expr where F: Fn(&mut [Column]) -> PolarsResult> + 'static + Send + Sync, @@ -2025,10 +2026,10 @@ where /// It is the responsibility of the caller that the schema is correct by giving /// the correct output_type. If None given the output type of the input expr is used. /// -/// This difference with `[map_mul]` is that `[apply_mul]` will create a separate `[Series]` per group. +/// This difference with [`map_multiple`] is that [`apply_multiple`] will create a separate [`Series`] per group. /// -/// * `[map_mul]` should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power` -/// * `[apply_mul]` should be used for operations that work on a group of data. e.g. `sum`, `count`, etc. +/// * [`map_multiple`] should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power` +/// * [`apply_multiple`] should be used for operations that work on a group of data. e.g. `sum`, `count`, etc. pub fn apply_multiple( function: F, expr: E, diff --git a/crates/polars-plan/src/plans/aexpr/mod.rs b/crates/polars-plan/src/plans/aexpr/mod.rs index 286ea86ac968..e53dc50dc6d9 100644 --- a/crates/polars-plan/src/plans/aexpr/mod.rs +++ b/crates/polars-plan/src/plans/aexpr/mod.rs @@ -192,7 +192,7 @@ pub enum AExpr { /// Function arguments /// Some functions rely on aliases, /// for instance assignment of struct fields. - /// Therefor we need `[ExprIr]`. + /// Therefor we need [`ExprIr`]. 
input: Vec, /// function to apply function: FunctionExpr, diff --git a/crates/polars-plan/src/plans/functions/explode.rs b/crates/polars-plan/src/plans/functions/explode.rs deleted file mode 100644 index a5140d81103b..000000000000 --- a/crates/polars-plan/src/plans/functions/explode.rs +++ /dev/null @@ -1,5 +0,0 @@ -use super::*; - -pub(super) fn explode_impl(df: DataFrame, columns: &[PlSmallStr]) -> PolarsResult { - df.explode(columns) -} diff --git a/crates/polars-python/Cargo.toml b/crates/polars-python/Cargo.toml index 16af7a3071df..1f1624fa7b0f 100644 --- a/crates/polars-python/Cargo.toml +++ b/crates/polars-python/Cargo.toml @@ -22,7 +22,7 @@ polars-time = { workspace = true } polars-utils = { workspace = true } # TODO! remove this once truly activated. This is required to make sdist building work -polars-stream = { workspace = true } +# polars-stream = { workspace = true } ahash = { workspace = true } arboard = { workspace = true, optional = true } diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index 24db48144508..d3ebb376d10f 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ b/crates/polars-python/src/functions/lazy.rs @@ -469,6 +469,7 @@ pub fn lit(value: &Bound<'_, PyAny>, allow_object: bool, is_scalar: bool) -> PyR ) })?; match av { + #[cfg(feature = "object")] AnyValue::ObjectOwned(_) => { let s = Python::with_gil(|py| { PySeries::new_object(py, "", vec![ObjectValue::from(value.into_py(py))], false) diff --git a/crates/polars-python/src/functions/misc.rs b/crates/polars-python/src/functions/misc.rs index 2ade770d728e..1df25a222b16 100644 --- a/crates/polars-python/src/functions/misc.rs +++ b/crates/polars-python/src/functions/misc.rs @@ -66,5 +66,6 @@ pub fn register_plugin_function( #[pyfunction] pub fn __register_startup_deps() { + #[cfg(feature = "object")] crate::on_startup::register_startup_deps() } diff --git a/crates/polars-python/src/lazyframe/general.rs 
b/crates/polars-python/src/lazyframe/general.rs index 13529cfd9d1f..fd89884ece82 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -1130,6 +1130,7 @@ impl PyLazyFrame { ldf.tail(n).into() } + #[cfg(feature = "pivot")] #[pyo3(signature = (on, index, value_name, variable_name))] fn unpivot( &self, diff --git a/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs b/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs index 06a98e3fe970..e3425b52ccd9 100644 --- a/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs @@ -973,6 +973,7 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { StringFunction::ExtractMany { .. } => { return Err(PyNotImplementedError::new_err("extract_many")) }, + #[cfg(feature = "regex")] StringFunction::EscapeRegex => { (PyStringFunction::EscapeRegex.into_py(py),).to_object(py) }, @@ -1221,7 +1222,6 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { FunctionExpr::Mode => ("mode",).to_object(py), FunctionExpr::Skew(bias) => ("skew", bias).to_object(py), FunctionExpr::Kurtosis(fisher, bias) => ("kurtosis", fisher, bias).to_object(py), - #[cfg(feature = "dtype-array")] FunctionExpr::Reshape(_) => return Err(PyNotImplementedError::new_err("reshape")), #[cfg(feature = "repeat_by")] FunctionExpr::RepeatBy => ("repeat_by",).to_object(py), diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 28c5e459b1e5..05a56d920719 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -584,6 +584,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { columns.iter().map(|s| s.to_string()).collect::>(), ) .to_object(py), + #[cfg(feature = "pivot")] FunctionIR::Unpivot { args, schema: _ } => ( "unpivot", args.index.iter().map(|s| 
s.as_str()).collect::>(), diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs index f65822146d2c..b14285e77aa0 100644 --- a/crates/polars-python/src/series/general.rs +++ b/crates/polars-python/src/series/general.rs @@ -168,25 +168,15 @@ impl PySeries { } fn bitand(&self, other: &PySeries) -> PyResult { - let out = self - .series - .bitand(&other.series) - .map_err(PyPolarsErr::from)?; + let out = (&self.series & &other.series).map_err(PyPolarsErr::from)?; Ok(out.into()) } - fn bitor(&self, other: &PySeries) -> PyResult { - let out = self - .series - .bitor(&other.series) - .map_err(PyPolarsErr::from)?; + let out = (&self.series | &other.series).map_err(PyPolarsErr::from)?; Ok(out.into()) } fn bitxor(&self, other: &PySeries) -> PyResult { - let out = self - .series - .bitxor(&other.series) - .map_err(PyPolarsErr::from)?; + let out = (&self.series ^ &other.series).map_err(PyPolarsErr::from)?; Ok(out.into()) } diff --git a/crates/polars-stream/Cargo.toml b/crates/polars-stream/Cargo.toml index fc130a035140..c40f477ff741 100644 --- a/crates/polars-stream/Cargo.toml +++ b/crates/polars-stream/Cargo.toml @@ -16,7 +16,7 @@ futures = { workspace = true } memmap = { workspace = true } parking_lot = { workspace = true } pin-project-lite = { workspace = true } -polars-io = { workspace = true, features = ["async", "cloud", "aws"] } +polars-io = { workspace = true } polars-utils = { workspace = true } rand = { workspace = true } rayon = { workspace = true } @@ -26,11 +26,10 @@ tokio = { workspace = true } polars-core = { workspace = true } polars-error = { workspace = true } -polars-expr = { workspace = true, features = ["dtype-full"] } -# TODO: feature gate -polars-mem-engine = { workspace = true, features = ["parquet", "csv", "json", "ipc", "cloud", "python", "dtype-categorical", "dtype-i8", "dtype-i16", "dtype-u8", "dtype-u16", "dtype-decimal", "dtype-struct", "object"] } +polars-expr = { workspace = true } +polars-mem-engine 
= { workspace = true } polars-parquet = { workspace = true } -polars-plan = { workspace = true, features = ["parquet", "csv", "json", "ipc", "cloud", "python", "serde", "dtype-categorical", "dtype-i8", "dtype-i16", "dtype-u8", "dtype-u16", "dtype-decimal", "dtype-struct", "object"] } +polars-plan = { workspace = true } [build-dependencies] version_check = { workspace = true } @@ -41,6 +40,11 @@ bitwise = ["polars-core/bitwise", "polars-plan/bitwise", "polars-expr/bitwise"] merge_sorted = ["polars-plan/merge_sorted"] dynamic_group_by = [] strings = [] +ipc = ["polars-mem-engine/ipc", "polars-plan/ipc"] +parquet = ["polars-mem-engine/parquet", "polars-plan/parquet"] +csv = ["polars-mem-engine/csv", "polars-plan/csv"] +json = ["polars-mem-engine/json", "polars-plan/json"] +cloud = ["polars-mem-engine/cloud", "polars-plan/cloud", "polars-io/cloud"] # We need to specify default features here to match workspace defaults. # Otherwise we get warnings with cargo check/clippy. diff --git a/crates/polars-stream/src/expression.rs b/crates/polars-stream/src/expression.rs index 3c1b9445997c..197a28e265cc 100644 --- a/crates/polars-stream/src/expression.rs +++ b/crates/polars-stream/src/expression.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use polars_core::frame::DataFrame; -use polars_core::prelude::Series; +use polars_core::prelude::Column; use polars_error::PolarsResult; use polars_expr::prelude::{ExecutionState, PhysicalExpr}; @@ -21,7 +21,7 @@ impl StreamExpr { } } - pub async fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { + pub async fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { if self.reentrant { let state = state.clone(); let phys_expr = self.inner.clone(); diff --git a/crates/polars-stream/src/nodes/group_by.rs b/crates/polars-stream/src/nodes/group_by.rs index a2a2ae0d4d96..fb91a2965ac5 100644 --- a/crates/polars-stream/src/nodes/group_by.rs +++ b/crates/polars-stream/src/nodes/group_by.rs @@ -85,7 +85,10 @@ impl 
GroupBySinkState { // SAFETY: we resize the reduction to the number of groups beforehand. reduction.resize(local.grouper.num_groups()); reduction.update_groups( - &selector.evaluate(&df, state).await?, + selector + .evaluate(&df, state) + .await? + .as_materialized_series(), &group_idxs, )?; } diff --git a/crates/polars-stream/src/nodes/io_sinks/mod.rs b/crates/polars-stream/src/nodes/io_sinks/mod.rs index ce14ad3b0f7a..cc1682199a2a 100644 --- a/crates/polars-stream/src/nodes/io_sinks/mod.rs +++ b/crates/polars-stream/src/nodes/io_sinks/mod.rs @@ -1 +1,2 @@ +#[cfg(feature = "ipc")] pub mod ipc; diff --git a/crates/polars-stream/src/nodes/mod.rs b/crates/polars-stream/src/nodes/mod.rs index 559e4717c4e9..4fb42daddd6b 100644 --- a/crates/polars-stream/src/nodes/mod.rs +++ b/crates/polars-stream/src/nodes/mod.rs @@ -8,6 +8,7 @@ pub mod io_sinks; pub mod map; pub mod multiplexer; pub mod ordered_union; +#[cfg(feature = "parquet")] pub mod parquet_source; pub mod reduce; pub mod select; diff --git a/crates/polars-stream/src/nodes/reduce.rs b/crates/polars-stream/src/nodes/reduce.rs index 565854e97b81..8a863050be9b 100644 --- a/crates/polars-stream/src/nodes/reduce.rs +++ b/crates/polars-stream/src/nodes/reduce.rs @@ -64,7 +64,7 @@ impl ReduceNode { while let Ok(morsel) = recv.recv().await { for (reducer, selector) in local_reducers.iter_mut().zip(selectors) { let input = selector.evaluate(morsel.df(), state).await?; - reducer.update_group(&input, 0)?; + reducer.update_group(input.as_materialized_series(), 0)?; } } diff --git a/crates/polars-stream/src/physical_plan/fmt.rs b/crates/polars-stream/src/physical_plan/fmt.rs index ed0f08a0d48f..e0735144da79 100644 --- a/crates/polars-stream/src/physical_plan/fmt.rs +++ b/crates/polars-stream/src/physical_plan/fmt.rs @@ -99,9 +99,13 @@ fn visualize_plan_rec( PhysNodeKind::FileSink { input, file_type, .. 
} => match file_type { + #[cfg(feature = "parquet")] FileType::Parquet(_) => ("parquet-sink".to_string(), from_ref(input)), + #[cfg(feature = "ipc")] FileType::Ipc(_) => ("ipc-sink".to_string(), from_ref(input)), + #[cfg(feature = "csv")] FileType::Csv(_) => ("csv-sink".to_string(), from_ref(input)), + #[cfg(feature = "json")] FileType::Json(_) => ("json-sink".to_string(), from_ref(input)), }, PhysNodeKind::InMemoryMap { input, map: _ } => { @@ -140,9 +144,13 @@ fn visualize_plan_rec( file_options, } => { let name = match scan_type { + #[cfg(feature = "parquet")] FileScan::Parquet { .. } => "parquet-source", + #[cfg(feature = "csv")] FileScan::Csv { .. } => "csv-source", + #[cfg(feature = "ipc")] FileScan::Ipc { .. } => "ipc-source", + #[cfg(feature = "json")] FileScan::NDJson { .. } => "ndjson-source", FileScan::Anonymous { .. } => "anonymous-source", }; diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index 485bbf03a7fe..d57a8667c479 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -212,6 +212,7 @@ pub fn lower_ir( let file_type = file_type.clone(); match file_type { + #[cfg(feature = "ipc")] FileType::Ipc(_) => { let phys_input = lower_ir!(*input)?; PhysNodeKind::FileSink { @@ -223,6 +224,7 @@ pub fn lower_ir( _ => todo!(), } }, + #[cfg(feature = "cloud")] SinkType::Cloud { .. 
} => todo!(), }, diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs index d9253e48dfa5..472cf982a253 100644 --- a/crates/polars-stream/src/physical_plan/to_graph.rs +++ b/crates/polars-stream/src/physical_plan/to_graph.rs @@ -213,6 +213,7 @@ fn to_graph_rec<'a>( let input_key = to_graph_rec(*input, ctx)?; match file_type { + #[cfg(feature = "ipc")] FileType::Ipc(ipc_writer_options) => ctx.graph.add_node( nodes::io_sinks::ipc::IpcSinkNode::new(input_schema, path, ipc_writer_options)?, [input_key], @@ -341,6 +342,7 @@ fn to_graph_rec<'a>( use polars_plan::prelude::FileScan; match scan_type { + #[cfg(feature = "parquet")] FileScan::Parquet { options, cloud_options, diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 685ed71d8306..9ff45610a3c7 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -311,6 +311,7 @@ dtype-array = [ "polars-core/dtype-array", "polars-lazy?/dtype-array", "polars-ops/dtype-array", + "polars-plan?/dtype-array", ] dtype-i8 = [ "polars-core/dtype-i8", @@ -414,6 +415,8 @@ docs-selection = [ "dynamic_group_by", "extract_groups", "replace", + "approx_unique", + "unique_counts", ] bench = [ diff --git a/py-polars/polars/_utils/udfs.py b/py-polars/polars/_utils/udfs.py index 0ff968ed59ec..ed91c1920cc9 100644 --- a/py-polars/polars/_utils/udfs.py +++ b/py-polars/polars/_utils/udfs.py @@ -183,11 +183,15 @@ class OpNames: "endswith": "str.ends_with", "lower": "str.to_lowercase", "lstrip": "str.strip_chars_start", + "removeprefix": "str.strip_prefix", + "removesuffix": "str.strip_suffix", + "replace": "str.replace", "rstrip": "str.strip_chars_end", "startswith": "str.starts_with", "strip": "str.strip_chars", "title": "str.to_titlecase", "upper": "str.to_uppercase", + "zfill": "str.zfill", # temporal "date": "dt.date", "isoweekday": "dt.weekday", @@ -983,7 +987,7 @@ def _rewrite_methods( """Replace python method calls with synthetic POLARS_EXPRESSION 
op.""" LOAD_METHOD = OpNames.LOAD_ATTR if _MIN_PY312 else {"LOAD_METHOD"} if matching_instructions := ( - # method call with one basic arg, eg: "s.endswith('!')" + # method call with one arg, eg: "s.endswith('!')" self._matches( idx, opnames=[LOAD_METHOD, {"LOAD_CONST"}, OpNames.CALL], @@ -1016,6 +1020,47 @@ def _rewrite_methods( px = inst._replace(opname="POLARS_EXPRESSION", argval=expr, argrepr=expr) updated_instructions.append(px) + elif matching_instructions := ( + # method call with three args, eg: "s.replace('!','?',count=2)" + self._matches( + idx, + opnames=[ + LOAD_METHOD, + {"LOAD_CONST"}, + {"LOAD_CONST"}, + {"LOAD_CONST"}, + OpNames.CALL, + ], + argvals=[_PYTHON_METHODS_MAP], + ) + or + # method call with two args, eg: "s.replace('!','?')" + self._matches( + idx, + opnames=[LOAD_METHOD, {"LOAD_CONST"}, {"LOAD_CONST"}, OpNames.CALL], + argvals=[_PYTHON_METHODS_MAP], + ) + ): + inst = matching_instructions[0] + expr = _PYTHON_METHODS_MAP[inst.argval] + + param_values = [ + i.argval + for i in matching_instructions[1 : len(matching_instructions) - 1] + ] + if expr == "str.replace": + if len(param_values) == 3: + old, new, count = param_values + expr += f"({old!r},{new!r},n={count},literal=True)" + else: + old, new = param_values + expr = f"str.replace_all({old!r},{new!r},literal=True)" + else: + expr += f"({','.join(repr(v) for v in param_values)})" + + px = inst._replace(opname="POLARS_EXPRESSION", argval=expr, argrepr=expr) + updated_instructions.append(px) + return len(matching_instructions) @staticmethod diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 3403f8c12dac..4ff2752fdfb5 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -162,6 +162,7 @@ ParquetCompression, PivotAgg, PolarsDataType, + PythonDataType, RollingInterpolationMethod, RowTotalsDefinition, SchemaDefinition, @@ -7620,7 +7621,9 @@ def drop_in_place(self, name: str) -> Series: def cast( self, dtypes: ( 
- Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType] + Mapping[ + ColumnNameOrSelector | PolarsDataType, PolarsDataType | PythonDataType + ] | PolarsDataType ), *, diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index daebcf452c1e..0483d058180b 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -1036,7 +1036,7 @@ def scan_csv( decimal_comma: bool = False, glob: bool = True, storage_options: dict[str, Any] | None = None, - credential_provider: CredentialProviderFunction | Literal["auto"] | None = None, + credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto", retries: int = 2, file_cache_ttl: int | None = None, include_file_paths: str | None = None, diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 1348134fc0d6..b8af12ae8806 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -364,7 +364,7 @@ def scan_ipc( row_index_name: str | None = None, row_index_offset: int = 0, storage_options: dict[str, Any] | None = None, - credential_provider: CredentialProviderFunction | Literal["auto"] | None = None, + credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto", memory_map: bool = True, retries: int = 2, file_cache_ttl: int | None = None, diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 983b8cddcfe1..7da4635408d1 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -38,7 +38,7 @@ def read_ndjson( row_index_offset: int = 0, ignore_errors: bool = False, storage_options: dict[str, Any] | None = None, - credential_provider: CredentialProviderFunction | Literal["auto"] | None = None, + credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto", retries: int = 2, file_cache_ttl: int | None = None, include_file_paths: str | None = None, @@ -206,7 +206,7 @@ def 
scan_ndjson( row_index_offset: int = 0, ignore_errors: bool = False, storage_options: dict[str, Any] | None = None, - credential_provider: CredentialProviderFunction | Literal["auto"] | None = None, + credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto", retries: int = 2, file_cache_ttl: int | None = None, include_file_paths: str | None = None, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 16ff7f614349..0cc91a94693f 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -54,7 +54,7 @@ def read_parquet( rechunk: bool = False, low_memory: bool = False, storage_options: dict[str, Any] | None = None, - credential_provider: CredentialProviderFunction | Literal["auto"] | None = None, + credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto", retries: int = 2, use_pyarrow: bool = False, pyarrow_options: dict[str, Any] | None = None, @@ -338,7 +338,7 @@ def scan_parquet( low_memory: bool = False, cache: bool = True, storage_options: dict[str, Any] | None = None, - credential_provider: CredentialProviderFunction | Literal["auto"] | None = None, + credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto", retries: int = 2, include_file_paths: str | None = None, allow_missing_columns: bool = False, diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 9b438ae8dbfa..3cc0cb85fb8c 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -111,6 +111,7 @@ Label, Orientation, PolarsDataType, + PythonDataType, RollingInterpolationMethod, SchemaDefinition, SchemaDict, @@ -2899,7 +2900,9 @@ def cache(self) -> LazyFrame: def cast( self, dtypes: ( - Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType] + Mapping[ + ColumnNameOrSelector | PolarsDataType, PolarsDataType | PythonDataType + ] | PolarsDataType ), *, @@ -2979,6 
+2982,7 @@ def cast( 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} """ if not isinstance(dtypes, Mapping): + dtypes = parse_into_dtype(dtypes) return self._from_pyldf(self._ldf.cast_all(dtypes, strict)) cast_map = {} diff --git a/py-polars/polars/schema.py b/py-polars/polars/schema.py index 81ade5a6b206..fb1b8268bf2f 100644 --- a/py-polars/polars/schema.py +++ b/py-polars/polars/schema.py @@ -56,31 +56,37 @@ class Schema(BaseSchema): Parameters ---------- schema - The schema definition given by column names and their associated *instantiated* + The schema definition given by column names and their associated Polars data type. Accepts a mapping or an iterable of tuples. Examples -------- - Define a schema by passing *instantiated* data types. - - >>> schema = pl.Schema({"foo": pl.Int8(), "bar": pl.String()}) + Define a schema by passing instantiated data types. + + >>> schema = pl.Schema( + ... { + ... "foo": pl.String(), + ... "bar": pl.Duration("us"), + ... "baz": pl.Array(pl.Int8, 4), + ... } + ... ) >>> schema - Schema({'foo': Int8, 'bar': String}) + Schema({'foo': String, 'bar': Duration(time_unit='us'), 'baz': Array(Int8, shape=(4,))}) Access the data type associated with a specific column name. - >>> schema["foo"] - Int8 + >>> schema["baz"] + Array(Int8, shape=(4,)) Access various schema properties using the `names`, `dtypes`, and `len` methods. >>> schema.names() - ['foo', 'bar'] + ['foo', 'bar', 'baz'] >>> schema.dtypes() - [Int8, String] + [String, Duration(time_unit='us'), Array(Int8, shape=(4,))] >>> schema.len() - 2 - """ + 3 + """ # noqa: W505 def __init__( self, @@ -123,15 +129,41 @@ def __setitem__( super().__setitem__(name, dtype) def names(self) -> list[str]: - """Get the column names of the schema.""" + """ + Get the column names of the schema. 
+ + Examples + -------- + >>> s = pl.Schema({"x": pl.Float64(), "y": pl.Datetime(time_zone="UTC")}) + >>> s.names() + ['x', 'y'] + """ return list(self.keys()) def dtypes(self) -> list[DataType]: - """Get the data types of the schema.""" + """ + Get the data types of the schema. + + Examples + -------- + >>> s = pl.Schema({"x": pl.UInt8(), "y": pl.List(pl.UInt8)}) + >>> s.dtypes() + [UInt8, List(UInt8)] + """ return list(self.values()) def len(self) -> int: - """Get the number of columns in the schema.""" + """ + Get the number of schema entries. + + Examples + -------- + >>> s = pl.Schema({"x": pl.Int32(), "y": pl.List(pl.String)}) + >>> s.len() + 2 + >>> len(s) + 2 + """ return len(self) def to_python(self) -> dict[str, type]: @@ -140,7 +172,13 @@ def to_python(self) -> dict[str, type]: Examples -------- - >>> s = pl.Schema({"x": pl.Int8(), "y": pl.String(), "z": pl.Duration("ms")}) + >>> s = pl.Schema( + ... { + ... "x": pl.Int8(), + ... "y": pl.String(), + ... "z": pl.Duration("us"), + ... } + ... 
) >>> s.to_python() {'x': , 'y': , 'z': } """ diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 934cad33d1d7..d86c9d29cd0f 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4967,14 +4967,14 @@ def round_sig_figs(self, digits: int) -> Series: Examples -------- - >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s = pl.Series([0.01234, 3.333, 3450.0]) >>> s.round_sig_figs(2) shape: (3,) Series: '' [f64] [ 0.012 3.3 - 1200.0 + 3500.0 ] """ diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt index e89a8e19c0b6..9225aa8a690e 100644 --- a/py-polars/requirements-dev.txt +++ b/py-polars/requirements-dev.txt @@ -62,8 +62,8 @@ hypothesis # ------- pytest==8.3.2 -pytest-codspeed==2.2.1 -pytest-cov==5.0.0 +pytest-codspeed==3.0.0 +pytest-cov==6.0.0 pytest-xdist==3.6.1 # Need moto.server to mock s3fs - see: https://github.com/aio-libs/aiobotocore/issues/755 diff --git a/py-polars/requirements-lint.txt b/py-polars/requirements-lint.txt index b6c173bf8320..9c216431c84c 100644 --- a/py-polars/requirements-lint.txt +++ b/py-polars/requirements-lint.txt @@ -1,3 +1,3 @@ mypy[faster-cache]==1.13.0 ruff==0.7.1 -typos==1.26.8 +typos==1.27.2 diff --git a/py-polars/tests/unit/constructors/test_dataframe.py b/py-polars/tests/unit/constructors/test_dataframe.py index 251ec5e7bce2..e885919294d1 100644 --- a/py-polars/tests/unit/constructors/test_dataframe.py +++ b/py-polars/tests/unit/constructors/test_dataframe.py @@ -59,7 +59,7 @@ def test_df_init_from_generator_dict_view() -> None: data = { "keys": d.keys(), "vals": d.values(), - "itms": d.items(), + "items": d.items(), } with pytest.raises(TypeError, match="unexpected value"): pl.DataFrame(data, strict=True) @@ -68,12 +68,12 @@ def test_df_init_from_generator_dict_view() -> None: assert df.schema == { "keys": pl.Int64, "vals": pl.String, - "itms": pl.List(pl.String), + "items": pl.List(pl.String), } assert df.to_dict(as_series=False) 
== { "keys": [0, 1, 2], "vals": ["x", "y", "z"], - "itms": [["0", "x"], ["1", "y"], ["2", "z"]], + "items": [["0", "x"], ["1", "y"], ["2", "z"]], } @@ -86,19 +86,19 @@ def test_df_init_from_generator_reversed_dict_view() -> None: data = { "rev_keys": reversed(d.keys()), "rev_vals": reversed(d.values()), - "rev_itms": reversed(d.items()), + "rev_items": reversed(d.items()), } - df = pl.DataFrame(data, schema_overrides={"rev_itms": pl.Object}) + df = pl.DataFrame(data, schema_overrides={"rev_items": pl.Object}) assert df.schema == { "rev_keys": pl.Int64, "rev_vals": pl.String, - "rev_itms": pl.Object, + "rev_items": pl.Object, } assert df.to_dict(as_series=False) == { "rev_keys": [2, 1, 0], "rev_vals": ["z", "y", "x"], - "rev_itms": [(2, "z"), (1, "y"), (0, "x")], + "rev_items": [(2, "z"), (1, "y"), (0, "x")], } diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index d8910cda4fb2..c375e1952347 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -736,7 +736,10 @@ def test_concat() -> None: def test_arg_where() -> None: s = pl.Series([True, False, True, False]) - assert_series_equal(pl.arg_where(s, eager=True).cast(int), pl.Series([0, 2])) + assert_series_equal( + pl.arg_where(s, eager=True).cast(int), + pl.Series([0, 2]), + ) def test_to_dummies() -> None: @@ -1060,7 +1063,7 @@ def test_cast_frame() -> None: # cast via col:dtype map assert df.cast( - dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")} + dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")}, ).schema == { "a": pl.Float64, "b": pl.Float32, @@ -1068,6 +1071,16 @@ def test_cast_frame() -> None: "d": pl.Datetime("ms"), } + # cast via col:pytype map + assert df.cast( + dtypes={"b": float, "c": str, "d": datetime}, + ).schema == { + "a": pl.Float64, + "b": pl.Float64, + "c": pl.String, + "d": pl.Datetime("us"), + } + # cast via selector:dtype map assert df.cast( { diff --git 
a/py-polars/tests/unit/datatypes/test_array.py b/py-polars/tests/unit/datatypes/test_array.py index b578266b0c6f..df7a5c5cc4ec 100644 --- a/py-polars/tests/unit/datatypes/test_array.py +++ b/py-polars/tests/unit/datatypes/test_array.py @@ -383,3 +383,16 @@ def test_zero_width_array(fn: str) -> None: df = pl.concat([a.to_frame(), b.to_frame()], how="horizontal") df.select(c=expr_f(pl.col.a, pl.col.b)) + + +def test_elementwise_arithmetic_19682() -> None: + dt = pl.Array(pl.Int64, (2, 3)) + + a = pl.Series("a", [[[1, 2, 3], [4, 5, 6]]], dt) + sc = pl.Series("a", [1]) + zfa = pl.Series("a", [[]], pl.Array(pl.Int64, 0)) + + assert_series_equal(a + a, pl.Series("a", [[[2, 4, 6], [8, 10, 12]]], dt)) + assert_series_equal(a + sc, pl.Series("a", [[[2, 3, 4], [5, 6, 7]]], dt)) + assert_series_equal(sc + a, pl.Series("a", [[[2, 3, 4], [5, 6, 7]]], dt)) + assert_series_equal(zfa + zfa, pl.Series("a", [[]], pl.Array(pl.Int64, 0))) diff --git a/py-polars/tests/unit/io/cloud/test_cloud.py b/py-polars/tests/unit/io/cloud/test_cloud.py index f943ab5e2c26..54d1b5ccd6a6 100644 --- a/py-polars/tests/unit/io/cloud/test_cloud.py +++ b/py-polars/tests/unit/io/cloud/test_cloud.py @@ -1,3 +1,5 @@ +from functools import partial + import pytest import polars as pl @@ -11,6 +13,11 @@ def test_scan_nonexistent_cloud_path_17444(format: str) -> None: path_str = f"s3://my-nonexistent-bucket/data.{format}" scan_function = getattr(pl, f"scan_{format}") + # Prevent automatic credential provideder instantiation, otherwise CI may fail with + # * pytest.PytestUnraisableExceptionWarning: + # * Exception ignored: + # * ResourceWarning: unclosed socket + scan_function = partial(scan_function, credential_provider=None) # Just calling the scan function should not raise any errors if format == "ndjson": diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 8ea7d2152bc0..4fc9e0321a66 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ 
b/py-polars/tests/unit/io/test_parquet.py @@ -2356,6 +2356,7 @@ def test_nested_dicts(content: list[float | None]) -> None: [i if i % 7 < 3 and i % 5 > 3 else None for i in range(57)], ], ) +@pytest.mark.slow def test_dict_slices( leading_nulls: list[None], trailing_nulls: list[None], diff --git a/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py b/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py index 74946f084d51..61416054755d 100644 --- a/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py +++ b/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py @@ -176,6 +176,26 @@ """lambda x: x.lstrip().startswith(('!','#','?',"'"))""", """pl.col("b").str.strip_chars_start().str.contains(r"^(!|\\#|\\?|')")""", ), + ( + "b", + "lambda x: x.replace(':','')", + """pl.col("b").str.replace_all(':','',literal=True)""", + ), + ( + "b", + "lambda x: x.replace(':','',2)", + """pl.col("b").str.replace(':','',n=2,literal=True)""", + ), + ( + "b", + "lambda x: x.removeprefix('A').removesuffix('F')", + """pl.col("b").str.strip_prefix('A').str.strip_suffix('F')""", + ), + ( + "b", + "lambda x: x.zfill(8)", + """pl.col("b").str.zfill(8)""", + ), # --------------------------------------------- # json expr: load/extract # --------------------------------------------- diff --git a/py-polars/tests/unit/operations/test_cast.py b/py-polars/tests/unit/operations/test_cast.py index dca9eeb3e767..4e8dae9b2d38 100644 --- a/py-polars/tests/unit/operations/test_cast.py +++ b/py-polars/tests/unit/operations/test_cast.py @@ -13,12 +13,13 @@ from polars.testing.asserts.series import assert_series_equal if TYPE_CHECKING: - from polars._typing import PolarsDataType + from polars._typing import PolarsDataType, PythonDataType -def test_string_date() -> None: +@pytest.mark.parametrize("dtype", [pl.Date(), pl.Date, date]) +def test_string_date(dtype: PolarsDataType | PythonDataType) -> None: df = pl.DataFrame({"x1": 
["2021-01-01"]}).with_columns( - **{"x1-date": pl.col("x1").cast(pl.Date)} + **{"x1-date": pl.col("x1").cast(dtype)} ) expected = pl.DataFrame({"x1-date": [date(2021, 1, 1)]}) out = df.select(pl.col("x1-date")) @@ -668,9 +669,10 @@ def test_bool_numeric_supertype(dtype: PolarsDataType) -> None: assert result.item() - 0.3333333 <= 0.00001 -def test_cast_consistency() -> None: +@pytest.mark.parametrize("dtype", [pl.String(), pl.String, str]) +def test_cast_consistency(dtype: PolarsDataType | PythonDataType) -> None: assert pl.DataFrame().with_columns(a=pl.lit(0.0)).with_columns( - b=pl.col("a").cast(pl.String), c=pl.lit(0.0).cast(pl.String) + b=pl.col("a").cast(dtype), c=pl.lit(0.0).cast(dtype) ).to_dict(as_series=False) == {"a": [0.0], "b": ["0.0"], "c": ["0.0"]}