perf: fix accidental quadratic behavior; cache null_count (#11889)
ritchie46 authored Oct 20, 2023
1 parent 5425f6a commit c69722d
Showing 14 changed files with 53 additions and 9 deletions.
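The central change: `ChunkedArray` previously recomputed its null count by walking every chunk on each `null_count()` call, so code that queried it while growing an array chunk-by-chunk did O(n²) work in total. This commit stores the count in a new `null_count` field that is set by the builders, kept in sync on `append`, and recomputed by `compute_len()`. The following self-contained Rust sketch (not Polars code; all names are illustrative) shows the pattern and why caching removes the quadratic behavior.

```rust
// Minimal sketch of the accidental quadratic pattern this commit removes:
// if `null_count()` walks every chunk on each call, a loop that appends a
// chunk and then queries the null count does O(k) work on the k-th
// iteration, i.e. O(n^2) overall. Caching the count makes each query O(1).
struct Chunk {
    null_count: usize,
}

struct ChunkedArrayModel {
    chunks: Vec<Chunk>,
    cached_null_count: usize, // maintained incrementally, as in this commit
}

impl ChunkedArrayModel {
    fn new() -> Self {
        Self { chunks: Vec::new(), cached_null_count: 0 }
    }

    // Old behavior: recompute by summing over all chunks on every call.
    fn null_count_recomputed(&self) -> usize {
        self.chunks.iter().map(|c| c.null_count).sum()
    }

    // New behavior: return the cached value in O(1).
    fn null_count_cached(&self) -> usize {
        self.cached_null_count
    }

    // Keep the cache in sync whenever a chunk is appended (cf. append.rs below).
    fn append(&mut self, chunk: Chunk) {
        self.cached_null_count += chunk.null_count;
        self.chunks.push(chunk);
    }
}

fn main() {
    let mut ca = ChunkedArrayModel::new();
    for i in 0..1_000usize {
        ca.append(Chunk { null_count: i % 2 });
        // With the recomputed version this loop is O(n^2); with the cache it is O(n).
        assert_eq!(ca.null_count_recomputed(), ca.null_count_cached());
    }
}
```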
6 changes: 5 additions & 1 deletion crates/polars-core/src/chunked_array/builder/binary.rs
@@ -1,3 +1,5 @@
+use polars_error::constants::LENGTH_LIMIT_MSG;
+
 use super::*;
 
 pub struct BinaryChunkedBuilder {
@@ -40,14 +42,16 @@ impl BinaryChunkedBuilder {
 
     pub fn finish(mut self) -> BinaryChunked {
        let arr = self.builder.as_box();
-        let length = arr.len() as IdxSize;
+        let length = IdxSize::try_from(arr.len()).expect(LENGTH_LIMIT_MSG);
+        let null_count = arr.null_count() as IdxSize;
 
         ChunkedArray {
             field: Arc::new(self.field),
             chunks: vec![arr],
             phantom: PhantomData,
             bit_settings: Default::default(),
             length,
+            null_count,
         }
     }
 
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/builder/boolean.rs
@@ -21,14 +21,14 @@ impl ChunkedBuilder<bool, BooleanType> for BooleanChunkedBuilder {
 
     fn finish(mut self) -> BooleanChunked {
         let arr = self.array_builder.as_box();
-        let length = arr.len() as IdxSize;
 
         let mut ca = ChunkedArray {
             field: Arc::new(self.field),
             chunks: vec![arr],
             phantom: PhantomData,
             bit_settings: Default::default(),
-            length,
+            length: 0,
+            null_count: 0,
         };
         ca.compute_len();
         ca
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/builder/primitive.rs
@@ -27,13 +27,13 @@ where
 
     fn finish(mut self) -> ChunkedArray<T> {
         let arr = self.array_builder.as_box();
-        let length = arr.len() as IdxSize;
         let mut ca = ChunkedArray {
             field: Arc::new(self.field),
             chunks: vec![arr],
             phantom: PhantomData,
             bit_settings: Default::default(),
-            length,
+            length: 0,
+            null_count: 0,
         };
         ca.compute_len();
         ca
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/builder/utf8.rs
@@ -41,14 +41,14 @@ impl Utf8ChunkedBuilder {
 
     pub fn finish(mut self) -> Utf8Chunked {
         let arr = self.builder.as_box();
-        let length = arr.len() as IdxSize;
 
         let mut ca = ChunkedArray {
             field: Arc::new(self.field),
             chunks: vec![arr],
             phantom: PhantomData,
             bit_settings: Default::default(),
-            length,
+            length: 0,
+            null_count: 0,
         };
         ca.compute_len();
         ca
11 changes: 10 additions & 1 deletion crates/polars-core/src/chunked_array/from.rs
@@ -1,3 +1,5 @@
+use polars_error::constants::LENGTH_LIMIT_MSG;
+
 use super::*;
 
 #[allow(clippy::all)]
@@ -143,10 +145,12 @@ where
         );
 
         let mut length = 0;
+        let mut null_count = 0;
         let chunks = chunks
             .into_iter()
             .map(|x| {
                 length += x.len();
+                null_count += x.null_count();
                 Box::new(x) as Box<dyn Array>
             })
             .collect();
@@ -156,7 +160,8 @@ where
             chunks,
             phantom: PhantomData,
             bit_settings: Default::default(),
-            length: length.try_into().unwrap(),
+            length: length.try_into().expect(LENGTH_LIMIT_MSG),
+            null_count: null_count as IdxSize,
         }
     }
 
@@ -184,6 +189,7 @@ where
             phantom: PhantomData,
             bit_settings: Default::default(),
             length: 0,
+            null_count: 0,
         };
         out.compute_len();
         out
@@ -213,6 +219,7 @@ where
             phantom: PhantomData,
             bit_settings: Default::default(),
             length: 0,
+            null_count: 0,
         };
         out.compute_len();
         out
@@ -235,6 +242,7 @@ where
             phantom: PhantomData,
             bit_settings,
             length: 0,
+            null_count: 0,
         };
         out.compute_len();
         if !keep_sorted {
@@ -258,6 +266,7 @@ where
             phantom: PhantomData,
             bit_settings: Default::default(),
             length: 0,
+            null_count: 0,
         };
         out.compute_len();
         out
5 changes: 4 additions & 1 deletion crates/polars-core/src/chunked_array/mod.rs
@@ -140,6 +140,7 @@ pub struct ChunkedArray<T: PolarsDataType> {
     phantom: PhantomData<T>,
     pub(crate) bit_settings: Settings,
     length: IdxSize,
+    null_count: IdxSize,
 }
 
 bitflags! {
@@ -303,6 +304,7 @@ impl<T: PolarsDataType> ChunkedArray<T> {
     ///
     /// # Safety
     /// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks.
+    /// And the `null_count` remains correct.
     #[inline]
     pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
         &mut self.chunks
@@ -316,7 +318,7 @@ impl<T: PolarsDataType> ChunkedArray<T> {
     /// Count the null values.
     #[inline]
     pub fn null_count(&self) -> usize {
-        self.chunks.iter().map(|arr| arr.null_count()).sum()
+        self.null_count as usize
     }
 
     /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
@@ -610,6 +612,7 @@ impl<T: PolarsDataType> Clone for ChunkedArray<T> {
             phantom: PhantomData,
             bit_settings: self.bit_settings,
             length: self.length,
+            null_count: self.null_count,
         }
     }
 }
6 changes: 6 additions & 0 deletions crates/polars-core/src/chunked_array/object/builder.rs
@@ -59,6 +59,10 @@ where
         let null_bitmap: Option<Bitmap> = self.bitmask_builder.into();
 
         let len = self.values.len();
+        let null_count = null_bitmap
+            .as_ref()
+            .map(|validity| validity.unset_bits())
+            .unwrap_or(0) as IdxSize;
 
         let arr = Box::new(ObjectArray {
             values: Arc::new(self.values),
@@ -72,6 +76,7 @@ where
             phantom: PhantomData,
             bit_settings: Default::default(),
             length: len as IdxSize,
+            null_count,
         }
     }
 }
@@ -136,6 +141,7 @@ where
             phantom: PhantomData,
             bit_settings: Default::default(),
             length: len as IdxSize,
+            null_count: 0,
         }
     }
 
4 changes: 4 additions & 0 deletions crates/polars-core/src/chunked_array/ops/append.rs
@@ -78,6 +78,7 @@ where
         update_sorted_flag_before_append::<T>(self, other);
         let len = self.len();
         self.length += other.length;
+        self.null_count += other.null_count;
         new_chunks(&mut self.chunks, &other.chunks, len);
     }
 }
@@ -90,6 +91,7 @@ impl ListChunked {
 
         let len = self.len();
         self.length += other.length;
+        self.null_count += other.null_count;
         new_chunks(&mut self.chunks, &other.chunks, len);
         self.set_sorted_flag(IsSorted::Not);
         if !other._can_fast_explode() {
@@ -108,6 +110,7 @@ impl ArrayChunked {
 
         let len = self.len();
         self.length += other.length;
+        self.null_count += other.null_count;
         new_chunks(&mut self.chunks, &other.chunks, len);
         self.set_sorted_flag(IsSorted::Not);
         Ok(())
@@ -120,6 +123,7 @@ impl<T: PolarsObject> ObjectChunked<T> {
     pub fn append(&mut self, other: &Self) {
         let len = self.len();
         self.length += other.length;
+        self.null_count += other.null_count;
         self.set_sorted_flag(IsSorted::Not);
         new_chunks(&mut self.chunks, &other.chunks, len);
     }
1 change: 1 addition & 0 deletions crates/polars-core/src/chunked_array/ops/apply.rs
@@ -220,6 +220,7 @@ impl<T: PolarsNumericType> ChunkedArray<T> {
             .for_each(|arr| arrow::compute::arity_assign::unary(arr, f))
         };
         // can be in any order now
+        self.compute_len();
         self.set_sorted_flag(IsSorted::Not);
     }
 }
5 changes: 5 additions & 0 deletions crates/polars-core/src/chunked_array/ops/chunkops.rs
@@ -74,6 +74,11 @@ impl<T: PolarsDataType> ChunkedArray<T> {
             }
         }
         self.length = IdxSize::try_from(inner(&self.chunks)).expect(LENGTH_LIMIT_MSG);
+        self.null_count = self
+            .chunks
+            .iter()
+            .map(|arr| arr.null_count())
+            .sum::<usize>() as IdxSize;
 
         if self.length <= 1 {
             self.set_sorted_flag(IsSorted::Ascending)
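For reference, the null count of a single Arrow-style chunk is the number of entries flagged invalid in its validity mask (a missing mask means no nulls), and the recomputation added to `compute_len()` above simply sums that over all chunks. A stand-alone sketch of that rule, using a plain `bool` slice in place of a real bitmap (the types and helper names are illustrative, not the polars/arrow API):

```rust
// A chunk's null count is the number of "invalid" entries in its validity
// mask; a missing mask means there are no nulls. `compute_len()` sums this
// over every chunk to refresh the cached `null_count`.
fn null_count_of_chunk(validity: Option<&[bool]>, len: usize) -> usize {
    match validity {
        // No validity mask: every value is non-null.
        None => 0,
        // Count the entries flagged as invalid (false = null).
        Some(mask) => mask[..len].iter().filter(|&&valid| !valid).count(),
    }
}

fn compute_null_count(chunks: &[(Option<Vec<bool>>, usize)]) -> usize {
    chunks
        .iter()
        .map(|(validity, len)| null_count_of_chunk(validity.as_deref(), *len))
        .sum()
}

fn main() {
    let chunks = vec![
        (Some(vec![true, false, true]), 3), // one null in this chunk
        (None, 4),                          // no validity mask: no nulls
    ];
    assert_eq!(compute_null_count(&chunks), 1);
}
```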
2 changes: 2 additions & 0 deletions crates/polars-core/src/chunked_array/upstream_traits.rs
@@ -30,6 +30,7 @@ impl<T: PolarsDataType> Default for ChunkedArray<T> {
             phantom: PhantomData,
             bit_settings: Default::default(),
             length: 0,
+            null_count: 0,
         }
     }
 }
@@ -330,6 +331,7 @@ impl<T: PolarsObject> FromIterator<Option<T>> for ObjectChunked<T> {
             phantom: PhantomData,
             bit_settings: Default::default(),
             length: 0,
+            null_count: 0,
         };
         out.compute_len();
         out
6 changes: 6 additions & 0 deletions crates/polars-core/src/series/mod.rs
@@ -181,6 +181,7 @@ impl Series {
 
     /// # Safety
     /// The caller must ensure the length and the data types of `ArrayRef` does not change.
+    /// And that the null_count is updated (e.g. with a `compute_len()`)
    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
        #[allow(unused_mut)]
        let mut ca = self._get_inner_mut();
@@ -254,6 +255,11 @@ impl Series {
         Ok(self)
     }
 
+    /// Redo a length and null_count compute
+    pub fn compute_len(&mut self) {
+        self._get_inner_mut().compute_len()
+    }
+
     /// Extend the memory backed by this array with the values from `other`.
     ///
     /// See [`ChunkedArray::extend`] and [`ChunkedArray::append`].
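The new public `Series::compute_len` exists so that callers who mutate chunks through the unsafe `chunks_mut()` can restore the cached `length` and `null_count` afterwards, which is exactly what `coalesce_nulls` in the next file does. A self-contained model of that contract (not Polars code; only the `chunks_mut`/`compute_len` pairing is taken from this diff):

```rust
// Model of the safety contract documented above: if the chunks behind a
// Series are mutated directly, the cached `length` and `null_count` go stale
// and must be refreshed, e.g. by the `compute_len()` added in this commit.
struct Chunk {
    values: Vec<Option<i64>>,
}

struct SeriesModel {
    chunks: Vec<Chunk>,
    length: usize,
    null_count: usize, // cached, like the new field in this commit
}

impl SeriesModel {
    fn from_chunks(chunks: Vec<Chunk>) -> Self {
        let mut s = Self { chunks, length: 0, null_count: 0 };
        s.compute_len();
        s
    }

    // Recompute the cached length and null count from the chunks.
    fn compute_len(&mut self) {
        self.length = self.chunks.iter().map(|c| c.values.len()).sum();
        self.null_count = self
            .chunks
            .iter()
            .map(|c| c.values.iter().filter(|v| v.is_none()).count())
            .sum();
    }

    // Analogue of the (unsafe) `chunks_mut`: the caller promises to keep the
    // cached stats correct, e.g. by calling `compute_len` afterwards.
    fn chunks_mut(&mut self) -> &mut Vec<Chunk> {
        &mut self.chunks
    }
}

fn main() {
    let mut s = SeriesModel::from_chunks(vec![Chunk {
        values: vec![Some(1), None, Some(3)],
    }]);
    assert_eq!(s.null_count, 1);

    // Overwrite a value's validity directly, as coalesce_nulls does via with_validity.
    s.chunks_mut()[0].values[0] = None;
    // The cache is stale now; refresh it as the safety contract requires.
    s.compute_len();
    assert_eq!(s.null_count, 2);
}
```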
3 changes: 3 additions & 0 deletions crates/polars-core/src/utils/mod.rs
@@ -879,6 +879,7 @@ pub fn coalesce_nulls<'a, T: PolarsDataType>(
                 *arr_b = arr_b.with_validity(arr.validity().cloned())
             }
         }
+        b.compute_len();
         (Cow::Owned(a), Cow::Owned(b))
     } else {
         (Cow::Borrowed(a), Cow::Borrowed(b))
@@ -899,6 +900,8 @@ pub fn coalesce_nulls_series(a: &Series, b: &Series) -> (Series, Series) {
             *arr_a = arr_a.with_validity(validity.clone());
             *arr_b = arr_b.with_validity(validity);
         }
+        a.compute_len();
+        b.compute_len();
         (a, b)
     } else {
         (a.clone(), b.clone())
1 change: 1 addition & 0 deletions (file path not shown)
@@ -358,6 +358,7 @@ pub(crate) fn insert_streaming_nodes(
             #[cfg(feature = "dtype-categorical")]
             DataType::Categorical(_) => string_cache,
             DataType::List(inner) => allowed_dtype(inner, string_cache),
+            #[cfg(feature = "dtype-struct")]
             DataType::Struct(fields) => fields
                 .iter()
                 .all(|fld| allowed_dtype(fld.data_type(), string_cache)),
