Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(rust, python): fix O(n^2) in sorted check during append #10241

Merged
merged 3 commits into from
Aug 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions crates/polars-core/src/chunked_array/object/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ where
&self.values[self.offset + index]
}

pub fn get(&self, index: usize) -> Option<&T> {
if self.is_valid(index) {
Some(unsafe { self.value_unchecked(index) })
} else {
None
}
}

/// Get a value at a certain index location
///
/// # Safety
Expand Down
4 changes: 3 additions & 1 deletion crates/polars-core/src/chunked_array/ops/append.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ pub(super) fn update_sorted_flag_before_append<'a, T>(
// this is safe as we go from &mut borrow to &
// because the trait is only implemented for &ChunkedArray<T>
let borrow = std::mem::transmute::<&ChunkedArray<T>, &'a ChunkedArray<T>>(ca);
borrow.get_unchecked(ca.len() - 1)
// ensure we don't access with `len() - 1` this will have O(n^2) complexity
// if we append many chunks that are sorted
borrow.last()
}
};
let start = unsafe { other.get_unchecked(0) };
Expand Down
7 changes: 7 additions & 0 deletions crates/polars-core/src/chunked_array/ops/downcast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ impl<'a, T> Chunks<'a, T> {
pub fn len(&self) -> usize {
self.chunks.len()
}

pub fn last(&self) -> Option<&'a T> {
self.chunks.last().map(|arr| {
let arr = &**arr;
unsafe { &*(arr as *const dyn Array as *const T) }
})
}
}

#[doc(hidden)]
Expand Down
10 changes: 10 additions & 0 deletions crates/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,11 @@ pub trait TakeRandom {
{
self.get(index)
}

/// This is much faster if we have many chunks as we don't have to compute the index
/// # Panics
/// Panics if `index >= self.len()`
fn last(&self) -> Option<Self::Item>;
}
// Utility trait because associated type needs a lifetime
pub trait TakeRandomUtf8 {
Expand All @@ -188,6 +193,11 @@ pub trait TakeRandomUtf8 {
{
self.get(index)
}

/// This is much faster if we have many chunks
/// # Panics
/// Panics if `index >= self.len()`
fn last(&self) -> Option<Self::Item>;
}

/// Fast access by index.
Expand Down
93 changes: 83 additions & 10 deletions crates/polars-core/src/chunked_array/ops/take/take_random.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::convert::TryFrom;

use arrow::array::{Array, BooleanArray, ListArray, PrimitiveArray, Utf8Array};
use arrow::bitmap::utils::get_bit_unchecked;
use arrow::bitmap::Bitmap;
Expand Down Expand Up @@ -103,6 +101,14 @@ where
Self::Multi(m) => m.get_unchecked(index),
}
}

fn last(&self) -> Option<Self::Item> {
match self {
Self::SingleNoNull(s) => s.last(),
Self::Single(s) => s.last(),
Self::Multi(m) => m.last(),
}
}
}

pub enum TakeRandBranch2<S, M> {
Expand Down Expand Up @@ -130,6 +136,12 @@ where
Self::Multi(m) => m.get_unchecked(index),
}
}
fn last(&self) -> Option<Self::Item> {
match self {
Self::Single(s) => s.last(),
Self::Multi(m) => m.last(),
}
}
}

#[allow(clippy::type_complexity)]
Expand Down Expand Up @@ -187,6 +199,11 @@ impl<'a> TakeRandom for Utf8TakeRandom<'a> {
unsafe fn get_unchecked(&self, index: usize) -> Option<Self::Item> {
take_random_get_unchecked!(self, index)
}
fn last(&self) -> Option<Self::Item> {
self.chunks
.last()
.and_then(|arr| arr.get(arr.len().saturating_sub(1)))
}
}

pub struct Utf8TakeRandomSingleChunk<'a> {
Expand All @@ -209,6 +226,9 @@ impl<'a> TakeRandom for Utf8TakeRandomSingleChunk<'a> {
None
}
}
fn last(&self) -> Option<Self::Item> {
self.get(self.arr.len().saturating_sub(1))
}
}

impl<'a> IntoTakeRandom<'a> for &'a Utf8Chunked {
Expand Down Expand Up @@ -251,6 +271,11 @@ impl<'a> TakeRandom for BinaryTakeRandom<'a> {
unsafe fn get_unchecked(&self, index: usize) -> Option<Self::Item> {
take_random_get_unchecked!(self, index)
}
fn last(&self) -> Option<Self::Item> {
self.chunks
.last()
.and_then(|arr| arr.get(arr.len().saturating_sub(1)))
}
}

pub struct BinaryTakeRandomSingleChunk<'a> {
Expand All @@ -273,6 +298,9 @@ impl<'a> TakeRandom for BinaryTakeRandomSingleChunk<'a> {
None
}
}
fn last(&self) -> Option<Self::Item> {
self.get(self.arr.len().saturating_sub(1))
}
}

impl<'a> IntoTakeRandom<'a> for &'a BinaryChunked {
Expand Down Expand Up @@ -334,8 +362,11 @@ impl<'a> IntoTakeRandom<'a> for &'a ListChunked {
};
TakeRandBranch2::Single(t)
} else {
let name = self.name();
let inner_type = self.inner_dtype().to_physical();
let t = ListTakeRandom {
ca: self,
name,
inner_type,
chunks: chunks.collect(),
chunk_lens: self.chunks.iter().map(|a| a.len() as IdxSize).collect(),
};
Expand Down Expand Up @@ -367,6 +398,11 @@ where
unsafe fn get_unchecked(&self, index: usize) -> Option<Self::Item> {
take_random_get_unchecked!(self, index)
}
fn last(&self) -> Option<Self::Item> {
self.chunks
.last()
.and_then(|arr| arr.get(arr.len().saturating_sub(1)))
}
}

pub struct NumTakeRandomCont<'a, T> {
Expand All @@ -388,6 +424,9 @@ where
unsafe fn get_unchecked(&self, index: usize) -> Option<Self::Item> {
Some(*self.slice.get_unchecked(index))
}
fn last(&self) -> Option<Self::Item> {
self.slice.last().copied()
}
}

pub struct TakeRandomBitmap<'a> {
Expand Down Expand Up @@ -445,6 +484,9 @@ where
None
}
}
fn last(&self) -> Option<Self::Item> {
self.get(self.vals.len().saturating_sub(1))
}
}

pub struct BoolTakeRandom<'a> {
Expand All @@ -464,6 +506,12 @@ impl<'a> TakeRandom for BoolTakeRandom<'a> {
unsafe fn get_unchecked(&self, index: usize) -> Option<Self::Item> {
take_random_get_unchecked!(self, index)
}

fn last(&self) -> Option<Self::Item> {
self.chunks
.last()
.and_then(|arr| arr.get(arr.len().saturating_sub(1)))
}
}

pub struct BoolTakeRandomSingleChunk<'a> {
Expand All @@ -486,10 +534,14 @@ impl<'a> TakeRandom for BoolTakeRandomSingleChunk<'a> {
None
}
}
fn last(&self) -> Option<Self::Item> {
self.arr.get(self.arr.len().saturating_sub(1))
}
}

pub struct ListTakeRandom<'a> {
ca: &'a ListChunked,
inner_type: DataType,
name: &'a str,
chunks: Vec<&'a ListArray<i64>>,
chunk_lens: Vec<IdxSize>,
}
Expand All @@ -500,18 +552,28 @@ impl<'a> TakeRandom for ListTakeRandom<'a> {
#[inline]
fn get(&self, index: usize) -> Option<Self::Item> {
let v = take_random_get!(self, index);
v.map(|v| {
let s = Series::try_from((self.ca.name(), v));
s.unwrap()
v.map(|arr| unsafe {
Series::from_chunks_and_dtype_unchecked(self.name, vec![arr], &self.inner_type)
})
}

#[inline]
unsafe fn get_unchecked(&self, index: usize) -> Option<Self::Item> {
let v = take_random_get_unchecked!(self, index);
v.map(|v| {
let s = Series::try_from((self.ca.name(), v));
s.unwrap()
v.map(|arr| unsafe {
Series::from_chunks_and_dtype_unchecked(self.name, vec![arr], &self.inner_type)
})
}
fn last(&self) -> Option<Self::Item> {
self.chunks.last().and_then(|arr| {
let arr = arr.get(arr.len().saturating_sub(1));
arr.map(|arr| unsafe {
Series::from_chunks_and_dtype_unchecked(
self.name,
vec![arr.to_boxed()],
&self.inner_type,
)
})
})
}
}
Expand Down Expand Up @@ -543,6 +605,9 @@ impl<'a> TakeRandom for ListTakeRandomSingleChunk<'a> {
None
}
}
fn last(&self) -> Option<Self::Item> {
self.get(self.arr.len().saturating_sub(1))
}
}

#[cfg(feature = "object")]
Expand All @@ -564,6 +629,11 @@ impl<'a, T: PolarsObject> TakeRandom for ObjectTakeRandom<'a, T> {
unsafe fn get_unchecked(&self, index: usize) -> Option<Self::Item> {
take_random_get_unchecked!(self, index)
}
fn last(&self) -> Option<Self::Item> {
self.chunks
.last()
.and_then(|arr| arr.get(arr.len().saturating_sub(1)))
}
}

#[cfg(feature = "object")]
Expand All @@ -588,6 +658,9 @@ impl<'a, T: PolarsObject> TakeRandom for ObjectTakeRandomSingleChunk<'a, T> {
None
}
}
fn last(&self) -> Option<Self::Item> {
self.arr.get(self.arr.len().saturating_sub(1))
}
}

#[cfg(feature = "object")]
Expand Down
Loading