Skip to content

Commit

Permalink
Merge branch 'main' into dup-error-msg
Browse files Browse the repository at this point in the history
  • Loading branch information
mcrumiller committed Oct 21, 2023
2 parents 50cf70a + 04357ef commit dd26bbd
Show file tree
Hide file tree
Showing 76 changed files with 615 additions and 320 deletions.
6 changes: 5 additions & 1 deletion crates/polars-core/src/chunked_array/builder/binary.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use polars_error::constants::LENGTH_LIMIT_MSG;

use super::*;

pub struct BinaryChunkedBuilder {
Expand Down Expand Up @@ -40,14 +42,16 @@ impl BinaryChunkedBuilder {

pub fn finish(mut self) -> BinaryChunked {
let arr = self.builder.as_box();
let length = arr.len() as IdxSize;
let length = IdxSize::try_from(arr.len()).expect(LENGTH_LIMIT_MSG);
let null_count = arr.null_count() as IdxSize;

ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
phantom: PhantomData,
bit_settings: Default::default(),
length,
null_count,
}
}

Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/builder/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ impl ChunkedBuilder<bool, BooleanType> for BooleanChunkedBuilder {

fn finish(mut self) -> BooleanChunked {
let arr = self.array_builder.as_box();
let length = arr.len() as IdxSize;

let mut ca = ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
phantom: PhantomData,
bit_settings: Default::default(),
length,
length: 0,
null_count: 0,
};
ca.compute_len();
ca
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/builder/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@ where

fn finish(mut self) -> ChunkedArray<T> {
let arr = self.array_builder.as_box();
let length = arr.len() as IdxSize;
let mut ca = ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
phantom: PhantomData,
bit_settings: Default::default(),
length,
length: 0,
null_count: 0,
};
ca.compute_len();
ca
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/builder/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ impl Utf8ChunkedBuilder {

pub fn finish(mut self) -> Utf8Chunked {
let arr = self.builder.as_box();
let length = arr.len() as IdxSize;

let mut ca = ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
phantom: PhantomData,
bit_settings: Default::default(),
length,
length: 0,
null_count: 0,
};
ca.compute_len();
ca
Expand Down
11 changes: 10 additions & 1 deletion crates/polars-core/src/chunked_array/from.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use polars_error::constants::LENGTH_LIMIT_MSG;

use super::*;

#[allow(clippy::all)]
Expand Down Expand Up @@ -143,10 +145,12 @@ where
);

let mut length = 0;
let mut null_count = 0;
let chunks = chunks
.into_iter()
.map(|x| {
length += x.len();
null_count += x.null_count();
Box::new(x) as Box<dyn Array>
})
.collect();
Expand All @@ -156,7 +160,8 @@ where
chunks,
phantom: PhantomData,
bit_settings: Default::default(),
length: length.try_into().unwrap(),
length: length.try_into().expect(LENGTH_LIMIT_MSG),
null_count: null_count as IdxSize,
}
}

Expand Down Expand Up @@ -184,6 +189,7 @@ where
phantom: PhantomData,
bit_settings: Default::default(),
length: 0,
null_count: 0,
};
out.compute_len();
out
Expand Down Expand Up @@ -213,6 +219,7 @@ where
phantom: PhantomData,
bit_settings: Default::default(),
length: 0,
null_count: 0,
};
out.compute_len();
out
Expand All @@ -235,6 +242,7 @@ where
phantom: PhantomData,
bit_settings,
length: 0,
null_count: 0,
};
out.compute_len();
if !keep_sorted {
Expand All @@ -258,6 +266,7 @@ where
phantom: PhantomData,
bit_settings: Default::default(),
length: 0,
null_count: 0,
};
out.compute_len();
out
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::using_string_cache;

impl From<&CategoricalChunked> for DictionaryArray<u32> {
fn from(ca: &CategoricalChunked) -> Self {
let keys = ca.logical().rechunk();
let keys = ca.physical().rechunk();
let keys = keys.downcast_iter().next().unwrap();
let map = &**ca.get_rev_map();
let dtype = ArrowDataType::Dictionary(
Expand Down Expand Up @@ -42,7 +42,7 @@ impl From<&CategoricalChunked> for DictionaryArray<u32> {
}
impl From<&CategoricalChunked> for DictionaryArray<i64> {
fn from(ca: &CategoricalChunked) -> Self {
let keys = ca.logical().rechunk();
let keys = ca.physical().rechunk();
let keys = keys.downcast_iter().next().unwrap();
let map = &**ca.get_rev_map();
let dtype = ArrowDataType::Dictionary(
Expand Down
56 changes: 31 additions & 25 deletions crates/polars-core/src/chunked_array/logical/categorical/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ bitflags! {

#[derive(Clone)]
pub struct CategoricalChunked {
logical: Logical<CategoricalType, UInt32Type>,
physical: Logical<CategoricalType, UInt32Type>,
/// 1st bit: original local categorical
/// meaning that n_unique is the same as the cat map length
/// 2nd bit: use lexical sorting
Expand All @@ -32,31 +32,37 @@ pub struct CategoricalChunked {

impl CategoricalChunked {
pub(crate) fn field(&self) -> Field {
let name = self.logical().name();
let name = self.physical().name();
Field::new(name, self.dtype().clone())
}

pub fn is_empty(&self) -> bool {
self.len() == 0
}

#[inline]
pub fn len(&self) -> usize {
self.logical.len()
self.physical.len()
}

#[inline]
pub fn null_count(&self) -> usize {
self.physical.null_count()
}

pub fn name(&self) -> &str {
self.logical.name()
self.physical.name()
}

// TODO: Rename this
/// Get a reference to the physical array (the categories).
pub fn logical(&self) -> &UInt32Chunked {
&self.logical
pub fn physical(&self) -> &UInt32Chunked {
&self.physical
}

/// Get a mutable reference to the physical array (the categories).
pub(crate) fn logical_mut(&mut self) -> &mut UInt32Chunked {
&mut self.logical
pub(crate) fn physical_mut(&mut self) -> &mut UInt32Chunked {
&mut self.physical
}

/// Convert a categorical column to its local representation.
Expand All @@ -72,7 +78,7 @@ impl CategoricalChunked {
// if all physical map keys are equal to their values,
// we can skip the apply and only update the rev_map
let local_ca = self
.logical()
.physical()
.apply(|opt_v| opt_v.map(|v| *physical_map.get(&v).unwrap()));

let mut out =
Expand All @@ -84,12 +90,12 @@ impl CategoricalChunked {
}

pub(crate) fn get_flags(&self) -> Settings {
self.logical().get_flags()
self.physical().get_flags()
}

/// Set flags for the Chunked Array
pub(crate) fn set_flags(&mut self, flags: Settings) {
self.logical_mut().set_flags(flags)
self.physical_mut().set_flags(flags)
}

/// Build a categorical from an original `RevMapping`. That means the number of categories in the `RevMapping` equals `self.unique().len()`.
Expand All @@ -105,7 +111,7 @@ impl CategoricalChunked {
let mut bit_settings = BitSettings::default();
bit_settings.insert(BitSettings::ORIGINAL);
Self {
logical,
physical: logical,
bit_settings,
}
}
Expand Down Expand Up @@ -135,22 +141,22 @@ impl CategoricalChunked {
let mut logical = Logical::<UInt32Type, _>::new_logical::<CategoricalType>(idx);
logical.2 = Some(DataType::Categorical(Some(rev_map)));
Self {
logical,
physical: logical,
bit_settings: Default::default(),
}
}

/// # Safety
/// The existing index values must be in bounds of the new [`RevMapping`].
pub(crate) unsafe fn set_rev_map(&mut self, rev_map: Arc<RevMapping>, keep_fast_unique: bool) {
self.logical.2 = Some(DataType::Categorical(Some(rev_map)));
self.physical.2 = Some(DataType::Categorical(Some(rev_map)));
if !keep_fast_unique {
self.set_fast_unique(false)
}
}

pub(crate) fn can_fast_unique(&self) -> bool {
self.bit_settings.contains(BitSettings::ORIGINAL) && self.logical.chunks.len() == 1
self.bit_settings.contains(BitSettings::ORIGINAL) && self.physical.chunks.len() == 1
}

pub(crate) fn set_fast_unique(&mut self, toggle: bool) {
Expand All @@ -163,7 +169,7 @@ impl CategoricalChunked {

/// Get a reference to the mapping of categorical types to the string values.
pub fn get_rev_map(&self) -> &Arc<RevMapping> {
if let DataType::Categorical(Some(rev_map)) = &self.logical.2.as_ref().unwrap() {
if let DataType::Categorical(Some(rev_map)) = &self.physical.2.as_ref().unwrap() {
rev_map
} else {
panic!("implementation error")
Expand All @@ -172,7 +178,7 @@ impl CategoricalChunked {

/// Create an [`Iterator`] that iterates over the `&str` values of the [`CategoricalChunked`].
pub fn iter_str(&self) -> CatIter<'_> {
let iter = self.logical().into_iter();
let iter = self.physical().into_iter();
CatIter {
rev: self.get_rev_map(),
iter,
Expand All @@ -182,7 +188,7 @@ impl CategoricalChunked {

impl LogicalType for CategoricalChunked {
fn dtype(&self) -> &DataType {
self.logical.2.as_ref().unwrap()
self.physical.2.as_ref().unwrap()
}

fn get_any_value(&self, i: usize) -> PolarsResult<AnyValue<'_>> {
Expand All @@ -191,7 +197,7 @@ impl LogicalType for CategoricalChunked {
}

unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> {
match self.logical.0.get_unchecked(i) {
match self.physical.0.get_unchecked(i) {
Some(i) => AnyValue::Categorical(i, self.get_rev_map(), SyncPtr::new_null()),
None => AnyValue::Null,
}
Expand All @@ -203,16 +209,16 @@ impl LogicalType for CategoricalChunked {
let mapping = &**self.get_rev_map();

let mut builder =
Utf8ChunkedBuilder::new(self.logical.name(), self.len(), self.len() * 5);
Utf8ChunkedBuilder::new(self.physical.name(), self.len(), self.len() * 5);

let f = |idx: u32| mapping.get(idx);

if !self.logical.has_validity() {
self.logical
if !self.physical.has_validity() {
self.physical
.into_no_null_iter()
.for_each(|idx| builder.append_value(f(idx)));
} else {
self.logical.into_iter().for_each(|opt_idx| {
self.physical.into_iter().for_each(|opt_idx| {
builder.append_option(opt_idx.map(f));
});
}
Expand All @@ -222,13 +228,13 @@ impl LogicalType for CategoricalChunked {
},
DataType::UInt32 => {
let ca = unsafe {
UInt32Chunked::from_chunks(self.logical.name(), self.logical.chunks.clone())
UInt32Chunked::from_chunks(self.physical.name(), self.physical.chunks.clone())
};
Ok(ca.into_series())
},
#[cfg(feature = "dtype-categorical")]
DataType::Categorical(_) => Ok(self.clone().into_series()),
_ => self.logical.cast(dtype),
_ => self.physical.cast(dtype),
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
use polars_error::constants::LENGTH_LIMIT_MSG;

use super::*;
use crate::chunked_array::ops::append::new_chunks;
use crate::series::IsSorted;

impl CategoricalChunked {
fn set_lengths(&mut self, other: &Self) {
let length_self = &mut self.physical_mut().length;
*length_self = length_self
.checked_add(other.len() as IdxSize)
.expect(LENGTH_LIMIT_MSG);
self.physical_mut().null_count += other.null_count() as IdxSize;
}

pub fn append(&mut self, other: &Self) -> PolarsResult<()> {
if self.logical.null_count() == self.len() && other.logical.null_count() == other.len() {
if self.physical.null_count() == self.len() && other.physical.null_count() == other.len() {
let len = self.len();
self.logical_mut().length += other.len() as IdxSize;
new_chunks(&mut self.logical.chunks, &other.logical().chunks, len);
self.set_lengths(other);
new_chunks(&mut self.physical.chunks, &other.physical().chunks, len);
return Ok(());
}
let is_local_different_source =
Expand All @@ -23,10 +33,10 @@ impl CategoricalChunked {
let new_rev_map = self._merge_categorical_map(other)?;
unsafe { self.set_rev_map(new_rev_map, false) };

self.logical_mut().length += other.len() as IdxSize;
new_chunks(&mut self.logical.chunks, &other.logical().chunks, len);
self.set_lengths(other);
new_chunks(&mut self.physical.chunks, &other.physical().chunks, len);
}
self.logical.set_sorted_flag(IsSorted::Not);
self.physical.set_sorted_flag(IsSorted::Not);
Ok(())
}
}
Loading

0 comments on commit dd26bbd

Please sign in to comment.