Skip to content

Commit

Permalink
refactor: Fix struct outer validity;fmt;is_in;cast;cmp (#17590)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Jul 12, 2024
1 parent b40be85 commit f61af0f
Show file tree
Hide file tree
Showing 18 changed files with 197 additions and 222 deletions.
5 changes: 3 additions & 2 deletions crates/polars-core/src/chunked_array/comparison/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,7 @@ where
R: Fn(BooleanChunked, BooleanChunked) -> BooleanChunked,
{
if a.len() != b.len() || a.struct_fields().len() != b.struct_fields().len() {
// polars_ensure!(a.len() == 1 || b.len() == 1, ShapeMismatch: "length lhs: {}, length rhs: {}", a.len(), b.len());
BooleanChunked::full("", value, a.len())
} else {
let (a, b) = align_chunks_binary(a, b);
Expand Down Expand Up @@ -707,7 +708,7 @@ impl ChunkCompare<&StructChunked2> for StructChunked2 {
self,
rhs,
|l, r| l.not_equal(r).unwrap(),
|a, b| a.not_equal(&b).unique().unwrap(),
|a, b| a | b,
true,
)
}
Expand All @@ -717,7 +718,7 @@ impl ChunkCompare<&StructChunked2> for StructChunked2 {
self,
rhs,
|l, r| l.not_equal_missing(r).unwrap(),
|a, b| a.not_equal_missing(&b).unique().unwrap(),
|a, b| a | b,
true,
)
}
Expand Down
55 changes: 0 additions & 55 deletions crates/polars-core/src/chunked_array/iterator/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
use arrow::array::*;

use crate::prelude::*;
#[cfg(feature = "dtype-struct")]
use crate::series::iterator::SeriesIter;
use crate::utils::Container;

pub mod par;

Expand Down Expand Up @@ -421,58 +418,6 @@ impl<T: PolarsObject> ObjectChunked<T> {
}
}

// TODO: STRUCT REFACTOR: REMOVE THIS
#[cfg(feature = "dtype-struct")]
impl<'a> IntoIterator for &'a StructChunked2 {
type Item = Option<&'a [AnyValue<'a>]>;
type IntoIter = StructIter2<'a>;

fn into_iter(self) -> Self::IntoIter {
assert_eq!(self.n_chunks(), 1);
let fields = self.fields_as_series();
let field_iter = fields
.iter()
.map(|s| {
let iter = s.iter();
// SAFETY: this works as the reference is to the heap, and not to the struct.
unsafe { std::mem::transmute::<SeriesIter<'_>, SeriesIter<'a>>(iter) }
})
.collect();

StructIter2 {
field_iter,
buf: vec![],
}
}
}

#[cfg(feature = "dtype-struct")]
pub struct StructIter2<'a> {
field_iter: Vec<SeriesIter<'a>>,
buf: Vec<AnyValue<'a>>,
}

#[cfg(feature = "dtype-struct")]
impl<'a> Iterator for StructIter2<'a> {
type Item = Option<&'a [AnyValue<'a>]>;

fn next(&mut self) -> Option<Self::Item> {
self.buf.clear();

for it in &mut self.field_iter {
self.buf.push(it.next()?);
}

// SAFETY:
// Lifetime is bound to struct, we just cannot set the lifetime for the iterator trait
unsafe {
Some(Some(std::mem::transmute::<&'_ [AnyValue], &'a [AnyValue]>(
&self.buf,
)))
}
}
}

/// Wrapper struct to convert an iterator of type `T` into one of type [`Option<T>`]. It is useful to make the
/// [`IntoIterator`] trait, in which every iterator shall return an [`Option<T>`].
pub struct SomeIterator<I>(I)
Expand Down
30 changes: 30 additions & 0 deletions crates/polars-core/src/chunked_array/ops/any_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use polars_utils::sync::SyncPtr;
use crate::chunked_array::object::extension::polars_extension::PolarsExtension;
use crate::prelude::*;
use crate::series::implementations::null::NullChunked;
use crate::utils::index_to_chunked_index;

#[inline]
#[allow(unused_variables)]
Expand Down Expand Up @@ -317,3 +318,32 @@ impl ChunkAnyValue for NullChunked {
Ok(AnyValue::Null)
}
}

#[cfg(feature = "dtype-struct")]
impl ChunkAnyValue for StructChunked2 {
/// Gets AnyValue from LogicalType
fn get_any_value(&self, i: usize) -> PolarsResult<AnyValue<'_>> {
polars_ensure!(i < self.len(), oob = i, self.len());
unsafe { Ok(self.get_any_value_unchecked(i)) }
}

unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> {
let (chunk_idx, idx) = index_to_chunked_index(self.chunks.iter().map(|c| c.len()), i);
if let DataType::Struct(flds) = self.dtype() {
// SAFETY: we already have a single chunk and we are
// guarded by the type system.
unsafe {
let arr = &**self.chunks.get_unchecked(chunk_idx);
let arr = &*(arr as *const dyn Array as *const StructArray);

if arr.is_null_unchecked(idx) {
AnyValue::Null
} else {
AnyValue::Struct(idx, arr, flds)
}
}
} else {
unreachable!()
}
}
}
42 changes: 41 additions & 1 deletion crates/polars-core/src/chunked_array/ops/downcast.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::marker::PhantomData;

use arrow::array::*;
use arrow::bitmap::Bitmap;
use arrow::compute::utils::combine_validities_and;

use crate::prelude::*;
Expand Down Expand Up @@ -155,9 +156,48 @@ impl<T: PolarsDataType> ChunkedArray<T> {
unsafe {
for (arr, other) in self.chunks_mut().iter_mut().zip(chunks) {
let validity = combine_validities_and(arr.validity(), other.validity());
arr.with_validity(validity);
*arr = arr.with_validity(validity);
}
}
self.compute_len();
}

pub(crate) fn set_outer_validity(&mut self, validity: Option<Bitmap>) {
assert_eq!(self.chunks().len(), 1);
unsafe {
let arr = self.chunks_mut().iter_mut().next().unwrap();
*arr = arr.with_validity(validity);
}
self.compute_len();
}

pub fn with_outer_validity(mut self, validity: Option<Bitmap>) -> Self {
self.set_outer_validity(validity);
self
}

pub fn with_outer_validity_chunked(mut self, validity: BooleanChunked) -> Self {
assert_eq!(self.len(), validity.len());
if !self
.chunks
.iter()
.zip(validity.chunks.iter())
.map(|(a, b)| a.len() == b.len())
.all_equal()
|| self.chunks.len() != validity.chunks().len()
{
let ca = self.rechunk();
let validity = validity.rechunk();
ca.with_outer_validity_chunked(validity)
} else {
unsafe {
for (arr, valid) in self.chunks_mut().iter_mut().zip(validity.downcast_iter()) {
assert!(valid.validity().is_none());
*arr = arr.with_validity(Some(valid.values().clone()))
}
}
self.compute_len();
self
}
}
}
45 changes: 11 additions & 34 deletions crates/polars-core/src/chunked_array/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ mod frame;
use std::fmt::Write;

use arrow::array::StructArray;
use arrow::bitmap::Bitmap;
use arrow::compute::utils::combine_validities_and;
use arrow::legacy::utils::CustomIterTools;
use polars_error::{polars_ensure, PolarsResult};
Expand All @@ -14,7 +13,7 @@ use crate::chunked_array::ChunkedArray;
use crate::prelude::sort::arg_sort_multiple::{_get_rows_encoded_arr, _get_rows_encoded_ca};
use crate::prelude::*;
use crate::series::Series;
use crate::utils::{index_to_chunked_index, Container};
use crate::utils::Container;

pub type StructChunked2 = ChunkedArray<StructType>;

Expand Down Expand Up @@ -171,7 +170,11 @@ impl StructChunked2 {
})
.collect::<PolarsResult<Vec<_>>>()?;

Self::from_series(self.name(), &new_fields).map(|ca| ca.into_series())
let mut out = Self::from_series(self.name(), &new_fields)?;
if self.null_count > 0 {
out.merge_validities(self.chunks());
}
Ok(out.into_series())
},
DataType::String => {
let ca = self.clone();
Expand Down Expand Up @@ -223,7 +226,11 @@ impl StructChunked2 {
}
})
.collect::<PolarsResult<Vec<_>>>()?;
Self::from_series(self.name(), &fields).map(|ca| ca.into_series())
let mut out = Self::from_series(self.name(), &fields)?;
if self.null_count > 0 {
out.merge_validities(self.chunks());
}
Ok(out.into_series())
},
}
}
Expand All @@ -248,27 +255,6 @@ impl StructChunked2 {
self.cast_with_options(dtype, CastOptions::NonStrict)
}

/// Gets AnyValue from LogicalType
pub(crate) fn get_any_value(&self, i: usize) -> PolarsResult<AnyValue<'_>> {
polars_ensure!(i < self.len(), oob = i, self.len());
unsafe { Ok(self.get_any_value_unchecked(i)) }
}

pub(crate) unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> {
let (chunk_idx, idx) = index_to_chunked_index(self.chunks.iter().map(|c| c.len()), i);
if let DataType::Struct(flds) = self.dtype() {
// SAFETY: we already have a single chunk and we are
// guarded by the type system.
unsafe {
let arr = &**self.chunks.get_unchecked(chunk_idx);
let arr = &*(arr as *const dyn Array as *const StructArray);
AnyValue::Struct(idx, arr, flds)
}
} else {
unreachable!()
}
}

pub fn _apply_fields<F>(&self, mut func: F) -> PolarsResult<Self>
where
F: FnMut(&Series) -> Series,
Expand Down Expand Up @@ -341,15 +327,6 @@ impl StructChunked2 {
self.compute_len();
}

pub(crate) fn set_outer_validity(&mut self, validity: Option<Bitmap>) {
assert_eq!(self.chunks().len(), 1);
unsafe {
let arr = self.downcast_iter_mut().next().unwrap();
arr.set_validity(validity)
}
self.compute_len();
}

pub fn unnest(mut self) -> DataFrame {
self.propagate_nulls();

Expand Down
3 changes: 1 addition & 2 deletions crates/polars-ops/src/chunked_array/strings/json_path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,6 @@ mod tests {
assert_eq!(ca.json_infer(Some(2)).unwrap(), expected_dtype);
}

// STRUCT REFACTOR
#[ignore]
#[test]
fn test_json_decode() {
let s = Series::new(
Expand All @@ -212,6 +210,7 @@ mod tests {
],
)
.unwrap()
.with_outer_validity_chunked(BooleanChunked::new("", [false, true, true, false]))
.into_series();
let expected_dtype = expected_series.dtype().clone();

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-ops/src/series/ops/cut.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ pub fn qcut(
}

mod test {
// STRUCT REFACTOR
// This need metadata in fields
#[ignore]
#[test]
fn test_map_cats_fast_unique() {
Expand Down
17 changes: 15 additions & 2 deletions py-polars/polars/_utils/construction/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,13 +149,26 @@ def sequence_to_pyseries(
return pyseries

elif dtype == Struct:
# This is very bad. Goes via rows? And needs to do outer nullability separate.
# It also has two data passes.
# TODO: eventually go into struct builder
struct_schema = dtype.to_schema() if isinstance(dtype, Struct) else None
empty = {} # type: ignore[var-annotated]

data = []
invalid = []
for i, v in enumerate(values):
if v is None:
invalid.append(i)
data.append(empty)
else:
data.append(v)

return plc.sequence_to_pydf(
data=[(empty if v is None else v) for v in values],
data=data,
schema=struct_schema,
orient="row",
).to_struct(name)
).to_struct(name, invalid)

if python_dtype is None:
if value is None:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10518,7 +10518,7 @@ def to_struct(self, name: str = "") -> Series:
{5,"five"}
]
"""
return wrap_s(self._df.to_struct(name))
return wrap_s(self._df.to_struct(name, []))

def unnest(
self,
Expand Down
Loading

0 comments on commit f61af0f

Please sign in to comment.