Skip to content

Commit

Permalink
struct null counts
Browse files Browse the repository at this point in the history
  • Loading branch information
magarick committed Jul 28, 2023
1 parent 0caa3ff commit 2048a5c
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 37 deletions.
66 changes: 29 additions & 37 deletions crates/polars-core/src/chunked_array/logical/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ mod from;

use std::collections::BTreeMap;
use std::io::Write;
use std::ops::BitAnd;
use std::ops::BitOr;

use arrow::bitmap::MutableBitmap;
use arrow::offset::OffsetsBuffer;
Expand All @@ -26,6 +26,7 @@ pub struct StructChunked {
field: Field,
chunks: Vec<ArrayRef>,
null_count: usize,
total_null_count: usize,
}

fn arrays_to_fields(field_arrays: &[ArrayRef], fields: &[Series]) -> Vec<ArrowField> {
Expand Down Expand Up @@ -67,6 +68,9 @@ impl StructChunked {
/// Returns the cached `null_count` field, as last stored by `set_null_count`.
///
/// NOTE(review): going by the comments in `set_null_count`, this appears to be
/// the number of rows in which every field is null (a single valid field value
/// makes the row non-null) — confirm against callers.
pub fn null_count(&self) -> usize {
self.null_count
}
/// Returns the cached `total_null_count` field, as last stored by `set_null_count`.
///
/// NOTE(review): `set_null_count` only adds to this counter the length of each
/// field chunk that is entirely null, so nulls inside partially-null chunks do
/// not appear to be included — confirm that this is the intended meaning of
/// "total" before relying on it.
pub fn total_null_count(&self) -> usize {
self.total_null_count
}
pub fn new(name: &str, fields: &[Series]) -> PolarsResult<Self> {
let mut names = PlHashSet::with_capacity(fields.len());
let first_len = fields.get(0).map(|s| s.len()).unwrap_or(0);
Expand Down Expand Up @@ -173,58 +177,46 @@ impl StructChunked {
field,
chunks: vec![arrow_array],
null_count: 0,
total_null_count: 0,
};
out.set_null_count();
out
}

fn set_null_count(&mut self) {
let mut null_count = 0;
// Count both the total number of nulls and the rows where everything is null
let (mut null_count, mut total_null_count) = (0, 0);
let chunks_lens = self.fields()[0].chunks().len();

// fast path
// we early return if a column doesn't have nulls
for i in 0..chunks_lens {
for s in self.fields() {
let arr = &s.chunks()[i];
let has_nulls = arr.null_count() > 0 || matches!(s.dtype(), DataType::Null);
if !has_nulls {
self.null_count = 0;
return;
}
}
}

// slow path
// we bitand every null validity bitmask to determine
// in which rows all values are null
// A row is null if all values in it are null, so we bitor every validity bitmask since a
// single valid entry makes that row not null. We can also save some work by not bothering
// to bitor fields that would have all 0 validities (Null dtype or everything null). Note
// that since we keep track of the total null count as well, we can't break early, but we
// are only dealing with the validity masks when we absolutely have to.
for i in 0..chunks_lens {
let mut validity_agg = None;

let mut all_null_array = true;
let mut validity_agg: Option<arrow::bitmap::Bitmap> = None;
let mut n_nulls = None;
for s in self.fields() {
let arr = &s.chunks()[i];

if !matches!(s.dtype(), DataType::Null) {
all_null_array = false;
match (&validity_agg, arr.validity()) {
(Some(agg), Some(validity)) => validity_agg = Some(validity.bitand(agg)),
(None, Some(validity)) => validity_agg = Some(validity.clone()),
_ => {}
let nc = arr.null_count();
match (n_nulls, arr.validity(), nc == arr.len() && nc > 0) {
(_, _, true) => total_null_count += arr.len(),
(Some(0), _, _) => {}
(_, Some(v), _) => {
validity_agg =
validity_agg.map_or(Some(v.clone()), |agg| Some(v.bitor(&agg)));
n_nulls = Some(validity_agg.as_ref().unwrap().unset_bits());
}
(_, None, _) => n_nulls = Some(0),
}
}
// we add the null count
if let Some(validity) = &validity_agg {
null_count += validity.unset_bits()
}
// all arrays are null arrays
// we add the length of the chunk to the null_count
else if all_null_array {
null_count += self.fields()[0].chunks()[i].len()
match n_nulls {
// If it's none, every array was either Null-type or all null
None => null_count += self.fields()[0].chunks()[i].len(),
Some(n) => null_count += n,
}
}
self.null_count = null_count
(self.null_count, self.total_null_count) = (null_count, total_null_count)
}

/// Get access to one of this `[StructChunked]`'s fields
Expand Down
34 changes: 34 additions & 0 deletions py-polars/tests/unit/datatypes/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,3 +863,37 @@ def test_struct_get_field_by_index() -> None:
df = pl.DataFrame({"val": [{"a": 1, "b": 2}]})
expected = {"b": [2]}
assert df.select(pl.all().struct[1]).to_dict(as_series=False) == expected


def test_struct_null_count_10130() -> None:
    """Check ``null_count`` on struct Series built from frames and from appends.

    The expectations below encode that a struct row only counts as null when
    every one of its fields is null in that row. The test name suggests this
    guards issue 10130 (presumably; the issue itself is not visible here).
    """
    # Each entry pairs a struct Series with the null count it must report.
    cases = [
        # A null in only one field never makes the row null.
        (
            pl.DataFrame(
                {"x": [None, 0, 0, 1, 1], "y": [0, 0, 1, 0, 1]}
            ).to_struct("xy"),
            0,
        ),
        (
            pl.DataFrame(
                {"x": [2, 0, 0, 1, 1], "y": [0, 0, 1, 0, 1]}
            ).to_struct("xy"),
            0,
        ),
        (
            pl.DataFrame(
                {"x": [2, 0, 0, 1, 1], "y": [0, 0, None, 0, 1]}
            ).to_struct("xy"),
            0,
        ),
        # Nulls in both fields, but never in the same row.
        (
            pl.DataFrame(
                {
                    "x": [1, None, 0, 0, 1, 1, None],
                    "y": [None, 0, None, 0, 1, 0, 1],
                }
            ).to_struct("xy"),
            0,
        ),
        # Row 0 is null in both fields, so exactly one row is null.
        (
            pl.DataFrame(
                {
                    "x": [None, None, 0, 0, 1, 1, None],
                    "y": [None, 0, None, 0, 1, 0, 1],
                }
            ).to_struct("xy"),
            1,
        ),
        # A single all-null field: every row is null.
        (pl.DataFrame({"x": [None, None]}).to_struct("x"), 2),
        # An all-null field next to a fully valid one, in either field order.
        (pl.DataFrame({"y": [1, 2], "x": [None, None]}).to_struct("xy"), 0),
        (pl.DataFrame({"x": [None, None], "y": [1, 2]}).to_struct("xy"), 0),
    ]
    for struct_series, expected_nulls in cases:
        assert struct_series.null_count() == expected_nulls

    # There was an issue where it could ignore parts of a multi-chunk Series
    valid_part = pl.Series([{"a": 1, "b": 2}])
    null_part = pl.Series(
        [{"a": None, "b": None}], dtype=pl.Struct({"a": pl.Int64, "b": pl.Int64})
    )
    # The assertion is made on the appended-to Series itself; the return value
    # of ``append`` is deliberately not used.
    valid_part.append(null_part)

    assert valid_part.null_count() == 1

0 comments on commit 2048a5c

Please sign in to comment.