Skip to content

Commit

Permalink
fix: Fix mask and validity confusion in Parquet String decoding (pola…
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Nov 3, 2024
1 parent 7d93ec4 commit 0b0a914
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 3 deletions.
3 changes: 1 addition & 2 deletions crates/polars-parquet/src/arrow/read/deserialize/binview.rs
Original file line number Diff line number Diff line change
Expand Up @@ -276,13 +276,12 @@ fn decode_masked_optional_plain(
verify_utf8,
);
}

if page_validity.unset_bits() == 0 {
return decode_masked_required_plain(
num_expected_values,
values,
target,
page_validity,
mask,
verify_utf8,
);
}
Expand Down
49 changes: 48 additions & 1 deletion py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from polars.exceptions import ComputeError
from polars.testing import assert_frame_equal, assert_series_equal
from polars.testing.parametric import column, dataframes
from polars.testing.parametric.strategies.core import series

if TYPE_CHECKING:
from pathlib import Path
Expand Down Expand Up @@ -1560,6 +1561,53 @@ def test_predicate_filtering(
assert_frame_equal(result, df.filter(expr))


@pytest.mark.parametrize("use_dictionary", [False, True])
@pytest.mark.parametrize("data_page_size", [1, None])
@given(
    s=series(
        min_size=1,
        max_size=10,
        excluded_dtypes=[
            pl.Decimal,
            pl.Categorical,
            pl.Enum,
            pl.Struct,  # See #19612.
        ],
    ),
    offset=st.integers(0, 10),
    length=st.integers(0, 10),
)
def test_pyarrow_slice_roundtrip(
    s: pl.Series,
    use_dictionary: bool,
    data_page_size: int | None,
    offset: int,
    length: int,
) -> None:
    """Check that slicing a lazily scanned Parquet buffer matches slicing the frame.

    The series is converted to a frame, written uncompressed to an in-memory
    buffer via ``pq.write_table`` (with and without dictionary encoding, and
    with ``data_page_size`` of 1 or the default), then read back with
    ``pl.scan_parquet(...).slice(...)`` and compared with the same slice
    taken directly from the frame.
    """
    n_rows = len(s)

    # Fold the drawn integers into a valid window: the start lies in
    # [0, n_rows] and the count never runs past the end of the series.
    start = offset % (n_rows + 1)
    count = length % (n_rows - start + 1)

    buffer = io.BytesIO()
    frame = s.to_frame()
    arrow_table = frame.to_arrow()
    pq.write_table(
        arrow_table,
        buffer,
        compression="NONE",
        use_dictionary=use_dictionary,
        data_page_size=data_page_size,
    )

    # Rewind so the scan reads from the beginning of what was just written.
    buffer.seek(0)
    lazy_slice = pl.scan_parquet(buffer).slice(start, count)
    result = lazy_slice.collect()
    assert_frame_equal(result, frame.slice(start, count))


@given(
df=dataframes(
min_size=1,
Expand All @@ -1579,7 +1627,6 @@ def test_slice_roundtrip(df: pl.DataFrame, offset: int, length: int) -> None:
df.write_parquet(f)

f.seek(0)
print((offset, length))
scanned = pl.scan_parquet(f).slice(offset, length).collect()
assert_frame_equal(scanned, df.slice(offset, length))

Expand Down

0 comments on commit 0b0a914

Please sign in to comment.