From fad5b133dbb50b874df29c69cef62de9d5793b3b Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 27 Jul 2023 20:52:38 +0200 Subject: [PATCH] fix(rust, python): don't run file-caching in streaming mode (#10117) --- .../src/logical_plan/optimizer/mod.rs | 2 +- .../tests/unit/streaming/test_streaming.py | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs index 03e9038fae87..1b8b08e68a7b 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs @@ -75,7 +75,7 @@ pub fn optimize( let comm_subexpr_elim = opt_state.comm_subexpr_elim; #[allow(unused_variables)] - let agg_scan_projection = opt_state.file_caching; + let agg_scan_projection = opt_state.file_caching && !streaming; // gradually fill the rules passed to the optimizer let opt = StackOptimizer {}; diff --git a/py-polars/tests/unit/streaming/test_streaming.py b/py-polars/tests/unit/streaming/test_streaming.py index 23dbe58fd31d..1aef2a7c3fa0 100644 --- a/py-polars/tests/unit/streaming/test_streaming.py +++ b/py-polars/tests/unit/streaming/test_streaming.py @@ -693,3 +693,28 @@ def test_streaming_groupby_list_9758() -> None: .to_dict(False) == payload ) + + +@pytest.mark.write_disk() +def test_streaming_10115(tmp_path: Path) -> None: + in_path = tmp_path / "in.parquet" + out_path = tmp_path / "out.parquet" + + # this fails if the schema will be incorrectly due to the projection + # pushdown + (pl.DataFrame([{"x": 1, "y": "foo"}]).write_parquet(in_path)) + + joiner = pl.LazyFrame([{"y": "foo", "z": "_"}]) + + ( + pl.scan_parquet(in_path) + .join(joiner, how="left", on="y") + .select("x", "y", "z") + .sink_parquet(out_path) # + ) + + assert pl.read_parquet(out_path).to_dict(False) == { + "x": [1], + "y": ["foo"], + "z": ["_"], + }