From a3c0ecee788bdae4e2449793eb875947c1465a91 Mon Sep 17 00:00:00 2001 From: Alisa Petrova <60570090+sn0rkmaiden@users.noreply.github.com> Date: Thu, 14 Nov 2024 09:55:35 +0300 Subject: [PATCH] fix(python): Fixed typo in file lazy.py (#19769) --- .../src/dsl/function_expr/array.rs | 6 +++- crates/polars-plan/src/plans/aexpr/schema.rs | 13 +++++--- .../simplify_expr/simplify_functions.rs | 6 ---- py-polars/polars/__init__.py | 28 +++++++++++++++++ py-polars/polars/dataframe/_html.py | 4 ++- py-polars/polars/functions/lazy.py | 2 +- .../tests/unit/dataframe/test_repr_html.py | 19 ++++++++++++ py-polars/tests/unit/operations/test_join.py | 10 ++++++ py-polars/tests/unit/test_polars_import.py | 31 +++++++++++++++++++ py-polars/tests/unit/test_schema.py | 18 +++++++++++ 10 files changed, 123 insertions(+), 14 deletions(-) diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index 2ecd016981e3..08333beb3893 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -132,7 +132,7 @@ impl From for SpecialEq> { #[cfg(feature = "array_count")] CountMatches => map_as_slice!(count_matches), Shift => map_as_slice!(shift), - Explode => unreachable!(), + Explode => map_as_slice!(explode), } } } @@ -253,3 +253,7 @@ pub(super) fn shift(s: &[Column]) -> PolarsResult { ca.array_shift(n.as_materialized_series()).map(Column::from) } + +fn explode(c: &[Column]) -> PolarsResult { + c[0].explode() +} diff --git a/crates/polars-plan/src/plans/aexpr/schema.rs b/crates/polars-plan/src/plans/aexpr/schema.rs index 6c1b675b2bd8..6547c391eaae 100644 --- a/crates/polars-plan/src/plans/aexpr/schema.rs +++ b/crates/polars-plan/src/plans/aexpr/schema.rs @@ -84,11 +84,14 @@ impl AExpr { .get(*expr) .to_field_impl(schema, ctx, arena, &mut false)?; - if let List(inner) = field.dtype() { - Ok(Field::new(field.name().clone(), *inner.clone())) - } else { - Ok(field) - } + let field = match field.dtype() { + List(inner) => Field::new(field.name().clone(), *inner.clone()), + #[cfg(feature = "dtype-array")] + Array(inner, ..) => Field::new(field.name().clone(), *inner.clone()), + _ => field, + }; + + Ok(field) }, Alias(expr, name) => Ok(Field::new( name.clone(), diff --git a/crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs b/crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs index 03f274e5211a..2b5493c62e6b 100644 --- a/crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs +++ b/crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs @@ -7,12 +7,6 @@ pub(super) fn optimize_functions( expr_arena: &mut Arena, ) -> PolarsResult> { let out = match function { - #[cfg(feature = "dtype-array")] - // arr.explode() -> explode() - FunctionExpr::ArrayExpr(ArrayFunction::Explode) => { - let input_node = input[0].node(); - Some(AExpr::Explode(input_node)) - }, // is_null().any() -> null_count() > 0 // is_not_null().any() -> null_count() < len() // CORRECTNESS: we can ignore 'ignore_nulls' since is_null/is_not_null never produces NULLS diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 83ea52acc822..eb33f23bf53f 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -429,3 +429,31 @@ def __getattr__(name: str) -> Any: msg = f"module {__name__!r} has no attribute {name!r}" raise AttributeError(msg) + + +# fork() breaks Polars thread pool, so warn users who might be doing this. +def __install_postfork_hook() -> None: + message = """\ +Using fork() can cause Polars to deadlock in the child process. +In addition, using fork() with Python in general is a recipe for mysterious +deadlocks and crashes. + +The most likely reason you are seeing this error is because you are using the +multiprocessing module on Linux, which uses fork() by default. This will be +fixed in Python 3.14. Until then, you want to use the "spawn" context instead. + +See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details. +""" + + def before_hook() -> None: + import warnings + + warnings.warn(message, RuntimeWarning, stacklevel=2) + + import os + + if hasattr(os, "register_at_fork"): + os.register_at_fork(before=before_hook) + + +__install_postfork_hook() diff --git a/py-polars/polars/dataframe/_html.py b/py-polars/polars/dataframe/_html.py index 6f034eab0f41..b2f153584b5b 100644 --- a/py-polars/polars/dataframe/_html.py +++ b/py-polars/polars/dataframe/_html.py @@ -119,7 +119,9 @@ def write_body(self) -> None: else: series = self.df[:, c] self.elements.append( - html.escape(series._s.get_fmt(r, str_len_limit)) + html.escape( + series._s.get_fmt(r, str_len_limit) + ).replace(" ", " ") ) def write(self, inner: str) -> None: diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index dfd3f607791c..30185cc6a586 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -1030,7 +1030,7 @@ def map_groups( The output for group `1` can be understood as follows: - - group `1` contains Series `'a': [1, 3]` and `'b': [4, 5]` + - group `1` contains Series `'a': [1, 3]` and `'b': [5, 6]` - applying the function to those lists of Series, one gets the output `[1 / 4 + 5, 3 / 4 + 6]`, i.e. `[5.25, 6.75]` """ diff --git a/py-polars/tests/unit/dataframe/test_repr_html.py b/py-polars/tests/unit/dataframe/test_repr_html.py index 8e7a62a6efc2..6ee004fabf22 100644 --- a/py-polars/tests/unit/dataframe/test_repr_html.py +++ b/py-polars/tests/unit/dataframe/test_repr_html.py @@ -77,3 +77,22 @@ def test_series_repr_html_max_rows_default() -> None: expected_rows = 10 assert html.count("") - 2 == expected_rows + + +def test_html_representation_multiple_spaces() -> None: + df = pl.DataFrame( + {"string_col": ["multiple spaces", " trailing and leading "]} + ) + html_repr = df._repr_html_() + + assert ( + html_repr + == """
+shape: (2, 1)
string_col
str
"multiple   spaces"
"  trailing and leading   "
""" + ) diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index c65be5ad61c0..93395fafbdd5 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -1113,3 +1113,13 @@ def test_join_key_type_coercion_19597() -> None: left.join( right, left_on=pl.col("a") * 2, right_on=pl.col("a") * 2 ).collect_schema() + + +def test_array_explode_join_19763() -> None: + q = pl.LazyFrame().select( + pl.lit(pl.Series([[1], [2]], dtype=pl.Array(pl.Int64, 1))).explode().alias("k") + ) + + q = q.join(pl.LazyFrame({"k": [1, 2]}), on="k") + + assert_frame_equal(q.collect().sort("k"), pl.DataFrame({"k": [1, 2]})) diff --git a/py-polars/tests/unit/test_polars_import.py b/py-polars/tests/unit/test_polars_import.py index fa1779de3478..2686c094999b 100644 --- a/py-polars/tests/unit/test_polars_import.py +++ b/py-polars/tests/unit/test_polars_import.py @@ -1,6 +1,8 @@ from __future__ import annotations import compileall +import multiprocessing +import os import subprocess import sys from pathlib import Path @@ -97,3 +99,32 @@ def test_polars_import() -> None: import_time_ms = polars_import_time // 1_000 msg = f"Possible import speed regression; took {import_time_ms}ms\n{df_import}" raise AssertionError(msg) + + +def run_in_child() -> int: + return 123 + + +@pytest.mark.skipif(not hasattr(os, "fork"), reason="Requires fork()") +def test_fork_safety(recwarn: pytest.WarningsRecorder) -> None: + def get_num_fork_warnings() -> int: + fork_warnings = 0 + for warning in recwarn: + if issubclass(warning.category, RuntimeWarning) and str( + warning.message + ).startswith("Using fork() can cause Polars"): + fork_warnings += 1 + return fork_warnings + + assert get_num_fork_warnings() == 0 + + # Using forkserver and spawn context should not do any of our warning: + for context in ["spawn", "forkserver"]: + with multiprocessing.get_context(context).Pool(1) as pool: + assert pool.apply(run_in_child) == 123 + assert get_num_fork_warnings() == 0 + + # Using fork()-based multiprocessing should raise a warning: + with multiprocessing.get_context("fork").Pool(1) as pool: + assert pool.apply(run_in_child) == 123 + assert get_num_fork_warnings() == 1 diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index a8f9e43d84c0..43e8840458d3 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -278,3 +278,21 @@ def test_lf_window_schema(expr: pl.Expr, mapping_strategy: str) -> None: ) assert q.collect_schema() == q.collect().collect_schema() + + +def test_lf_explode_schema() -> None: + lf = pl.LazyFrame({"k": [1], "x": pl.Series([[1]], dtype=pl.Array(pl.Int64, 1))}) + + q = lf.select(pl.col("x").explode()) + assert q.collect_schema() == {"x": pl.Int64} + + q = lf.select(pl.col("x").arr.explode()) + assert q.collect_schema() == {"x": pl.Int64} + + lf = pl.LazyFrame({"k": [1], "x": pl.Series([[1]], dtype=pl.List(pl.Int64))}) + + q = lf.select(pl.col("x").explode()) + assert q.collect_schema() == {"x": pl.Int64} + + q = lf.select(pl.col("x").list.explode()) + assert q.collect_schema() == {"x": pl.Int64}