From 1dc25330078131464499be2691b2047fb1481a03 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 18 Aug 2024 12:02:48 +0200 Subject: [PATCH] fix: Fix unit null rank (#18252) --- crates/polars-ops/src/series/ops/rank.rs | 9 ++ py-polars/tests/unit/expr/test_exprs.py | 49 ---------- .../tests/unit/operations/test_random.py | 21 ---- py-polars/tests/unit/operations/test_rank.py | 97 +++++++++++++++++++ py-polars/tests/unit/series/test_series.py | 19 ---- 5 files changed, 106 insertions(+), 89 deletions(-) create mode 100644 py-polars/tests/unit/operations/test_rank.py diff --git a/crates/polars-ops/src/series/ops/rank.rs b/crates/polars-ops/src/series/ops/rank.rs index dd2fe3936945..0c57307626c0 100644 --- a/crates/polars-ops/src/series/ops/rank.rs +++ b/crates/polars-ops/src/series/ops/rank.rs @@ -65,6 +65,15 @@ unsafe fn rank_impl(idxs: &IdxCa, neq: &BooleanArray, fn rank(s: &Series, method: RankMethod, descending: bool, seed: Option) -> Series { let len = s.len(); let null_count = s.null_count(); + + if null_count == len { + let dt = match method { + Average => DataType::Float64, + _ => IDX_DTYPE, + }; + return Series::full_null(s.name(), s.len(), &dt); + } + match len { 1 => { return match method { diff --git a/py-polars/tests/unit/expr/test_exprs.py b/py-polars/tests/unit/expr/test_exprs.py index 012a5c74035b..a8f874e42548 100644 --- a/py-polars/tests/unit/expr/test_exprs.py +++ b/py-polars/tests/unit/expr/test_exprs.py @@ -333,55 +333,6 @@ def test_arr_contains() -> None: } -def test_rank() -> None: - df = pl.DataFrame( - { - "a": [1, 1, 2, 2, 3], - } - ) - - s = df.select(pl.col("a").rank(method="average").alias("b")).to_series() - assert s.to_list() == [1.5, 1.5, 3.5, 3.5, 5.0] - assert s.dtype == pl.Float64 - - s = df.select(pl.col("a").rank(method="max").alias("b")).to_series() - assert s.to_list() == [2, 2, 4, 4, 5] - assert s.dtype == pl.get_index_type() - - -def test_rank_so_4109() -> None: - # also tests ranks null behavior - df = pl.from_dict( - { - "id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4], - "rank": [None, 3, 2, 4, 1, 4, 3, 2, 1, None, 3, 4, 4, 1, None, 3], - } - ).sort(by=["id", "rank"]) - - assert df.group_by("id").agg( - [ - pl.col("rank").alias("original"), - pl.col("rank").rank(method="dense").alias("dense"), - pl.col("rank").rank(method="average").alias("average"), - ] - ).to_dict(as_series=False) == { - "id": [1, 2, 3, 4], - "original": [[None, 2, 3, 4], [1, 2, 3, 4], [None, 1, 3, 4], [None, 1, 3, 4]], - "dense": [[None, 1, 2, 3], [1, 2, 3, 4], [None, 1, 2, 3], [None, 1, 2, 3]], - "average": [ - [None, 1.0, 2.0, 3.0], - [1.0, 2.0, 3.0, 4.0], - [None, 1.0, 2.0, 3.0], - [None, 1.0, 2.0, 3.0], - ], - } - - -def test_rank_string_null_11252() -> None: - rank = pl.Series([None, "", "z", None, "a"]).rank() - assert rank.to_list() == [None, 1.0, 3.0, None, 2.0] - - def test_logical_boolean() -> None: # note, cannot use expressions in logical # boolean context (eg: and/or/not operators) diff --git a/py-polars/tests/unit/operations/test_random.py b/py-polars/tests/unit/operations/test_random.py index 98f781a45432..ce8e5644bc77 100644 --- a/py-polars/tests/unit/operations/test_random.py +++ b/py-polars/tests/unit/operations/test_random.py @@ -116,27 +116,6 @@ def test_sample_series() -> None: assert len(s.sample(n=10, with_replacement=True, seed=0)) == 10 -def test_rank_random_expr() -> None: - df = pl.from_dict( - {"a": [1] * 5, "b": [1, 2, 3, 4, 5], "c": [200, 100, 100, 50, 100]} - ) - - df_ranks1 = df.with_columns( - pl.col("c").rank(method="random", seed=1).over("a").alias("rank") - ) - df_ranks2 = df.with_columns( - pl.col("c").rank(method="random", seed=1).over("a").alias("rank") - ) - assert_frame_equal(df_ranks1, df_ranks2) - - -def test_rank_random_series() -> None: - s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0]) - assert_series_equal( - s.rank("random", seed=1), pl.Series("a", [2, 4, 7, 3, 5, 6, 1], dtype=pl.UInt32) - ) - - def test_shuffle_expr() -> None: # pl.set_random_seed should lead to reproducible results. s = pl.Series("a", range(20)) diff --git a/py-polars/tests/unit/operations/test_rank.py b/py-polars/tests/unit/operations/test_rank.py new file mode 100644 index 000000000000..6f83663875b6 --- /dev/null +++ b/py-polars/tests/unit/operations/test_rank.py @@ -0,0 +1,97 @@ +import polars as pl +from polars.testing import assert_frame_equal, assert_series_equal + + +def test_rank_nulls() -> None: + assert pl.Series([]).rank().to_list() == [] + assert pl.Series([None]).rank().to_list() == [None] + assert pl.Series([None, None]).rank().to_list() == [None, None] + + +def test_rank_random_expr() -> None: + df = pl.from_dict( + {"a": [1] * 5, "b": [1, 2, 3, 4, 5], "c": [200, 100, 100, 50, 100]} + ) + + df_ranks1 = df.with_columns( + pl.col("c").rank(method="random", seed=1).over("a").alias("rank") + ) + df_ranks2 = df.with_columns( + pl.col("c").rank(method="random", seed=1).over("a").alias("rank") + ) + assert_frame_equal(df_ranks1, df_ranks2) + + +def test_rank_random_series() -> None: + s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0]) + assert_series_equal( + s.rank("random", seed=1), pl.Series("a", [2, 4, 7, 3, 5, 6, 1], dtype=pl.UInt32) + ) + + +def test_rank_df() -> None: + df = pl.DataFrame( + { + "a": [1, 1, 2, 2, 3], + } + ) + + s = df.select(pl.col("a").rank(method="average").alias("b")).to_series() + assert s.to_list() == [1.5, 1.5, 3.5, 3.5, 5.0] + assert s.dtype == pl.Float64 + + s = df.select(pl.col("a").rank(method="max").alias("b")).to_series() + assert s.to_list() == [2, 2, 4, 4, 5] + assert s.dtype == pl.get_index_type() + + +def test_rank_so_4109() -> None: + # also tests ranks null behavior + df = pl.from_dict( + { + "id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4], + "rank": [None, 3, 2, 4, 1, 4, 3, 2, 1, None, 3, 4, 4, 1, None, 3], + } + ).sort(by=["id", "rank"]) + + assert df.group_by("id").agg( + [ + pl.col("rank").alias("original"), + pl.col("rank").rank(method="dense").alias("dense"), + pl.col("rank").rank(method="average").alias("average"), + ] + ).to_dict(as_series=False) == { + "id": [1, 2, 3, 4], + "original": [[None, 2, 3, 4], [1, 2, 3, 4], [None, 1, 3, 4], [None, 1, 3, 4]], + "dense": [[None, 1, 2, 3], [1, 2, 3, 4], [None, 1, 2, 3], [None, 1, 2, 3]], + "average": [ + [None, 1.0, 2.0, 3.0], + [1.0, 2.0, 3.0, 4.0], + [None, 1.0, 2.0, 3.0], + [None, 1.0, 2.0, 3.0], + ], + } + + +def test_rank_string_null_11252() -> None: + rank = pl.Series([None, "", "z", None, "a"]).rank() + assert rank.to_list() == [None, 1.0, 3.0, None, 2.0] + + +def test_rank_series() -> None: + s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0]) + + assert_series_equal( + s.rank("dense"), pl.Series("a", [2, 3, 4, 3, 3, 4, 1], dtype=pl.UInt32) + ) + + df = pl.DataFrame([s]) + assert df.select(pl.col("a").rank("dense"))["a"].to_list() == [2, 3, 4, 3, 3, 4, 1] + + assert_series_equal( + s.rank("dense", descending=True), + pl.Series("a", [3, 2, 1, 2, 2, 1, 4], dtype=pl.UInt32), + ) + + assert s.rank(method="average").dtype == pl.Float64 + assert s.rank(method="max").dtype == pl.get_index_type() diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index fea87af082a7..08884308af48 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -994,25 +994,6 @@ def test_mode() -> None: assert pl.int_range(0, 3, eager=True).mode().to_list() == [2, 1, 0] -def test_rank() -> None: - s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0]) - - assert_series_equal( - s.rank("dense"), pl.Series("a", [2, 3, 4, 3, 3, 4, 1], dtype=UInt32) - ) - - df = pl.DataFrame([s]) - assert df.select(pl.col("a").rank("dense"))["a"].to_list() == [2, 3, 4, 3, 3, 4, 1] - - assert_series_equal( - s.rank("dense", descending=True), - pl.Series("a", [3, 2, 1, 2, 2, 1, 4], dtype=UInt32), - ) - - assert s.rank(method="average").dtype == pl.Float64 - assert s.rank(method="max").dtype == pl.get_index_type() - - def test_diff() -> None: s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0]) expected = pl.Series("a", [1, 1, -1, 0, 1, -3])