From 9b41eda90d78f75f5bdb4140820bad43a6ac5e67 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 26 Aug 2023 19:20:01 +0200 Subject: [PATCH] fix(rust!, python): Error on `value_counts` on column named `"counts"` (#10737) --- crates/polars-ops/src/series/ops/various.rs | 10 ++- crates/polars-plan/src/dsl/mod.rs | 6 +- py-polars/polars/expr/expr.py | 70 +++++++++++-------- py-polars/polars/expr/list.py | 2 +- py-polars/polars/series/series.py | 63 ++++++++++++----- py-polars/src/expr/general.rs | 7 +- py-polars/src/series/mod.rs | 8 --- py-polars/tests/unit/datatypes/test_struct.py | 42 ----------- .../unit/operations/test_value_counts.py | 57 +++++++++++++++ py-polars/tests/unit/series/test_series.py | 10 --- 10 files changed, 158 insertions(+), 117 deletions(-) create mode 100644 py-polars/tests/unit/operations/test_value_counts.py diff --git a/crates/polars-ops/src/series/ops/various.rs b/crates/polars-ops/src/series/ops/various.rs index 98841c7051aa..c83216d55f80 100644 --- a/crates/polars-ops/src/series/ops/various.rs +++ b/crates/polars-ops/src/series/ops/various.rs @@ -10,15 +10,19 @@ use crate::series::ops::SeriesSealed; pub trait SeriesMethods: SeriesSealed { /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"` /// with dtype [`IdxType`] - fn value_counts(&self, multithreaded: bool, sorted: bool) -> PolarsResult { + fn value_counts(&self, sort: bool, parallel: bool) -> PolarsResult { let s = self.as_series(); + polars_ensure!( + s.name() != "counts", + Duplicate: "using `value_counts` on a column named 'counts' would lead to duplicate column names" + ); // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined - let groups = s.group_tuples(multithreaded, sorted)?; + let groups = s.group_tuples(parallel, sort)?; let values = unsafe { s.agg_first(&groups) }; let counts = groups.group_lengths("counts"); let cols = vec![values, counts.into_series()]; let df = DataFrame::new_no_checks(cols); - if sorted { + if sort { df.sort(["counts"], true, false) } else { Ok(df) diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index 72d2fb4f08d6..69c2cd6c7d4d 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -1691,11 +1691,11 @@ impl Expr { #[cfg(feature = "dtype-struct")] /// Count all unique values and create a struct mapping value to count - /// Note that it is better to turn multithreaded off in the aggregation context - pub fn value_counts(self, multithreaded: bool, sorted: bool) -> Self { + /// Note that it is better to turn parallel off in the aggregation context + pub fn value_counts(self, sort: bool, parallel: bool) -> Self { self.apply( move |s| { - s.value_counts(multithreaded, sorted) + s.value_counts(sort, parallel) .map(|df| Some(df.into_struct(s.name()).into_series())) }, GetOutput::map_field(|fld| { diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 66d06d7b66a9..94bcc7030333 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -8269,48 +8269,62 @@ def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: return self._from_pyexpr(self._pyexpr.extend_constant(value, n)) - def value_counts(self, *, multithreaded: bool = False, sort: bool = False) -> Self: + @deprecate_renamed_parameter("multithreaded", "parallel", version="0.19.0") + def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self: """ - Count all unique values and create a struct mapping value to count. + Count the occurrences of unique values. Parameters ---------- - multithreaded: - Better to turn this off in the aggregation context, as it can lead to - contention. - sort: - Ensure the output is sorted from most values to least. + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. Returns ------- Expr - Expression of data type :class:`Struct`. + Expression of data type :class:`Struct` with mapping of unique values to + their count. Examples -------- >>> df = pl.DataFrame( - ... { - ... "id": ["a", "b", "b", "c", "c", "c"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("id").value_counts(sort=True), - ... ] + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT shape: (3, 1) - ┌───────────┐ - │ id │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {"c",3} │ - │ {"b",2} │ - │ {"a",1} │ - └───────────┘ - - """ - return self._from_pyexpr(self._pyexpr.value_counts(multithreaded, sort)) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + """ + return self._from_pyexpr(self._pyexpr.value_counts(sort, parallel)) def unique_counts(self) -> Self: """ diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 4f20957e87d6..dffc2b028537 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -856,7 +856,7 @@ def eval(self, expr: Expr, *, parallel: bool = False) -> Expr: Run all expression parallel. Don't activate this blindly. Parallelism is worth it if there is enough work to do per thread. - This likely should not be use in the group by context, because we already + This likely should not be used in the group by context, because we already parallel execution per group Examples diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index ff56b9489836..8157ee2cd090 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -2290,32 +2290,61 @@ def hist( bins = Series(bins, dtype=Float64)._s return wrap_df(self._s.hist(bins, bin_count)) - def value_counts(self, *, sort: bool = False) -> DataFrame: + def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFrame: """ - Count the unique values in a Series. + Count the occurrences of unique values. Parameters ---------- sort - Ensure the output is sorted from most values to least. + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. Examples -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.value_counts().sort(by="a") + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT shape: (3, 2) - ┌─────┬────────┐ - │ a ┆ counts │ - │ --- ┆ --- │ - │ i64 ┆ u32 │ - ╞═════╪════════╡ - │ 1 ┆ 1 │ - │ 2 ┆ 2 │ - │ 3 ┆ 1 │ - └─────┴────────┘ - - """ - return wrap_df(self._s.value_counts(sort)) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. + + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + """ + return ( + self.to_frame() + .select(F.col(self.name).value_counts(sort=sort, parallel=parallel)) + .unnest(self.name) + ) def unique_counts(self) -> Series: """ diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs index a66318dafe18..8d3814e7e8cc 100644 --- a/py-polars/src/expr/general.rs +++ b/py-polars/src/expr/general.rs @@ -244,11 +244,8 @@ impl PyExpr { fn count(&self) -> Self { self.clone().inner.count().into() } - fn value_counts(&self, multithreaded: bool, sorted: bool) -> Self { - self.inner - .clone() - .value_counts(multithreaded, sorted) - .into() + fn value_counts(&self, sort: bool, parallel: bool) -> Self { + self.inner.clone().value_counts(sort, parallel).into() } fn unique_counts(&self) -> Self { self.inner.clone().unique_counts().into() diff --git a/py-polars/src/series/mod.rs b/py-polars/src/series/mod.rs index 03955bbef928..b3dcfe28f243 100644 --- a/py-polars/src/series/mod.rs +++ b/py-polars/src/series/mod.rs @@ -266,14 +266,6 @@ impl PySeries { self.series.sort(descending).into() } - fn value_counts(&self, sorted: bool) -> PyResult { - let df = self - .series - .value_counts(true, sorted) - .map_err(PyPolarsErr::from)?; - Ok(df.into()) - } - fn take_with_series(&self, indices: &PySeries) -> PyResult { let idx = indices.series.idx().map_err(PyPolarsErr::from)?; let take = self.series.take(idx).map_err(PyPolarsErr::from)?; diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index cf694f984387..848f377e3090 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -161,48 +161,6 @@ def test_struct_function_expansion() -> None: assert pl.Struct(struct_schema) == s.to_frame().schema["a"] -def test_value_counts_expr() -> None: - df = pl.DataFrame( - { - "id": ["a", "b", "b", "c", "c", "c", "d", "d"], - } - ) - out = ( - df.select( - [ - pl.col("id").value_counts(sort=True), - ] - ) - .to_series() - .to_list() - ) - assert out == [ - {"id": "c", "counts": 3}, - {"id": "b", "counts": 2}, - {"id": "d", "counts": 2}, - {"id": "a", "counts": 1}, - ] - - # nested value counts. Then the series needs the name - # 6200 - - df = pl.DataFrame({"session": [1, 1, 1], "id": [2, 2, 3]}) - - assert df.group_by("session").agg( - [pl.col("id").value_counts(sort=True).first()] - ).to_dict(False) == {"session": [1], "id": [{"id": 2, "counts": 2}]} - - -def test_value_counts_logical_type() -> None: - # test logical type - df = pl.DataFrame({"a": ["b", "c"]}).with_columns( - pl.col("a").cast(pl.Categorical).alias("ac") - ) - out = df.select([pl.all().value_counts()]) - assert out["ac"].struct.field("ac").dtype == pl.Categorical - assert out["a"].struct.field("a").dtype == pl.Utf8 - - def test_nested_struct() -> None: df = pl.DataFrame({"d": [1, 2, 3], "e": ["foo", "bar", "biz"]}) # Nest the dataframe diff --git a/py-polars/tests/unit/operations/test_value_counts.py b/py-polars/tests/unit/operations/test_value_counts.py new file mode 100644 index 000000000000..70bcc8bb00af --- /dev/null +++ b/py-polars/tests/unit/operations/test_value_counts.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + + +def test_value_counts() -> None: + s = pl.Series("a", [1, 2, 2, 3]) + result = s.value_counts() + expected = pl.DataFrame( + {"a": [1, 2, 3], "counts": [1, 2, 1]}, schema_overrides={"counts": pl.UInt32} + ) + result_sorted = result.sort("a") + assert_frame_equal(result_sorted, expected) + + +def test_value_counts_logical_type() -> None: + # test logical type + df = pl.DataFrame({"a": ["b", "c"]}).with_columns( + pl.col("a").cast(pl.Categorical).alias("ac") + ) + out = df.select(pl.all().value_counts()) + assert out["ac"].struct.field("ac").dtype == pl.Categorical + assert out["a"].struct.field("a").dtype == pl.Utf8 + + +def test_value_counts_expr() -> None: + df = pl.DataFrame( + { + "id": ["a", "b", "b", "c", "c", "c", "d", "d"], + } + ) + out = df.select(pl.col("id").value_counts(sort=True)).to_series().to_list() + assert out == [ + {"id": "c", "counts": 3}, + {"id": "b", "counts": 2}, + {"id": "d", "counts": 2}, + {"id": "a", "counts": 1}, + ] + + # nested value counts. Then the series needs the name + # 6200 + + df = pl.DataFrame({"session": [1, 1, 1], "id": [2, 2, 3]}) + + assert df.group_by("session").agg( + pl.col("id").value_counts(sort=True).first() + ).to_dict(False) == {"session": [1], "id": [{"id": 2, "counts": 2}]} + + +def test_value_counts_duplicate_name() -> None: + s = pl.Series("counts", [1]) + + with pytest.raises(pl.DuplicateError, match="counts"): + s.value_counts() diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index bab062df55f3..98e03647e705 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -1695,16 +1695,6 @@ def test_to_dummies() -> None: assert_frame_equal(result, expected) -def test_value_counts() -> None: - s = pl.Series("a", [1, 2, 2, 3]) - result = s.value_counts() - expected = pl.DataFrame( - {"a": [1, 2, 3], "counts": [1, 2, 1]}, schema_overrides={"counts": pl.UInt32} - ) - result_sorted = result.sort("a") - assert_frame_equal(result_sorted, expected) - - def test_chunk_lengths() -> None: s = pl.Series("a", [1, 2, 2, 3]) # this is a Series with one chunk, of length 4