fix(rust!, python): Error on value_counts on column named "counts" (

#10737)
pola-rs · Aug 26, 2023 · 9b41eda · 9b41eda
1 parent 7dfcd96
commit 9b41eda
Show file tree

Hide file tree

Showing 10 changed files with 158 additions and 117 deletions.
diff --git a/crates/polars-ops/src/series/ops/various.rs b/crates/polars-ops/src/series/ops/various.rs
@@ -10,15 +10,19 @@ use crate::series::ops::SeriesSealed;
 pub trait SeriesMethods: SeriesSealed {
     /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
     /// with dtype [`IdxType`]
-    fn value_counts(&self, multithreaded: bool, sorted: bool) -> PolarsResult<DataFrame> {
+    fn value_counts(&self, sort: bool, parallel: bool) -> PolarsResult<DataFrame> {
         let s = self.as_series();
+        polars_ensure!(
+            s.name() != "counts",
+            Duplicate: "using `value_counts` on a column named 'counts' would lead to duplicate column names"
+        );
         // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
-        let groups = s.group_tuples(multithreaded, sorted)?;
+        let groups = s.group_tuples(parallel, sort)?;
         let values = unsafe { s.agg_first(&groups) };
         let counts = groups.group_lengths("counts");
         let cols = vec![values, counts.into_series()];
         let df = DataFrame::new_no_checks(cols);
-        if sorted {
+        if sort {
             df.sort(["counts"], true, false)
         } else {
             Ok(df)

diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs
@@ -1691,11 +1691,11 @@ impl Expr {
 
     #[cfg(feature = "dtype-struct")]
     /// Count all unique values and create a struct mapping value to count
-    /// Note that it is better to turn multithreaded off in the aggregation context
-    pub fn value_counts(self, multithreaded: bool, sorted: bool) -> Self {
+    /// Note that it is better to turn parallel off in the aggregation context
+    pub fn value_counts(self, sort: bool, parallel: bool) -> Self {
         self.apply(
             move |s| {
-                s.value_counts(multithreaded, sorted)
+                s.value_counts(sort, parallel)
                     .map(|df| Some(df.into_struct(s.name()).into_series()))
             },
             GetOutput::map_field(|fld| {

diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py
@@ -8269,48 +8269,62 @@ def extend_constant(self, value: PythonLiteral | None, n: int) -> Self:
 
         return self._from_pyexpr(self._pyexpr.extend_constant(value, n))
 
-    def value_counts(self, *, multithreaded: bool = False, sort: bool = False) -> Self:
+    @deprecate_renamed_parameter("multithreaded", "parallel", version="0.19.0")
+    def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self:
         """
-        Count all unique values and create a struct mapping value to count.
+        Count the occurrences of unique values.
 
         Parameters
         ----------
-        multithreaded:
-            Better to turn this off in the aggregation context, as it can lead to
-            contention.
-        sort:
-            Ensure the output is sorted from most values to least.
+        sort
+            Sort the output by count in descending order.
+            If set to ``False`` (default), the order of the output is random.
+        parallel
+            Execute the computation in parallel.
+
+            .. note::
+                This option should likely not be enabled in a group by context,
+                as the computation is already parallelized per group.
 
         Returns
         -------
         Expr
-            Expression of data type :class:`Struct`.
+            Expression of data type :class:`Struct` with mapping of unique values to
+            their count.
 
         Examples
         --------
         >>> df = pl.DataFrame(
-        ...     {
-        ...         "id": ["a", "b", "b", "c", "c", "c"],
-        ...     }
-        ... )
-        >>> df.select(
-        ...     [
-        ...         pl.col("id").value_counts(sort=True),
-        ...     ]
+        ...     {"color": ["red", "blue", "red", "green", "blue", "blue"]}
         ... )
+        >>> df.select(pl.col("color").value_counts())  # doctest: +IGNORE_RESULT
         shape: (3, 1)
-        ┌───────────┐
-        │ id        │
-        │ ---       │
-        │ struct[2] │
-        ╞═══════════╡
-        │ {"c",3}   │
-        │ {"b",2}   │
-        │ {"a",1}   │
-        └───────────┘
-
-        """
-        return self._from_pyexpr(self._pyexpr.value_counts(multithreaded, sort))
+        ┌─────────────┐
+        │ color       │
+        │ ---         │
+        │ struct[2]   │
+        ╞═════════════╡
+        │ {"red",2}   │
+        │ {"green",1} │
+        │ {"blue",3}  │
+        └─────────────┘
+
+        Sort the output by count.
+
+        >>> df.select(pl.col("color").value_counts(sort=True))
+        shape: (3, 1)
+        ┌─────────────┐
+        │ color       │
+        │ ---         │
+        │ struct[2]   │
+        ╞═════════════╡
+        │ {"blue",3}  │
+        │ {"red",2}   │
+        │ {"green",1} │
+        └─────────────┘
+
+        """
+        return self._from_pyexpr(self._pyexpr.value_counts(sort, parallel))
 
     def unique_counts(self) -> Self:
         """

diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py
@@ -856,7 +856,7 @@ def eval(self, expr: Expr, *, parallel: bool = False) -> Expr:
             Run all expression parallel. Don't activate this blindly.
             Parallelism is worth it if there is enough work to do per thread.
 
-            This likely should not be use in the group by context, because we already
+            This likely should not be used in the group by context, because we already
             parallel execution per group
 
         Examples

diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py
@@ -2290,32 +2290,61 @@ def hist(
             bins = Series(bins, dtype=Float64)._s
         return wrap_df(self._s.hist(bins, bin_count))
 
-    def value_counts(self, *, sort: bool = False) -> DataFrame:
+    def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFrame:
         """
-        Count the unique values in a Series.
+        Count the occurrences of unique values.
 
         Parameters
         ----------
         sort
-            Ensure the output is sorted from most values to least.
+            Sort the output by count in descending order.
+            If set to ``False`` (default), the order of the output is random.
+        parallel
+            Execute the computation in parallel.
+
+            .. note::
+                This option should likely not be enabled in a group by context,
+                as the computation is already parallelized per group.
+
+        Returns
+        -------
+        DataFrame
+            Mapping of unique values to their count.
 
         Examples
         --------
-        >>> s = pl.Series("a", [1, 2, 2, 3])
-        >>> s.value_counts().sort(by="a")
+        >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"])
+        >>> s.value_counts()  # doctest: +IGNORE_RESULT
         shape: (3, 2)
-        ┌─────┬────────┐
-        │ a   ┆ counts │
-        │ --- ┆ ---    │
-        │ i64 ┆ u32    │
-        ╞═════╪════════╡
-        │ 1   ┆ 1      │
-        │ 2   ┆ 2      │
-        │ 3   ┆ 1      │
-        └─────┴────────┘
-
-        """
-        return wrap_df(self._s.value_counts(sort))
+        ┌───────┬────────┐
+        │ color ┆ counts │
+        │ ---   ┆ ---    │
+        │ str   ┆ u32    │
+        ╞═══════╪════════╡
+        │ red   ┆ 2      │
+        │ green ┆ 1      │
+        │ blue  ┆ 3      │
+        └───────┴────────┘
+
+        Sort the output by count.
+
+        shape: (3, 2)
+        ┌───────┬────────┐
+        │ color ┆ counts │
+        │ ---   ┆ ---    │
+        │ str   ┆ u32    │
+        ╞═══════╪════════╡
+        │ blue  ┆ 3      │
+        │ red   ┆ 2      │
+        │ green ┆ 1      │
+        └───────┴────────┘
+
+        """
+        return (
+            self.to_frame()
+            .select(F.col(self.name).value_counts(sort=sort, parallel=parallel))
+            .unnest(self.name)
+        )
 
     def unique_counts(self) -> Series:
         """

diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs
@@ -244,11 +244,8 @@ impl PyExpr {
     fn count(&self) -> Self {
         self.clone().inner.count().into()
     }
-    fn value_counts(&self, multithreaded: bool, sorted: bool) -> Self {
-        self.inner
-            .clone()
-            .value_counts(multithreaded, sorted)
-            .into()
+    fn value_counts(&self, sort: bool, parallel: bool) -> Self {
+        self.inner.clone().value_counts(sort, parallel).into()
     }
     fn unique_counts(&self) -> Self {
         self.inner.clone().unique_counts().into()

diff --git a/py-polars/src/series/mod.rs b/py-polars/src/series/mod.rs
@@ -266,14 +266,6 @@ impl PySeries {
         self.series.sort(descending).into()
     }
 
-    fn value_counts(&self, sorted: bool) -> PyResult<PyDataFrame> {
-        let df = self
-            .series
-            .value_counts(true, sorted)
-            .map_err(PyPolarsErr::from)?;
-        Ok(df.into())
-    }
-
     fn take_with_series(&self, indices: &PySeries) -> PyResult<Self> {
         let idx = indices.series.idx().map_err(PyPolarsErr::from)?;
         let take = self.series.take(idx).map_err(PyPolarsErr::from)?;

diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py
@@ -161,48 +161,6 @@ def test_struct_function_expansion() -> None:
     assert pl.Struct(struct_schema) == s.to_frame().schema["a"]
 
 
-def test_value_counts_expr() -> None:
-    df = pl.DataFrame(
-        {
-            "id": ["a", "b", "b", "c", "c", "c", "d", "d"],
-        }
-    )
-    out = (
-        df.select(
-            [
-                pl.col("id").value_counts(sort=True),
-            ]
-        )
-        .to_series()
-        .to_list()
-    )
-    assert out == [
-        {"id": "c", "counts": 3},
-        {"id": "b", "counts": 2},
-        {"id": "d", "counts": 2},
-        {"id": "a", "counts": 1},
-    ]
-
-    # nested value counts. Then the series needs the name
-    # 6200
-
-    df = pl.DataFrame({"session": [1, 1, 1], "id": [2, 2, 3]})
-
-    assert df.group_by("session").agg(
-        [pl.col("id").value_counts(sort=True).first()]
-    ).to_dict(False) == {"session": [1], "id": [{"id": 2, "counts": 2}]}
-
-
-def test_value_counts_logical_type() -> None:
-    # test logical type
-    df = pl.DataFrame({"a": ["b", "c"]}).with_columns(
-        pl.col("a").cast(pl.Categorical).alias("ac")
-    )
-    out = df.select([pl.all().value_counts()])
-    assert out["ac"].struct.field("ac").dtype == pl.Categorical
-    assert out["a"].struct.field("a").dtype == pl.Utf8
-
-
 def test_nested_struct() -> None:
     df = pl.DataFrame({"d": [1, 2, 3], "e": ["foo", "bar", "biz"]})
     # Nest the dataframe

diff --git a/py-polars/tests/unit/operations/test_value_counts.py b/py-polars/tests/unit/operations/test_value_counts.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+
+def test_value_counts() -> None:
+    s = pl.Series("a", [1, 2, 2, 3])
+    result = s.value_counts()
+    expected = pl.DataFrame(
+        {"a": [1, 2, 3], "counts": [1, 2, 1]}, schema_overrides={"counts": pl.UInt32}
+    )
+    result_sorted = result.sort("a")
+    assert_frame_equal(result_sorted, expected)
+
+
+def test_value_counts_logical_type() -> None:
+    # test logical type
+    df = pl.DataFrame({"a": ["b", "c"]}).with_columns(
+        pl.col("a").cast(pl.Categorical).alias("ac")
+    )
+    out = df.select(pl.all().value_counts())
+    assert out["ac"].struct.field("ac").dtype == pl.Categorical
+    assert out["a"].struct.field("a").dtype == pl.Utf8
+
+
+def test_value_counts_expr() -> None:
+    df = pl.DataFrame(
+        {
+            "id": ["a", "b", "b", "c", "c", "c", "d", "d"],
+        }
+    )
+    out = df.select(pl.col("id").value_counts(sort=True)).to_series().to_list()
+    assert out == [
+        {"id": "c", "counts": 3},
+        {"id": "b", "counts": 2},
+        {"id": "d", "counts": 2},
+        {"id": "a", "counts": 1},
+    ]
+
+    # nested value counts. Then the series needs the name
+    # 6200
+
+    df = pl.DataFrame({"session": [1, 1, 1], "id": [2, 2, 3]})
+
+    assert df.group_by("session").agg(
+        pl.col("id").value_counts(sort=True).first()
+    ).to_dict(False) == {"session": [1], "id": [{"id": 2, "counts": 2}]}
+
+
+def test_value_counts_duplicate_name() -> None:
+    s = pl.Series("counts", [1])
+
+    with pytest.raises(pl.DuplicateError, match="counts"):
+        s.value_counts()
diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py
@@ -1695,16 +1695,6 @@ def test_to_dummies() -> None:
     assert_frame_equal(result, expected)
 
 
-def test_value_counts() -> None:
-    s = pl.Series("a", [1, 2, 2, 3])
-    result = s.value_counts()
-    expected = pl.DataFrame(
-        {"a": [1, 2, 3], "counts": [1, 2, 1]}, schema_overrides={"counts": pl.UInt32}
-    )
-    result_sorted = result.sort("a")
-    assert_frame_equal(result_sorted, expected)
-
-
 def test_chunk_lengths() -> None:
     s = pl.Series("a", [1, 2, 2, 3])
     # this is a Series with one chunk, of length 4