From 6874e2f13d14c3d9179767521e9a2b6fd2917b11 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 4 Aug 2023 14:36:40 +0400 Subject: [PATCH] depr(python,rust!): renaming `approx_unique` as `approx_n_unique` (#10290) --- .../series/ops/approx_algo/hyperloglogplus.rs | 2 +- .../src/series/ops/approx_unique.rs | 20 +++++++++---------- .../src/dsl/function_expr/dispatch.rs | 4 ++-- .../polars-plan/src/dsl/function_expr/mod.rs | 6 +++--- .../src/dsl/function_expr/schema.rs | 2 +- crates/polars-plan/src/dsl/mod.rs | 4 ++-- .../reference/expressions/computation.rst | 2 +- .../reference/expressions/functions.rst | 4 ++-- py-polars/polars/__init__.py | 4 ++-- py-polars/polars/dataframe/frame.py | 5 +++++ py-polars/polars/expr/expr.py | 8 ++++---- py-polars/polars/functions/__init__.py | 4 ++-- py-polars/polars/functions/lazy.py | 10 +++++----- py-polars/polars/lazyframe/frame.py | 13 ++++++++---- py-polars/src/expr/general.rs | 8 ++++++-- .../tests/unit/functions/test_functions.py | 8 ++++---- 16 files changed, 59 insertions(+), 45 deletions(-) diff --git a/crates/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs b/crates/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs index c03f67877522..133e4e3f8298 100644 --- a/crates/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs +++ b/crates/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs @@ -1,7 +1,7 @@ //! # HyperLogLogPlus //! //! `hyperloglogplus` module contains implementation of HyperLogLogPlus -//! algorithm for cardinality estimation so that [`crate::series::approx_unique`] function can +//! algorithm for cardinality estimation so that [`crate::series::approx_n_unique`] function can //! be efficiently implemented. //! //! This module borrows code from [arrow-datafusion](https://github.com/apache/arrow-datafusion/blob/93771052c5ac31f2cf22b8c25bf938656afe1047/datafusion/physical-expr/src/aggregate/hyperloglog.rs). diff --git a/crates/polars-ops/src/series/ops/approx_unique.rs b/crates/polars-ops/src/series/ops/approx_unique.rs index 1d925e824bcd..fd539dba07d6 100644 --- a/crates/polars-ops/src/series/ops/approx_unique.rs +++ b/crates/polars-ops/src/series/ops/approx_unique.rs @@ -6,7 +6,7 @@ use polars_core::with_match_physical_integer_polars_type; #[cfg(feature = "approx_unique")] use crate::series::HyperLogLog; -fn approx_unique_ca<'a, T>(ca: &'a ChunkedArray) -> PolarsResult +fn approx_n_unique_ca<'a, T>(ca: &'a ChunkedArray) -> PolarsResult where T: PolarsDataType, &'a ChunkedArray: IntoIterator, @@ -23,22 +23,22 @@ fn dispatcher(s: &Series) -> PolarsResult { let s = s.to_physical_repr(); use DataType::*; match s.dtype() { - Boolean => s.bool().and_then(approx_unique_ca), - Binary => s.binary().and_then(approx_unique_ca), + Boolean => s.bool().and_then(approx_n_unique_ca), + Binary => s.binary().and_then(approx_n_unique_ca), Utf8 => { let s = s.cast(&Binary).unwrap(); let ca = s.binary().unwrap(); - approx_unique_ca(ca) + approx_n_unique_ca(ca) } - Float32 => approx_unique_ca(&s.bit_repr_small()), - Float64 => approx_unique_ca(&s.bit_repr_large()), + Float32 => approx_n_unique_ca(&s.bit_repr_small()), + Float64 => approx_n_unique_ca(&s.bit_repr_large()), dt if dt.is_numeric() => { with_match_physical_integer_polars_type!(s.dtype(), |$T| { let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref(); - approx_unique_ca(ca) + approx_n_unique_ca(ca) }) } - dt => polars_bail!(opq = approx_unique, dt), + dt => polars_bail!(opq = approx_n_unique, dt), } } @@ -57,7 +57,7 @@ fn dispatcher(s: &Series) -> PolarsResult { /// /// let s = Series::new("s", [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]); /// -/// let approx_count = approx_unique(&s).unwrap(); +/// let approx_count = approx_n_unique(&s).unwrap(); /// dbg!(approx_count); /// # } /// ``` @@ -69,6 +69,6 @@ fn dispatcher(s: &Series) -> PolarsResult { /// 3 /// ] /// ``` -pub fn approx_unique(s: &Series) -> PolarsResult { +pub fn approx_n_unique(s: &Series) -> PolarsResult { dispatcher(s) } diff --git a/crates/polars-plan/src/dsl/function_expr/dispatch.rs b/crates/polars-plan/src/dsl/function_expr/dispatch.rs index d44c5844b8d2..bb0db05515cb 100644 --- a/crates/polars-plan/src/dsl/function_expr/dispatch.rs +++ b/crates/polars-plan/src/dsl/function_expr/dispatch.rs @@ -9,8 +9,8 @@ pub(super) fn reverse(s: &Series) -> PolarsResult { } #[cfg(feature = "approx_unique")] -pub(super) fn approx_unique(s: &Series) -> PolarsResult { - polars_ops::prelude::approx_unique(s) +pub(super) fn approx_n_unique(s: &Series) -> PolarsResult { + polars_ops::prelude::approx_n_unique(s) } #[cfg(feature = "diff")] diff --git a/crates/polars-plan/src/dsl/function_expr/mod.rs b/crates/polars-plan/src/dsl/function_expr/mod.rs index 30add44729da..5feaebb7c990 100644 --- a/crates/polars-plan/src/dsl/function_expr/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/mod.rs @@ -165,7 +165,7 @@ pub enum FunctionExpr { Reverse, Boolean(BooleanFunction), #[cfg(feature = "approx_unique")] - ApproxUnique, + ApproxNUnique, #[cfg(feature = "dtype-categorical")] Categorical(CategoricalFunction), Coalesce, @@ -292,7 +292,7 @@ impl Display for FunctionExpr { Reverse => "reverse", Boolean(func) => return write!(f, "{func}"), #[cfg(feature = "approx_unique")] - ApproxUnique => "approx_unique", + ApproxNUnique => "approx_n_unique", #[cfg(feature = "dtype-categorical")] Categorical(func) => return write!(f, "{func}"), Coalesce => "coalesce", @@ -544,7 +544,7 @@ impl From for SpecialEq> { Reverse => map!(dispatch::reverse), Boolean(func) => func.into(), #[cfg(feature = "approx_unique")] - ApproxUnique => map!(dispatch::approx_unique), + ApproxNUnique => map!(dispatch::approx_n_unique), #[cfg(feature = "dtype-categorical")] Categorical(func) => func.into(), Coalesce => map_as_slice!(fill_null::coalesce), diff --git a/crates/polars-plan/src/dsl/function_expr/schema.rs b/crates/polars-plan/src/dsl/function_expr/schema.rs index 2533c8f7e047..f76b264745ec 100644 --- a/crates/polars-plan/src/dsl/function_expr/schema.rs +++ b/crates/polars-plan/src/dsl/function_expr/schema.rs @@ -224,7 +224,7 @@ impl FunctionExpr { Cummin { .. } => mapper.with_same_dtype(), Cummax { .. } => mapper.with_same_dtype(), #[cfg(feature = "approx_unique")] - ApproxUnique => mapper.with_dtype(IDX_DTYPE), + ApproxNUnique => mapper.with_dtype(IDX_DTYPE), #[cfg(feature = "diff")] Diff(_, _) => mapper.map_dtype(|dt| match dt { #[cfg(feature = "dtype-datetime")] diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index 37d132e6f73f..6398b8f64760 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -1003,8 +1003,8 @@ impl Expr { /// Get the approximate count of unique values. #[cfg(feature = "approx_unique")] - pub fn approx_unique(self) -> Self { - self.apply_private(FunctionExpr::ApproxUnique) + pub fn approx_n_unique(self) -> Self { + self.apply_private(FunctionExpr::ApproxNUnique) .with_function_options(|mut options| { options.auto_explode = true; options diff --git a/py-polars/docs/source/reference/expressions/computation.rst b/py-polars/docs/source/reference/expressions/computation.rst index 8289041b32c1..7cd02ad7f721 100644 --- a/py-polars/docs/source/reference/expressions/computation.rst +++ b/py-polars/docs/source/reference/expressions/computation.rst @@ -7,7 +7,7 @@ Computation :toctree: api/ Expr.abs - Expr.approx_unique + Expr.approx_n_unique Expr.arccos Expr.arccosh Expr.arcsin diff --git a/py-polars/docs/source/reference/expressions/functions.rst b/py-polars/docs/source/reference/expressions/functions.rst index e7386f782700..09e9afabce4f 100644 --- a/py-polars/docs/source/reference/expressions/functions.rst +++ b/py-polars/docs/source/reference/expressions/functions.rst @@ -17,7 +17,7 @@ These functions are available from the polars module root and can be used as exp any any_horizontal apply - approx_unique + approx_n_unique arange arctan2 arctan2d @@ -89,7 +89,7 @@ These functions are available from the polars module root and can be used as exp Expr.all Expr.any Expr.apply - Expr.approx_unique + Expr.approx_n_unique Expr.count Expr.cumsum Expr.exclude diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index ddc26de3b43c..f45956ea02fe 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -82,7 +82,7 @@ any, any_horizontal, apply, - approx_unique, + approx_n_unique, arange, arctan2, arctan2d, @@ -308,6 +308,7 @@ "sum_horizontal", # polars.functions.lazy "apply", + "approx_n_unique", "arange", "arctan2", "arctan2d", @@ -342,7 +343,6 @@ "mean", "median", "n_unique", - "approx_unique", "quantile", "reduce", "rolling_corr", diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index ab9a21278cb6..e17c3645f737 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -89,6 +89,7 @@ from polars.utils._wrap import wrap_expr, wrap_ldf, wrap_s from polars.utils.convert import _timedelta_to_pl_duration from polars.utils.deprecation import ( + deprecate_renamed_methods, deprecate_renamed_parameter, issue_deprecation_warning, ) @@ -181,6 +182,10 @@ P = ParamSpec("P") +@deprecate_renamed_methods( + mapping={"approx_unique": "approx_n_unique"}, + versions={"approx_unique": "0.18.12"}, +) class DataFrame: """ Two-dimensional data structure representing data as a table with rows and columns. diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index f1f2964de614..bdc351afe30a 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -2879,16 +2879,16 @@ def n_unique(self) -> Self: """ return self._from_pyexpr(self._pyexpr.n_unique()) - def approx_unique(self) -> Self: + def approx_n_unique(self) -> Self: """ - Approx count unique values. + Approximate count of unique values. This is done using the HyperLogLog++ algorithm for cardinality estimation. Examples -------- >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").approx_unique()) + >>> df.select(pl.col("a").approx_n_unique()) shape: (1, 1) ┌─────┐ │ a │ @@ -2899,7 +2899,7 @@ def approx_unique(self) -> Self: └─────┘ """ - return self._from_pyexpr(self._pyexpr.approx_unique()) + return self._from_pyexpr(self._pyexpr.approx_n_unique()) def null_count(self) -> Self: """ diff --git a/py-polars/polars/functions/__init__.py b/py-polars/polars/functions/__init__.py index 136293a28610..2a97ef0e836a 100644 --- a/py-polars/polars/functions/__init__.py +++ b/py-polars/polars/functions/__init__.py @@ -25,7 +25,7 @@ from polars.functions.eager import align_frames, concat from polars.functions.lazy import ( apply, - approx_unique, + approx_n_unique, arctan2, arctan2d, arg_sort_by, @@ -91,7 +91,7 @@ "sum_horizontal", # polars.functions.eager "align_frames", - "approx_unique", + "approx_n_unique", "arg_where", "concat", "date_range", diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 1a9c0f21cacf..4d020494464a 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -590,9 +590,9 @@ def n_unique(column: str | Series) -> Expr | int: return col(column).n_unique() -def approx_unique(column: str | Expr) -> Expr: +def approx_n_unique(column: str | Expr) -> Expr: """ - Approx count unique values. + Approximate count of unique values. This is done using the HyperLogLog++ algorithm for cardinality estimation. @@ -604,7 +604,7 @@ def approx_unique(column: str | Expr) -> Expr: Examples -------- >>> df = pl.DataFrame({"a": [1, 8, 1], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]}) - >>> df.select(pl.approx_unique("a")) + >>> df.select(pl.approx_n_unique("a")) shape: (1, 1) ┌─────┐ │ a │ @@ -616,8 +616,8 @@ def approx_unique(column: str | Expr) -> Expr: """ if isinstance(column, pl.Expr): - return column.approx_unique() - return col(column).approx_unique() + return column.approx_n_unique() + return col(column).approx_n_unique() @overload diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index ea8f23071bd4..b3d169410a36 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -55,6 +55,7 @@ from polars.utils._wrap import wrap_df, wrap_expr from polars.utils.convert import _timedelta_to_pl_duration from polars.utils.deprecation import ( + deprecate_renamed_methods, deprecate_renamed_parameter, issue_deprecation_warning, ) @@ -110,6 +111,10 @@ P = ParamSpec("P") +@deprecate_renamed_methods( + mapping={"approx_unique": "approx_n_unique"}, + versions={"approx_unique": "0.18.12"}, +) class LazyFrame: """ Representation of a Lazy computation graph/query against a DataFrame. @@ -3798,9 +3803,9 @@ def first(self) -> Self: """ return self.slice(0, 1) - def approx_unique(self) -> Self: + def approx_n_unique(self) -> Self: """ - Approx count unique values. + Approximate count of unique values. This is done using the HyperLogLog++ algorithm for cardinality estimation. @@ -3812,7 +3817,7 @@ def approx_unique(self) -> Self: ... "b": [1, 2, 1, 1], ... } ... ) - >>> lf.approx_unique().collect() + >>> lf.approx_n_unique().collect() shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -3823,7 +3828,7 @@ def approx_unique(self) -> Self: └─────┴─────┘ """ - return self.select(F.all().approx_unique()) + return self.select(F.all().approx_n_unique()) def with_row_count(self, name: str = "row_nr", offset: int = 0) -> Self: """ diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs index 960733f8353c..bccde7ad8538 100644 --- a/py-polars/src/expr/general.rs +++ b/py-polars/src/expr/general.rs @@ -372,21 +372,25 @@ impl PyExpr { fn filter(&self, predicate: Self) -> Self { self.clone().inner.filter(predicate.inner).into() } + fn reverse(&self) -> Self { self.clone().inner.reverse().into() } + fn std(&self, ddof: u8) -> Self { self.clone().inner.std(ddof).into() } + fn var(&self, ddof: u8) -> Self { self.clone().inner.var(ddof).into() } + fn is_unique(&self) -> Self { self.clone().inner.is_unique().into() } - fn approx_unique(&self) -> Self { - self.clone().inner.approx_unique().into() + fn approx_n_unique(&self) -> Self { + self.clone().inner.approx_n_unique().into() } fn is_first(&self) -> Self { diff --git a/py-polars/tests/unit/functions/test_functions.py b/py-polars/tests/unit/functions/test_functions.py index b756e84269c8..01ad444049a1 100644 --- a/py-polars/tests/unit/functions/test_functions.py +++ b/py-polars/tests/unit/functions/test_functions.py @@ -332,21 +332,21 @@ def test_abs_logical_type() -> None: assert s.abs().to_list() == [timedelta(hours=1), timedelta(hours=1)] -def test_approx_unique() -> None: +def test_approx_n_unique() -> None: df1 = pl.DataFrame({"a": [None, 1, 2], "b": [None, 2, 1]}) assert_frame_equal( - df1.select(pl.approx_unique("b")), + df1.select(pl.approx_n_unique("b")), pl.DataFrame({"b": pl.Series(values=[3], dtype=pl.UInt32)}), ) assert_frame_equal( - df1.select(pl.approx_unique(pl.col("b"))), + df1.select(pl.approx_n_unique(pl.col("b"))), pl.DataFrame({"b": pl.Series(values=[3], dtype=pl.UInt32)}), ) assert_frame_equal( - df1.select(pl.col("b").approx_unique()), + df1.select(pl.col("b").approx_n_unique()), pl.DataFrame({"b": pl.Series(values=[3], dtype=pl.UInt32)}), )