From 655391c69d84e29d10aaeb445844028aef38a488 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Fri, 17 Nov 2023 09:20:41 +0100 Subject: [PATCH] feat(python,rust!): Add dedicated horizontal aggregation methods to `DataFrame` (#12492) --- crates/polars-core/src/frame/mod.rs | 28 +- .../polars-ops/src/series/ops/horizontal.rs | 6 +- .../reference/dataframe/aggregation.rst | 4 + py-polars/polars/dataframe/frame.py | 263 +++++++++++++++--- py-polars/src/dataframe.rs | 33 ++- py-polars/tests/unit/dataframe/test_df.py | 18 +- .../unit/operations/test_aggregations.py | 4 +- .../tests/unit/operations/test_arithmetic.py | 4 +- py-polars/tests/unit/test_exprs.py | 6 +- 9 files changed, 291 insertions(+), 75 deletions(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 9872b9a7254e..cea3f6b3df55 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -2800,7 +2800,7 @@ impl DataFrame { /// Aggregate the column horizontally to their min values. #[cfg(feature = "zip_with")] - pub fn hmin(&self) -> PolarsResult> { + pub fn min_horizontal(&self) -> PolarsResult> { let min_fn = |acc: &Series, s: &Series| min_max_binary_series(acc, s, true); match self.columns.len() { @@ -2826,7 +2826,7 @@ impl DataFrame { /// Aggregate the column horizontally to their max values. #[cfg(feature = "zip_with")] - pub fn hmax(&self) -> PolarsResult> { + pub fn max_horizontal(&self) -> PolarsResult> { let max_fn = |acc: &Series, s: &Series| min_max_binary_series(acc, s, false); match self.columns.len() { @@ -2851,12 +2851,12 @@ impl DataFrame { } /// Aggregate the column horizontally to their sum values. 
- pub fn hsum(&self, none_strategy: NullStrategy) -> PolarsResult> { + pub fn sum_horizontal(&self, null_strategy: NullStrategy) -> PolarsResult> { let sum_fn = - |acc: &Series, s: &Series, none_strategy: NullStrategy| -> PolarsResult { + |acc: &Series, s: &Series, null_strategy: NullStrategy| -> PolarsResult { let mut acc = acc.clone(); let mut s = s.clone(); - if let NullStrategy::Ignore = none_strategy { + if let NullStrategy::Ignore = null_strategy { // if has nulls if acc.has_validity() { acc = acc.fill_null(FillNullStrategy::Zero)?; @@ -2871,7 +2871,7 @@ impl DataFrame { match self.columns.len() { 0 => Ok(None), 1 => Ok(Some(self.columns[0].clone())), - 2 => sum_fn(&self.columns[0], &self.columns[1], none_strategy).map(Some), + 2 => sum_fn(&self.columns[0], &self.columns[1], null_strategy).map(Some), _ => { // the try_reduce_with is a bit slower in parallelism, // but I don't think it matters here as we parallelize over columns, not over elements @@ -2879,7 +2879,7 @@ impl DataFrame { self.columns .par_iter() .map(|s| Ok(Cow::Borrowed(s))) - .try_reduce_with(|l, r| sum_fn(&l, &r, none_strategy).map(Cow::Owned)) + .try_reduce_with(|l, r| sum_fn(&l, &r, null_strategy).map(Cow::Owned)) // we can unwrap the option, because we are certain there is a column // we started this operation on 3 columns .unwrap() @@ -2890,7 +2890,7 @@ impl DataFrame { } /// Aggregate the column horizontally to their mean values. 
- pub fn hmean(&self, none_strategy: NullStrategy) -> PolarsResult> { + pub fn mean_horizontal(&self, null_strategy: NullStrategy) -> PolarsResult> { match self.columns.len() { 0 => Ok(None), 1 => Ok(Some(self.columns[0].clone())), @@ -2906,7 +2906,7 @@ impl DataFrame { .collect(); let numeric_df = DataFrame::new_no_checks(columns); - let sum = || numeric_df.hsum(none_strategy); + let sum = || numeric_df.sum_horizontal(null_strategy); let null_count = || { numeric_df @@ -3588,7 +3588,7 @@ mod test { #[test] #[cfg(feature = "zip_with")] #[cfg_attr(miri, ignore)] - fn test_h_agg() { + fn test_horizontal_agg() { let a = Series::new("a", &[1, 2, 6]); let b = Series::new("b", &[Some(1), None, None]); let c = Series::new("c", &[Some(4), None, Some(3)]); @@ -3596,7 +3596,7 @@ mod test { let df = DataFrame::new(vec![a, b, c]).unwrap(); assert_eq!( Vec::from( - df.hmean(NullStrategy::Ignore) + df.mean_horizontal(NullStrategy::Ignore) .unwrap() .unwrap() .f64() @@ -3606,7 +3606,7 @@ mod test { ); assert_eq!( Vec::from( - df.hsum(NullStrategy::Ignore) + df.sum_horizontal(NullStrategy::Ignore) .unwrap() .unwrap() .i32() @@ -3615,11 +3615,11 @@ mod test { &[Some(6), Some(2), Some(9)] ); assert_eq!( - Vec::from(df.hmin().unwrap().unwrap().i32().unwrap()), + Vec::from(df.min_horizontal().unwrap().unwrap().i32().unwrap()), &[Some(1), Some(2), Some(3)] ); assert_eq!( - Vec::from(df.hmax().unwrap().unwrap().i32().unwrap()), + Vec::from(df.max_horizontal().unwrap().unwrap().i32().unwrap()), &[Some(4), Some(2), Some(6)] ); } diff --git a/crates/polars-ops/src/series/ops/horizontal.rs b/crates/polars-ops/src/series/ops/horizontal.rs index 3e2c7d639be5..ba87790c19b6 100644 --- a/crates/polars-ops/src/series/ops/horizontal.rs +++ b/crates/polars-ops/src/series/ops/horizontal.rs @@ -70,11 +70,13 @@ pub fn all_horizontal(s: &[Series]) -> PolarsResult { #[cfg(feature = "zip_with")] pub fn max_horizontal(s: &[Series]) -> PolarsResult> { let df = DataFrame::new_no_checks(Vec::from(s)); - 
df.hmax().map(|opt_s| opt_s.map(|s| s.with_name("max"))) + df.max_horizontal() + .map(|opt_s| opt_s.map(|s| s.with_name("max"))) } #[cfg(feature = "zip_with")] pub fn min_horizontal(s: &[Series]) -> PolarsResult> { let df = DataFrame::new_no_checks(Vec::from(s)); - df.hmin().map(|opt_s| opt_s.map(|s| s.with_name("min"))) + df.min_horizontal() + .map(|opt_s| opt_s.map(|s| s.with_name("min"))) } diff --git a/py-polars/docs/source/reference/dataframe/aggregation.rst b/py-polars/docs/source/reference/dataframe/aggregation.rst index 703b47faaff3..7986e764d966 100644 --- a/py-polars/docs/source/reference/dataframe/aggregation.rst +++ b/py-polars/docs/source/reference/dataframe/aggregation.rst @@ -7,11 +7,15 @@ Aggregation :toctree: api/ DataFrame.max + DataFrame.max_horizontal DataFrame.mean + DataFrame.mean_horizontal DataFrame.median DataFrame.min + DataFrame.min_horizontal DataFrame.product DataFrame.quantile DataFrame.std DataFrame.sum + DataFrame.sum_horizontal DataFrame.var diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index f3411d83b8d4..b839b055b4b5 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -88,6 +88,7 @@ deprecate_renamed_function, deprecate_renamed_parameter, deprecate_saturating, + issue_deprecation_warning, ) from polars.utils.various import ( _prepare_row_count_args, @@ -8053,10 +8054,20 @@ def max(self, axis: Literal[1]) -> Series: def max(self, axis: int = 0) -> Self | Series: ... - def max(self, axis: int = 0) -> Self | Series: + def max(self, axis: int | None = None) -> Self | Series: """ Aggregate the columns of this DataFrame to their maximum value. + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. 
+ Examples -------- >>> df = pl.DataFrame( @@ -8077,14 +8088,51 @@ def max(self, axis: int = 0) -> Self | Series: └─────┴─────┴─────┘ """ + if axis is not None: + issue_deprecation_warning( + "The `axis` parameter for `DataFrame.max` is deprecated." + " Use `DataFrame.max_horizontal()` to perform horizontal aggregation.", + version="0.19.14", + ) + else: + axis = 0 + if axis == 0: return self._from_pydf(self._df.max()) if axis == 1: - return wrap_s(self._df.hmax()) + return wrap_s(self._df.max_horizontal()) raise ValueError("axis should be 0 or 1") + def max_horizontal(self) -> Series: + """ + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: 'max' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + """ + return self.select(F.max_horizontal(F.all())).to_series() + @overload - def min(self, axis: Literal[0] = ...) -> Self: + def min(self, axis: Literal[0] | None = ...) -> Self: ... @overload @@ -8092,13 +8140,23 @@ def min(self, axis: Literal[1]) -> Series: ... @overload - def min(self, axis: int = 0) -> Self | Series: + def min(self, axis: int) -> Self | Series: ... - def min(self, axis: int = 0) -> Self | Series: + def min(self, axis: int | None = None) -> Self | Series: """ Aggregate the columns of this DataFrame to their minimum value. + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. 
+ Examples -------- >>> df = pl.DataFrame( @@ -8119,12 +8177,49 @@ def min(self, axis: int = 0) -> Self | Series: └─────┴─────┴─────┘ """ + if axis is not None: + issue_deprecation_warning( + "The `axis` parameter for `DataFrame.min` is deprecated." + " Use `DataFrame.min_horizontal()` to perform horizontal aggregation.", + version="0.19.14", + ) + else: + axis = 0 + if axis == 0: return self._from_pydf(self._df.min()) if axis == 1: - return wrap_s(self._df.hmin()) + return wrap_s(self._df.min_horizontal()) raise ValueError("axis should be 0 or 1") + def min_horizontal(self) -> Series: + """ + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: 'min' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + """ + return self.select(F.min_horizontal(F.all())).to_series() + @overload def sum( self, @@ -8147,7 +8242,7 @@ def sum( def sum( self, *, - axis: int = 0, + axis: int, null_strategy: NullStrategy = "ignore", ) -> Self | Series: ... @@ -8155,7 +8250,7 @@ def sum( def sum( self, *, - axis: int = 0, + axis: int | None = None, null_strategy: NullStrategy = "ignore", ) -> Self | Series: """ @@ -8164,9 +8259,17 @@ def sum( Parameters ---------- axis - Either 0 or 1. + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. null_strategy : {'ignore', 'propagate'} - This argument is only used if axis == 1. + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. 
Examples -------- @@ -8186,22 +8289,64 @@ def sum( ╞═════╪═════╪══════╡ │ 6 ┆ 21 ┆ null │ └─────┴─────┴──────┘ - >>> df.sum(axis=1) - shape: (3,) - Series: 'foo' [str] - [ - "16a" - "27b" - "38c" - ] - """ + if axis is not None: + issue_deprecation_warning( + "The `axis` parameter for `DataFrame.sum` is deprecated." + " Use `DataFrame.sum_horizontal()` to perform horizontal aggregation.", + version="0.19.14", + ) + else: + axis = 0 + if axis == 0: return self._from_pydf(self._df.sum()) if axis == 1: - return wrap_s(self._df.hsum(null_strategy)) + if null_strategy == "ignore": + ignore_nulls = True + elif null_strategy == "propagate": + ignore_nulls = False + else: + raise ValueError( + f"`null_strategy` must be one of {{'ignore', 'propagate'}}, got {null_strategy}" + ) + return self.sum_horizontal(ignore_nulls=ignore_nulls) raise ValueError("axis should be 0 or 1") + def sum_horizontal(self, *, ignore_nulls: bool = True) -> Series: + """ + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: 'sum' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + """ + return wrap_s(self._df.sum_horizontal(ignore_nulls)).alias("sum") + @overload def mean( self, @@ -8224,7 +8369,7 @@ def mean( def mean( self, *, - axis: int = 0, + axis: int, null_strategy: NullStrategy = "ignore", ) -> Self | Series: ... @@ -8232,7 +8377,7 @@ def mean( def mean( self, *, - axis: int = 0, + axis: int | None = None, null_strategy: NullStrategy = "ignore", ) -> Self | Series: """ @@ -8241,9 +8386,17 @@ def mean( Parameters ---------- axis - Either 0 or 1. + Either 0 (vertical) or 1 (horizontal). + + ..
deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. null_strategy : {'ignore', 'propagate'} - This argument is only used if axis == 1. + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. Examples -------- @@ -8264,22 +8417,64 @@ def mean( ╞═════╪═════╪══════╪══════╡ │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ └─────┴─────┴──────┴──────┘ - >>> df.mean(axis=1) - shape: (3,) - Series: 'foo' [f64] - [ - 2.666667 - 3.0 - 5.5 - ] - """ + if axis is not None: + issue_deprecation_warning( + "The `axis` parameter for `DataFrame.mean` is deprecated." + " Use `DataFrame.mean_horizontal()` to perform horizontal aggregation.", + version="0.19.14", + ) + else: + axis = 0 + if axis == 0: return self._from_pydf(self._df.mean()) if axis == 1: - return wrap_s(self._df.hmean(null_strategy)) + if null_strategy == "ignore": + ignore_nulls = True + elif null_strategy == "propagate": + ignore_nulls = False + else: + raise ValueError( + f"`null_strategy` must be one of {{'ignore', 'propagate'}}, got {null_strategy}" + ) + return self.mean_horizontal(ignore_nulls=ignore_nulls) raise ValueError("axis should be 0 or 1") + def mean_horizontal(self, *, ignore_nulls: bool = True) -> Series: + """ + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... 
) + >>> df.mean_horizontal() + shape: (3,) + Series: 'mean' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + """ + return wrap_s(self._df.mean_horizontal(ignore_nulls)).alias("mean") + def std(self, ddof: int = 1) -> Self: """ Aggregate the columns of this DataFrame to their standard deviation value. diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index ab57dbb20db7..102de53de798 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -4,6 +4,7 @@ use std::ops::Deref; use either::Either; use numpy::IntoPyArray; use polars::frame::row::{rows_to_schema_supertypes, Row}; +use polars::frame::NullStrategy; #[cfg(feature = "avro")] use polars::io::avro::AvroCompression; #[cfg(feature = "ipc")] @@ -1281,23 +1282,39 @@ impl PyDataFrame { self.df.median().into() } - pub fn hmean(&self, null_strategy: Wrap) -> PyResult> { - let s = self.df.hmean(null_strategy.0).map_err(PyPolarsErr::from)?; + pub fn max_horizontal(&self) -> PyResult> { + let s = self.df.max_horizontal().map_err(PyPolarsErr::from)?; Ok(s.map(|s| s.into())) } - pub fn hmax(&self) -> PyResult> { - let s = self.df.hmax().map_err(PyPolarsErr::from)?; + pub fn min_horizontal(&self) -> PyResult> { + let s = self.df.min_horizontal().map_err(PyPolarsErr::from)?; Ok(s.map(|s| s.into())) } - pub fn hmin(&self) -> PyResult> { - let s = self.df.hmin().map_err(PyPolarsErr::from)?; + pub fn sum_horizontal(&self, ignore_nulls: bool) -> PyResult> { + let null_strategy = if ignore_nulls { + NullStrategy::Ignore + } else { + NullStrategy::Propagate + }; + let s = self + .df + .sum_horizontal(null_strategy) + .map_err(PyPolarsErr::from)?; Ok(s.map(|s| s.into())) } - pub fn hsum(&self, null_strategy: Wrap) -> PyResult> { - let s = self.df.hsum(null_strategy.0).map_err(PyPolarsErr::from)?; + pub fn mean_horizontal(&self, ignore_nulls: bool) -> PyResult> { + let null_strategy = if ignore_nulls { + NullStrategy::Ignore + } else { + NullStrategy::Propagate + }; + let s = self + .df + 
.mean_horizontal(null_strategy) + .map_err(PyPolarsErr::from)?; Ok(s.map(|s| s.into())) } diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index cb163f4ddeba..c040f4ab47aa 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -676,10 +676,10 @@ def test_df_fold() -> None: df = pl.DataFrame({"a": [3, 2, 1], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) # just check dispatch. values are tested on rust side. - assert len(df.sum(axis=1)) == 3 - assert len(df.mean(axis=1)) == 3 - assert len(df.min(axis=1)) == 3 - assert len(df.max(axis=1)) == 3 + assert len(df.sum_horizontal()) == 3 + assert len(df.mean_horizontal()) == 3 + assert len(df.min_horizontal()) == 3 + assert len(df.max_horizontal()) == 3 df_width_one = df[["a"]] assert_series_equal(df_width_one.fold(lambda s1, s2: s1), df["a"]) @@ -1710,17 +1710,15 @@ def test_panic() -> None: a.filter(pl.col("col1") != "b") -def test_h_agg() -> None: +def test_horizontal_agg() -> None: df = pl.DataFrame({"a": [1, None, 3], "b": [1, 2, 3]}) + assert_series_equal(df.sum_horizontal(), pl.Series("sum", [2, 2, 6])) assert_series_equal( - df.sum(axis=1, null_strategy="ignore"), pl.Series("a", [2, 2, 6]) + df.sum_horizontal(ignore_nulls=False), pl.Series("sum", [2, None, 6]) ) assert_series_equal( - df.sum(axis=1, null_strategy="propagate"), pl.Series("a", [2, None, 6]) - ) - assert_series_equal( - df.mean(axis=1, null_strategy="propagate"), pl.Series("a", [1.0, None, 3.0]) + df.mean_horizontal(ignore_nulls=False), pl.Series("mean", [1.0, None, 3.0]) ) diff --git a/py-polars/tests/unit/operations/test_aggregations.py b/py-polars/tests/unit/operations/test_aggregations.py index d4bb1cecf687..12f752a3698f 100644 --- a/py-polars/tests/unit/operations/test_aggregations.py +++ b/py-polars/tests/unit/operations/test_aggregations.py @@ -75,10 +75,10 @@ def test_duration_aggs() -> None: } -def test_hmean_with_str_column() -> None: +def 
test_mean_horizontal_with_str_column() -> None: assert pl.DataFrame( {"int": [1, 2, 3], "bool": [True, True, None], "str": ["a", "b", "c"]} - ).mean(axis=1).to_list() == [1.0, 1.5, 3.0] + ).mean_horizontal().to_list() == [1.0, 1.5, 3.0] def test_list_aggregation_that_filters_all_data_6017() -> None: diff --git a/py-polars/tests/unit/operations/test_arithmetic.py b/py-polars/tests/unit/operations/test_arithmetic.py index 39ce13ea4be6..abe7d09661c2 100644 --- a/py-polars/tests/unit/operations/test_arithmetic.py +++ b/py-polars/tests/unit/operations/test_arithmetic.py @@ -206,7 +206,9 @@ def test_literal_no_upcast() -> None: def test_boolean_addition() -> None: - s = pl.DataFrame({"a": [True, False, False], "b": [True, False, True]}).sum(axis=1) + s = pl.DataFrame( + {"a": [True, False, False], "b": [True, False, True]} + ).sum_horizontal() assert s.dtype == pl.utils.get_index_type() assert s.to_list() == [2, 0, 1] diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py index 1d2a28209793..db1436881799 100644 --- a/py-polars/tests/unit/test_exprs.py +++ b/py-polars/tests/unit/test_exprs.py @@ -468,11 +468,9 @@ def test_ewm_with_multiple_chunks() -> None: assert df1.n_chunks() == 2 ewm_std = df1.with_columns( - [ - pl.all().ewm_std(com=20).name.prefix("ewm_"), - ] + pl.all().ewm_std(com=20).name.prefix("ewm_"), ) - assert ewm_std.null_count().sum(axis=1)[0] == 4 + assert ewm_std.null_count().sum_horizontal()[0] == 4 def test_map_dict() -> None: