diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index a12ff14b910df..bf3211f8f3586 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3887,10 +3887,10 @@ def describe( │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ - │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ - │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ """ @@ -3900,7 +3900,7 @@ def describe( raise ValueError("Percentiles must all be in the range [0, 1].") # determine metrics (optional/additional percentiles) - metrics = ["count", "null_count", "mean", "std", "min", "max", "median"] + metrics = ["count", "null_count", "mean", "std", "min", "max", "50%"] percentile_exprs = [] for p in percentiles or (): percentile_exprs.append(F.all().quantile(p).prefix(f"{p}:")) @@ -3924,6 +3924,17 @@ def describe( df_metrics[(n * n_cols) : (n + 1) * n_cols] for n in range(0, len(metrics)) ] + # sort percentiles, put `max` last + metric_idxs, sorted_metrics = zip( + *sorted( + enumerate(metrics), + key=lambda t: ( + t[1] == "max", + int(t[1].rstrip("%")) if t[1].endswith("%") else float("nan"), + ), + ) + ) + # cast by column type (numeric/bool -> float), (other -> string) summary = dict(zip(self.columns, list(zip(*described)))) num_or_bool = NUMERIC_DTYPES | {Boolean} @@ -3932,12 +3943,13 @@ def describe( None if (v is None or isinstance(v, dict)) else (float(v) if tp in num_or_bool else str(v)) - for v in summary[c] + for idx in metric_idxs + for v in [summary[c][idx]] ] # return results as a frame df_summary = self.__class__(summary) - df_summary.insert_at_idx(0, pl.Series("describe", metrics)) + df_summary.insert_at_idx(0, pl.Series("describe", sorted_metrics)) return df_summary def find_idx_by_name(self, name: str) -> int: diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 6f55be80295d8..bd71b9eed0a05 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1333,8 +1333,8 @@ def describe( │ std ┆ 1.581139 │ │ min ┆ 1.0 │ │ max ┆ 5.0 │ - │ median ┆ 3.0 │ │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ │ 75% ┆ 4.0 │ └────────────┴──────────┘ @@ -1370,11 +1370,20 @@ def describe( "mean": s.mean(), "std": s.std(), "min": s.min(), - "max": s.max(), - "median": s.median(), } if percentiles: - stats.update({f"{p:.0%}": s.quantile(p) for p in percentiles}) + pcts = {f"{p:.0%}": s.quantile(p) for p in percentiles} + pcts["50%"] = s.median() + stats.update( + { + k: pcts[k] + for k in sorted(pcts, key=lambda pct: int(pct.rstrip("%"))) + } + ) + else: + stats["50%"] = s.median() + + stats["max"] = s.max() elif self.is_boolean(): stats = { @@ -1395,8 +1404,8 @@ def describe( "count": str(self.len()), "null_count": str(self.null_count()), "min": str(self.dt.min()), + "50%": str(self.dt.median()), "max": str(self.dt.max()), - "median": str(self.dt.median()), } else: raise TypeError("This type is not supported") diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 8da2f7dccf8a0..167960812dd5f 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -1155,27 +1155,37 @@ def test_describe() -> None: "mean", "std", "min", - "max", - "median", "25%", + "50%", "75%", + "max", + ], + "a": [ + 3.0, + 0.0, + 2.2666666666666666, + 1.1015141094572205, + 1.0, + 1.0, + 2.8, + 3.0, + 3.0, ], - "a": [3.0, 0.0, 2.2666667, 1.101514, 1.0, 3.0, 2.8, 1.0, 3.0], - "b": [3.0, 1.0, 4.5, 0.7071067811865476, 4.0, 5.0, 4.5, 4.0, 5.0], + "b": [3.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 4.5, 5.0, 5.0], "c": [ 3.0, 0.0, 0.6666666666666666, 0.5773502588272095, 0.0, - 1.0, - 1.0, None, + 1.0, None, + 1.0, ], - "d": ["3", "1", None, None, "b", "c", None, None, None], + "d": ["3", "1", None, None, "b", None, None, None, "c"], "e": ["3", "1", None, None, None, None, None, None, None], - "f": ["3", "0", None, None, "2020-01-01", "2022-01-01", None, None, None], + "f": ["3", "0", None, None, "2020-01-01", None, None, None, "2022-01-01"], } ) assert_frame_equal(df.describe(), expected) @@ -1196,10 +1206,10 @@ def test_describe() -> None: "mean", "std", "min", - "max", - "median", "25%", + "50%", "75%", + "max", ], "numerical": [ 4.0, @@ -1207,10 +1217,10 @@ def test_describe() -> None: 1.3333333333333333, 0.5773502691896257, 1.0, - 2.0, 1.0, 1.0, 2.0, + 2.0, ], "struct": ["4", "1", None, None, None, None, None, None, None], "list": ["4", "1", None, None, None, None, None, None, None], @@ -1223,8 +1233,8 @@ def test_describe() -> None: ("mean", 1.3333333333333333, None, None), ("std", 0.5773502691896257, None, None), ("min", 1.0, None, None), + ("50%", 1.0, None, None), ("max", 2.0, None, None), - ("median", 1.0, None, None), ] described = df.describe(percentiles=(0.2, 0.4, 0.6, 0.8)) @@ -1240,12 +1250,12 @@ def test_describe() -> None: ("mean", 1.3333333333333333, None, None), ("std", 0.5773502691896257, None, None), ("min", 1.0, None, None), - ("max", 2.0, None, None), - ("median", 1.0, None, None), ("20%", 1.0, None, None), ("40%", 1.0, None, None), + ("50%", 1.0, None, None), ("60%", 1.0, None, None), ("80%", 2.0, None, None), + ("max", 2.0, None, None), ] diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index 4b3dce6c071cb..afdfdaf53b929 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -1155,7 +1155,6 @@ def test_describe() -> None: date_s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) empty_s = pl.Series(np.empty(0)) - pl.DataFrame assert dict(num_s.describe().rows()) == { # type: ignore[arg-type] "count": 3.0, "max": 3.0, @@ -1163,7 +1162,7 @@ def test_describe() -> None: "min": 1.0, "null_count": 0.0, "std": 1.0, - "median": 2.0, + "50%": 2.0, "25%": 1.0, "75%": 3.0, } @@ -1174,7 +1173,7 @@ def test_describe() -> None: "min": 1.3, "null_count": 0.0, "std": 3.8109491381194442, - "median": 4.6, + "50%": 4.6, "25%": 1.3, "75%": 8.9, } @@ -1192,7 +1191,7 @@ def test_describe() -> None: "count": "3", "max": "2021-01-03", "min": "2021-01-01", - "median": "2021-01-02", + "50%": "2021-01-02", "null_count": "0", }